From c78d1aac4780bf0af9ea2705d5aa3c7c917104a1 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani Date: Mon, 7 Oct 2024 11:46:43 +0200 Subject: [PATCH 01/90] fix conflicts --- README.md | 1 + s3-source-connector/README.md | 56 ++ s3-source-connector/build.gradle.kts | 257 ++++++ s3-source-connector/gradle.properties | 4 + s3-source-connector/licenses/LICENSE-aws.txt | 63 ++ s3-source-connector/notices/NOTICE-aws.txt | 13 + .../connect/source/s3/ConnectRunner.java | 119 +++ .../connect/source/s3/IntegrationBase.java | 94 +++ .../connect/source/s3/IntegrationTest.java | 107 +++ .../resources/logback-test.xml | 17 + .../AivenKafkaConnectS3SourceConnector.java | 69 ++ .../kafka/connect/source/s3/S3SourceTask.java | 62 ++ .../kafka/connect/source/s3/Version.java | 43 + .../source/s3/config/S3SourceConfig.java | 68 ++ .../source/s3/config/S3SourceConfigDef.java | 30 + ...nector-for-apache-kafka-version.properties | 16 + .../src/test/resources/blns.txt | 739 ++++++++++++++++++ .../src/test/resources/logback-test.xml | 12 + settings.gradle.kts | 2 + 19 files changed, 1772 insertions(+) create mode 100644 s3-source-connector/README.md create mode 100644 s3-source-connector/build.gradle.kts create mode 100644 s3-source-connector/gradle.properties create mode 100644 s3-source-connector/licenses/LICENSE-aws.txt create mode 100644 s3-source-connector/notices/NOTICE-aws.txt create mode 100644 s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java create mode 100644 s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java create mode 100644 s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java create mode 100644 s3-source-connector/src/integration-test/resources/logback-test.xml create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java create mode 100644 s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties create mode 100644 s3-source-connector/src/test/resources/blns.txt create mode 100644 s3-source-connector/src/test/resources/logback-test.xml diff --git a/README.md b/README.md index b8bd950e8..b8f0ff2e2 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ - [Aiven GCS Sink Connector](./gcs-sink-connector/README.md) - [Aiven S3 Sink Connector](./s3-sink-connector/README.md) - [Aiven Azure Blob Sink Connector](./azure-sink-connector/README.md) +- [Aiven S3 Source Connector](./s3-source-connector/README.md) # Development diff --git a/s3-source-connector/README.md b/s3-source-connector/README.md new file mode 100644 index 000000000..2ee9caacc --- /dev/null +++ b/s3-source-connector/README.md @@ -0,0 +1,56 @@ +# Aiven's S3 Source Connector for Apache Kafka + +This is a source Apache Kafka Connect connector that reads objects from an AWS S3 bucket and publishes them as records to Apache Kafka.
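The connector's own configuration options are still marked as TODO further down in this README, so the only end-to-end usage in this patch is the integration test. The sketch below is a hedged illustration that mirrors that test's minimal setup; the example class name, plugin directory, bootstrap servers, topic name, and offset flush interval are placeholders I introduce for illustration, not options defined by the connector.

```java
package io.aiven.kafka.connect.source.s3; // ConnectRunner below is the package-private test helper added in this patch

import java.io.File;
import java.util.HashMap;
import java.util.Map;

// A minimal smoke-test sketch based on IntegrationTest/ConnectRunner from this patch; all literal values are placeholders.
public final class S3SourceConnectorExample {
    public static void main(final String[] args) throws Exception {
        final File pluginDir = new File("/tmp/plugins/s3-source-connector-for-apache-kafka"); // placeholder path to the extracted distTar archive
        final String bootstrapServers = "localhost:9092"; // placeholder Kafka bootstrap servers

        final Map<String, String> config = new HashMap<>();
        config.put("name", "aiven-s3-source-connector");
        config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName());
        config.put("tasks.max", "1");
        config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter");
        config.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter");
        config.put("topics", "example-topic"); // placeholder topic, as in the integration test

        // Start an embedded standalone Connect worker and register the connector with it.
        final ConnectRunner runner = new ConnectRunner(pluginDir, bootstrapServers, 5000);
        runner.start();
        runner.createConnector(config);
    }
}
```

In a regular deployment the same key/value pairs would be supplied to a Connect worker as connector properties; the S3-specific settings (bucket, credentials, prefixes) are not yet defined in this commit.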
+ +**Table of Contents** + +- [How it works](#how-it-works) +- [Data Format](#data-format) +- [Usage](#usage) +- [Configuration](#configuration) +- [Development](#development) + + +## How it works + +The connector connects to Amazon S3 and periodically queries its data +sources. Each object from the s3 response is transformed into a record and +published into the corresponding Kafka topic. + +### Requirements + +The connector requires Java 11 or newer for development and production. + +## TODO update documentation + +## Development + +### Developing together with Commons library + +This project depends on [Common Module for Apache Kafka Connect](../commons/README.md). + +### Integration testing + +Integration tests are implemented using JUnit, Gradle and Docker. + +To run them, you need: +- Docker installed. + +Integration testing doesn't require valid AWS credentials. + +To simulate AWS S3 behaviour, tests use [LocalStack](https://github.com/localstack/localstack-java-utils). + +In order to run the integration tests, execute from the project root +directory: + +```bash +./gradlew clean integrationTest +``` + +## License + +This project is licensed under the [Apache License, Version 2.0](LICENSE). + +## Trademarks + +Apache Kafka, Apache Kafka Connect are either registered trademarks or trademarks of the Apache Software Foundation in the United States and/or other countries. AWS S3 is a trademark and property of their respective owners. All product and service names used in this website are for identification purposes only and do not imply endorsement. diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts new file mode 100644 index 000000000..57eb9e259 --- /dev/null +++ b/s3-source-connector/build.gradle.kts @@ -0,0 +1,257 @@ +import com.github.spotbugs.snom.SpotBugsTask + +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +plugins { id("aiven-apache-kafka-connectors-all.java-conventions") } + +val integrationTest: SourceSet = + sourceSets.create("integrationTest") { + java { srcDir("src/integration-test/java") } + resources { srcDir("src/integration-test/resources") } + compileClasspath += sourceSets.main.get().output + configurations.testRuntimeClasspath.get() + runtimeClasspath += output + compileClasspath + } + +val integrationTestImplementation: Configuration by + configurations.getting { extendsFrom(configurations.implementation.get()) } + +tasks.register("integrationTest") { + description = "Runs the integration tests." + group = "verification" + testClassesDirs = integrationTest.output.classesDirs + classpath = integrationTest.runtimeClasspath + + // defines testing order + shouldRunAfter("test") + // requires archive for connect runner + dependsOn("distTar") + useJUnitPlatform() + + // Run always. 
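+  // Declaring the outputs as never up to date forces Gradle to re-run integrationTest on every invocation instead of skipping it as UP-TO-DATE.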
+ outputs.upToDateWhen { false } + + val distTarTask = tasks.get("distTar") as Tar + val distributionFilePath = distTarTask.archiveFile.get().asFile.path + systemProperty("integration-test.distribution.file.path", distributionFilePath) +} + +idea { + module { + testSources.from(integrationTest.java.srcDirs) + testSources.from(integrationTest.resources.srcDirs) + } +} + +dependencies { + compileOnly(apache.kafka.connect.api) + compileOnly(apache.kafka.connect.runtime) + + implementation(project(":commons")) + + implementation(tools.spotbugs.annotations) + implementation(logginglibs.slf4j) + + testImplementation(compressionlibs.snappy) + testImplementation(compressionlibs.zstd.jni) + + testImplementation(apache.kafka.connect.api) + testImplementation(apache.kafka.connect.runtime) + testImplementation(apache.kafka.connect.json) + + testImplementation(testinglibs.junit.jupiter) + testImplementation(testinglibs.assertj.core) + + testImplementation(testinglibs.mockito.core) + + testRuntimeOnly(testinglibs.junit.jupiter.engine) + testImplementation(testinglibs.mockito.junit.jupiter) + + testRuntimeOnly(logginglibs.logback.classic) + + integrationTestImplementation(testinglibs.localstack) + integrationTestImplementation(testcontainers.junit.jupiter) + integrationTestImplementation(testcontainers.kafka) // this is not Kafka version + integrationTestImplementation(testcontainers.localstack) + integrationTestImplementation(testinglibs.wiremock) + + // TODO: add avro-converter to ConnectRunner via plugin.path instead of on worker classpath + integrationTestImplementation(confluent.kafka.connect.avro.converter) { + exclude(group = "org.apache.kafka", module = "kafka-clients") + } + + integrationTestImplementation(apache.avro) + + testImplementation(apache.parquet.tools) { exclude(group = "org.slf4j", module = "slf4j-api") } + testImplementation(apache.hadoop.mapreduce.client.core) { + exclude(group = "org.apache.hadoop", module = "hadoop-yarn-client") + exclude(group = "org.apache.hadoop.thirdparty", module = "hadoop-shaded-protobuf_3_7") + exclude(group = "com.google.guava", module = "guava") + exclude(group = "commons-cli", module = "commons-cli") + exclude(group = "org.apache.commons", module = "commons-math3") + exclude(group = "org.apache.httpcomponents", module = "httpclient") + exclude(group = "commons-codec", module = "commons-codec") + exclude(group = "commons-io", module = "commons-io") + exclude(group = "commons-net", module = "commons-net") + exclude(group = "org.eclipse.jetty") + exclude(group = "org.eclipse.jetty.websocket") + exclude(group = "javax.servlet") + exclude(group = "javax.servlet.jsp") + exclude(group = "javax.activation") + exclude(group = "com.sun.jersey") + exclude(group = "log4j") + exclude(group = "org.apache.commons", module = "commons-text") + exclude(group = "org.slf4j", module = "slf4j-api") + exclude(group = "org.apache.hadoop", module = "hadoop-auth") + exclude(group = "org.apache.hadoop", module = "hadoop-yarn-api") + exclude(group = "com.google.re2j") + exclude(group = "com.google.protobuf") + exclude(group = "com.google.code.gson") + exclude(group = "com.jcraft") + exclude(group = "org.apache.curator") + exclude(group = "org.apache.zookeeper") + exclude(group = "org.apache.htrace") + exclude(group = "com.google.code.findbugs") + exclude(group = "org.apache.kerby") + exclude(group = "com.fasterxml.jackson.core") + exclude(group = "com.fasterxml.woodstox", module = "woodstox-core:5.0.3") + exclude(group = "org.apache.avro", module = "avro") + exclude(group = 
"org.apache.hadoop", module = "hadoop-yarn-common") + exclude(group = "com.google.inject.extensions", module = "guice-servlet") + exclude(group = "io.netty", module = "netty") + } + + // Make test utils from 'test' available in 'integration-test' + integrationTestImplementation(sourceSets["test"].output) + integrationTestImplementation(testinglibs.awaitility) +} + +tasks.named("pmdIntegrationTest") { + ruleSetFiles = files("${project.rootDir}/gradle-config/aiven-pmd-test-ruleset.xml") + ruleSets = emptyList() // Clear the default rulesets +} + +tasks.named("spotbugsIntegrationTest") { + reports.create("html") { setStylesheet("fancy-hist.xsl") } +} + +tasks.processResources { + filesMatching("s3-source-connector-for-apache-kafka-version.properties") { + expand(mapOf("version" to version)) + } +} + +tasks.jar { manifest { attributes(mapOf("Version" to project.version)) } } + +tasks.distTar { dependsOn(":commons:jar") } + +tasks.distZip { dependsOn(":commons:jar") } + +distributions { + main { + contents { + from("jar") + from(configurations.runtimeClasspath.get().map { if (it.isDirectory) it else zipTree(it) }) + + into("/") { + from("$projectDir") + include("version.txt", "README*", "LICENSE*", "NOTICE*", "licenses/") + include("config/") + } + } + } +} + +publishing { + publications { + create("publishMavenJavaArtifact") { + groupId = group.toString() + artifactId = "s3-source-connector-for-apache-kafka" + version = version.toString() + + from(components["java"]) + + pom { + name = "Aiven's S3 Source Connector for Apache Kafka" + description = "Aiven's S3 Source Connector for Apache Kafka" + url = "https://github.com/aiven-open/s3-source-connector-for-apache-kafka" + organization { + name = "Aiven Oy" + url = "https://aiven.io" + } + + licenses { + license { + name = "Apache 2.0" + url = "http://www.apache.org/licenses/LICENSE-2.0" + distribution = "repo" + } + } + + developers { + developer { + id = "aiven" + name = "Aiven Opensource" + email = "opensource@aiven.io" + } + } + + scm { + connection = "scm:git:git://github.com:aiven/s3-source-connector-for-apache-kafka.git" + developerConnection = + "scm:git:ssh://github.com:aiven/s3-source-connector-for-apache-kafka.git" + url = "https://github.com/aiven-open/s3-source-connector-for-apache-kafka" + } + } + } + } + + repositories { + maven { + name = "sonatype" + + val releasesRepoUrl = uri("https://oss.sonatype.org/service/local/staging/deploy/maven2") + val snapshotsRepoUrl = uri("https://oss.sonatype.org/content/repositories/snapshots") + url = if (version.toString().endsWith("SNAPSHOT")) snapshotsRepoUrl else releasesRepoUrl + + credentials(PasswordCredentials::class) + } + } +} + +signing { + sign(publishing.publications["publishMavenJavaArtifact"]) + useGpgCmd() + // Some issue in the plugin: + // GPG outputs already armored signatures. The plugin also does armoring for `asc` files. + // This results in double armored signatures, i.e. garbage. + // Override the signature type provider to use unarmored output for `asc` files, which works well + // with GPG. 
+ class ASCSignatureProvider() : AbstractSignatureTypeProvider() { + val binary = + object : BinarySignatureType() { + override fun getExtension(): String { + return "asc" + } + } + + init { + register(binary) + setDefaultType(binary.extension) + } + } + signatureTypes = ASCSignatureProvider() +} diff --git a/s3-source-connector/gradle.properties b/s3-source-connector/gradle.properties new file mode 100644 index 000000000..e1c4d767e --- /dev/null +++ b/s3-source-connector/gradle.properties @@ -0,0 +1,4 @@ +version=0.0.1-SNAPSHOT + +sonatypeUsername= +sonatypePassword= diff --git a/s3-source-connector/licenses/LICENSE-aws.txt b/s3-source-connector/licenses/LICENSE-aws.txt new file mode 100644 index 000000000..aeea99958 --- /dev/null +++ b/s3-source-connector/licenses/LICENSE-aws.txt @@ -0,0 +1,63 @@ +Apache License +Version 2.0, January 2004 + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + + 1. You must give any other recipients of the Work or Derivative Works a copy of this License; and + 2. You must cause any modified files to carry prominent notices stating that You changed the files; and + 3. You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and + 4. If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. + +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +Note: Other license terms may apply to certain, identified software files contained within or distributed with the accompanying software if such terms are included in the directory containing the accompanying software. 
Such other license terms will then apply in lieu of the terms of the software license above. + +JSON processing code subject to the JSON License from JSON.org: + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +The Software shall be used for Good, not Evil. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/s3-source-connector/notices/NOTICE-aws.txt b/s3-source-connector/notices/NOTICE-aws.txt new file mode 100644 index 000000000..979460ec7 --- /dev/null +++ b/s3-source-connector/notices/NOTICE-aws.txt @@ -0,0 +1,13 @@ +AWS IoT Device SDK for Java +Copyright 2010-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +This product includes software developed by +Amazon Technologies, Inc (http://www.amazon.com/). + +********************** +THIRD PARTY COMPONENTS +********************** +This software includes third party software subject to the following copyrights: +- PKCS#1 and PKCS#8 PEM encoded private key parsing and utility functions from oauth.googlecode.com - Copyright 1998-2010 AOL Inc. + +The licenses for these third party components are included in LICENSE.txt \ No newline at end of file diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java new file mode 100644 index 000000000..5aab1c99f --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java @@ -0,0 +1,119 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.source.s3; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ExecutionException; + +import org.apache.kafka.common.utils.Time; +import org.apache.kafka.connect.runtime.Connect; +import org.apache.kafka.connect.runtime.ConnectorConfig; +import org.apache.kafka.connect.runtime.Herder; +import org.apache.kafka.connect.runtime.Worker; +import org.apache.kafka.connect.runtime.isolation.Plugins; +import org.apache.kafka.connect.runtime.rest.RestServer; +import org.apache.kafka.connect.runtime.rest.entities.ConnectorInfo; +import org.apache.kafka.connect.runtime.standalone.StandaloneConfig; +import org.apache.kafka.connect.runtime.standalone.StandaloneHerder; +import org.apache.kafka.connect.storage.MemoryOffsetBackingStore; +import org.apache.kafka.connect.util.FutureCallback; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +final class ConnectRunner { + private static final Logger LOGGER = LoggerFactory.getLogger(ConnectRunner.class); + + private final File pluginDir; + private final String bootstrapServers; + private final int offsetFlushInterval; + + private Herder herder; + private Connect connect; + + public ConnectRunner(final File pluginDir, final String bootstrapServers, final int offsetFlushIntervalMs) { + this.pluginDir = pluginDir; + this.bootstrapServers = bootstrapServers; + this.offsetFlushInterval = offsetFlushIntervalMs; + } + + void start() { + final Map workerProps = new HashMap<>(); + workerProps.put("bootstrap.servers", bootstrapServers); + + workerProps.put("offset.flush.interval.ms", Integer.toString(offsetFlushInterval)); + + // These don't matter much (each connector sets its own converters), but need to be filled with valid classes. + workerProps.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + workerProps.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + workerProps.put("internal.key.converter", "org.apache.kafka.connect.json.JsonConverter"); + workerProps.put("internal.key.converter.schemas.enable", "false"); + workerProps.put("internal.value.converter", "org.apache.kafka.connect.json.JsonConverter"); + workerProps.put("internal.value.converter.schemas.enable", "false"); + + // Don't need it since we'll memory MemoryOffsetBackingStore. 
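+        // StandaloneConfig requires offset.storage.file.filename to be present, so an empty value is supplied; the MemoryOffsetBackingStore passed to the Worker below means the file is never actually used.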
+ workerProps.put("offset.storage.file.filename", ""); + + workerProps.put("plugin.path", pluginDir.getPath()); + + final Time time = Time.SYSTEM; + final String workerId = "test-worker"; + final String kafkaClusterId = "test-cluster"; + + final Plugins plugins = new Plugins(workerProps); + final StandaloneConfig config = new StandaloneConfig(workerProps); + + final Worker worker = new Worker(workerId, time, plugins, config, new MemoryOffsetBackingStore()); + herder = new StandaloneHerder(worker, kafkaClusterId); + + final RestServer rest = new RestServer(config); + + connect = new Connect(herder, rest); + + connect.start(); + } + + void createConnector(final Map config) throws ExecutionException, InterruptedException { + assert herder != null; + + final FutureCallback> callback = new FutureCallback<>((error, info) -> { + if (error != null) { + LOGGER.error("Failed to create job"); + } else { + LOGGER.info("Created connector {}", info.result().name()); + } + }); + herder.putConnectorConfig(config.get(ConnectorConfig.NAME_CONFIG), config, false, callback); + + final Herder.Created connectorInfoCreated = callback.get(); + assert connectorInfoCreated.created(); + assertThat(connectorInfoCreated.result().config().get("connector.class")) + .isEqualTo(AivenKafkaConnectS3SourceConnector.class.getName()); + } + + void stop() { + connect.stop(); + } + + void awaitStop() { + connect.awaitStop(); + } +} diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java new file mode 100644 index 000000000..be21ec8f1 --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java @@ -0,0 +1,94 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.source.s3; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.time.Duration; +import java.util.List; +import java.util.Properties; +import java.util.concurrent.ExecutionException; +import java.util.stream.Collectors; + +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.AdminClientConfig; +import org.apache.kafka.clients.admin.NewTopic; + +import com.github.dockerjava.api.model.Ulimit; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.TestInfo; +import org.testcontainers.containers.Container; +import org.testcontainers.containers.KafkaContainer; +import org.testcontainers.containers.Network; +import org.testcontainers.utility.DockerImageName; + +public interface IntegrationBase { + + String DOCKER_IMAGE_KAFKA = "confluentinc/cp-kafka:7.7.0"; + + default AdminClient newAdminClient(final KafkaContainer kafka) { + final Properties adminClientConfig = new Properties(); + adminClientConfig.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, kafka.getBootstrapServers()); + return AdminClient.create(adminClientConfig); + } + + default ConnectRunner newConnectRunner(final KafkaContainer kafka, final File pluginDir, + final int offsetFlushIntervalMs) { + return new ConnectRunner(pluginDir, kafka.getBootstrapServers(), offsetFlushIntervalMs); + } + + static void extractConnectorPlugin(File pluginDir) throws IOException, InterruptedException { + final File distFile = new File(System.getProperty("integration-test.distribution.file.path")); + assert distFile.exists(); + + final String cmd = String.format("tar -xf %s --strip-components=1 -C %s", distFile, pluginDir.toString()); + final Process process = Runtime.getRuntime().exec(cmd); + assert process.waitFor() == 0; + } + + static File getPluginDir() throws IOException { + final File testDir = Files.createTempDirectory("s3-source-connector-for-apache-kafka-test-").toFile(); + + final File pluginDir = new File(testDir, "plugins/s3-source-connector-for-apache-kafka/"); + assert pluginDir.mkdirs(); + return pluginDir; + } + + static KafkaContainer createKafkaContainer() { + return new KafkaContainer(DockerImageName.parse(DOCKER_IMAGE_KAFKA)) + .withEnv("KAFKA_AUTO_CREATE_TOPICS_ENABLE", "false") + .withNetwork(Network.newNetwork()) + .withExposedPorts(KafkaContainer.KAFKA_PORT, 9092) + .withCreateContainerCmdModifier( + cmd -> cmd.getHostConfig().withUlimits(List.of(new Ulimit("nofile", 30_000L, 30_000L)))); + } + + static String topicName(final TestInfo testInfo) { + return testInfo.getTestMethod().get().getName() + "-" + testInfo.getDisplayName().hashCode(); + } + + static void createTopics(final AdminClient adminClient, final List topicNames) + throws ExecutionException, InterruptedException { + final var newTopics = topicNames.stream().map(s -> new NewTopic(s, 4, (short) 1)).collect(Collectors.toList()); + adminClient.createTopics(newTopics).all().get(); + } + + static void waitForRunningContainer(final Container kafka) { + Awaitility.await().atMost(Duration.ofMinutes(1)).until(kafka::isRunning); + } +} diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java new file mode 100644 index 000000000..921f97715 --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java @@ -0,0 +1,107 @@ +/* + * Copyright 2024 
Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.source.s3; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; + +import org.apache.kafka.clients.admin.AdminClient; + +import org.junit.Ignore; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; +import org.testcontainers.containers.KafkaContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +@Ignore +@Testcontainers +final class IntegrationTest implements IntegrationBase { + private static final String CONNECTOR_NAME = "aiven-s3-source-connector"; + private static final int OFFSET_FLUSH_INTERVAL_MS = 5000; + + private static File pluginDir; + + @Container + private static final KafkaContainer KAFKA = IntegrationBase.createKafkaContainer(); + private AdminClient adminClient; + private ConnectRunner connectRunner; + + @BeforeAll + static void setUpAll() throws IOException, InterruptedException { + pluginDir = IntegrationBase.getPluginDir(); + IntegrationBase.extractConnectorPlugin(pluginDir); + IntegrationBase.waitForRunningContainer(KAFKA); + } + + @BeforeEach + void setUp(final TestInfo testInfo) throws ExecutionException, InterruptedException { + adminClient = newAdminClient(KAFKA); + + final var topicName = IntegrationBase.topicName(testInfo); + final var topics = List.of(topicName); + IntegrationBase.createTopics(adminClient, topics); + + connectRunner = newConnectRunner(KAFKA, pluginDir, OFFSET_FLUSH_INTERVAL_MS); + connectRunner.start(); + } + + @AfterEach + void tearDown() { + connectRunner.stop(); + adminClient.close(); + + connectRunner.awaitStop(); + } + + @Test + void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedException { + final var topicName = IntegrationBase.topicName(testInfo); + final Map connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + connectRunner.createConnector(connectorConfig); + + assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); + } + + private Map getConfig(final Map config, final String topicName) { + return getConfig(config, List.of(topicName)); + } + + private Map getConfig(final Map config, final List topicNames) { + config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); + config.put("topics", String.join(",", topicNames)); + return config; + } + + private Map basicConnectorConfig(final String connectorName) { + final Map config = new HashMap<>(); + config.put("name", connectorName); + config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + config.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); 
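+        // A single task is enough for this smoke test; the connector's taskConfigs() currently returns an empty list, so no per-task work is distributed yet.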
+ config.put("tasks.max", "1"); + return config; + } +} diff --git a/s3-source-connector/src/integration-test/resources/logback-test.xml b/s3-source-connector/src/integration-test/resources/logback-test.xml new file mode 100644 index 000000000..fd146afc4 --- /dev/null +++ b/s3-source-connector/src/integration-test/resources/logback-test.xml @@ -0,0 +1,17 @@ + + + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + + + \ No newline at end of file diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java new file mode 100644 index 000000000..16488e130 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java @@ -0,0 +1,69 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.source.s3; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.connect.connector.Task; +import org.apache.kafka.connect.source.SourceConnector; + +import io.aiven.kafka.connect.source.s3.config.S3SourceConfig; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * AivenKafkaConnectS3SourceConnector is a Kafka Connect Connector implementation that watches a S3 bucket and generates + * tasks to ingest contents. + */ +public class AivenKafkaConnectS3SourceConnector extends SourceConnector { + + private static final Logger LOGGER = LoggerFactory.getLogger(AivenKafkaConnectS3SourceConnector.class); + + @Override + public ConfigDef config() { + return S3SourceConfig.configDef(); + } + + @Override + public String version() { + return Version.VERSION; + } + + @Override + public Class taskClass() { + return S3SourceTask.class; + } + + @Override + public List> taskConfigs(final int maxTasks) { + return Collections.emptyList(); + } + + @Override + public void start(final Map properties) { + LOGGER.info("Start S3 Source connector"); + } + + @Override + public void stop() { + LOGGER.info("Stop S3 Source connector"); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java new file mode 100644 index 000000000..704579fba --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java @@ -0,0 +1,62 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.source.s3; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +import org.apache.kafka.connect.source.SourceRecord; +import org.apache.kafka.connect.source.SourceTask; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * S3SourceTask is a Kafka Connect SourceTask implementation that reads from source-s3 buckets and generates Kafka + * Connect records. + */ +public class S3SourceTask extends SourceTask { + + private static final Logger LOGGER = LoggerFactory.getLogger(AivenKafkaConnectS3SourceConnector.class); + + @SuppressWarnings("PMD.UnnecessaryConstructor") // required by Connect + public S3SourceTask() { + super(); + } + + @Override + public String version() { + return null; + } + + @Override + public void start(final Map props) { + LOGGER.info("S3 Source task started."); + Objects.requireNonNull(props, "props hasn't been set"); + } + + @Override + public List poll() { + return Collections.emptyList(); + } + + @Override + public void stop() { + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java new file mode 100644 index 000000000..b5e5cdc85 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java @@ -0,0 +1,43 @@ +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.source.s3; + +import java.io.InputStream; +import java.util.Properties; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +final class Version { + private static final Logger LOGGER = LoggerFactory.getLogger(Version.class); + + private static final String PROPERTIES_FILENAME = "s3-source-connector-for-apache-kafka-version.properties"; + + static final String VERSION; // NOPMD AvoidFieldNameMatchingTypeName + + static { + final Properties props = new Properties(); + try (InputStream resourceStream = Thread.currentThread() + .getContextClassLoader() + .getResourceAsStream(PROPERTIES_FILENAME)) { + props.load(resourceStream); + } catch (final Exception e) { // NOPMD AvoidCatchingGenericException + LOGGER.warn("Error while loading {}: {}", PROPERTIES_FILENAME, e.getMessage()); + } + VERSION = props.getProperty("version", "unknown").trim(); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java new file mode 100644 index 000000000..a77caeec4 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java @@ -0,0 +1,68 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.source.s3.config; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.apache.kafka.common.config.ConfigDef; + +import io.aiven.kafka.connect.common.config.AivenCommonConfig; +import io.aiven.kafka.connect.common.config.CompressionType; +import io.aiven.kafka.connect.common.config.OutputField; +import io.aiven.kafka.connect.common.config.OutputFieldEncodingType; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@SuppressWarnings({ "PMD.TooManyMethods", "PMD.GodClass", "PMD.ExcessiveImports" }) +final public class S3SourceConfig extends AivenCommonConfig { + + public static final Logger LOGGER = LoggerFactory.getLogger(S3SourceConfig.class); + + public S3SourceConfig(final Map properties) { + super(configDef(), preprocessProperties(properties)); + validate(); + } + + static Map preprocessProperties(final Map properties) { + return Collections.emptyMap(); + } + + public static ConfigDef configDef() { + return new S3SourceConfigDef(); + } + + private void validate() { + LOGGER.debug("Validating config."); + } + @Override + public CompressionType getCompressionType() { + return CompressionType.GZIP; + } + + @Override + public List getOutputFields() { + return Collections.emptyList(); + } + + @Override + public OutputFieldEncodingType getOutputFieldEncodingType() { + return null; + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java new file mode 100644 index 000000000..12fa37d77 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java @@ -0,0 +1,30 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.source.s3.config; + +import java.util.List; +import java.util.Map; + +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.common.config.ConfigValue; + +public class S3SourceConfigDef extends ConfigDef { + @Override + public List validate(final Map props) { + return super.validate(S3SourceConfig.preprocessProperties(props)); + } +} diff --git a/s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties b/s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties new file mode 100644 index 000000000..9c2421c8a --- /dev/null +++ b/s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties @@ -0,0 +1,16 @@ +## +# Copyright 2024 Aiven Oy +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +## +version=${version ?: 'unknown'} diff --git a/s3-source-connector/src/test/resources/blns.txt b/s3-source-connector/src/test/resources/blns.txt new file mode 100644 index 000000000..ef5671914 --- /dev/null +++ b/s3-source-connector/src/test/resources/blns.txt @@ -0,0 +1,739 @@ +# The MIT License (MIT) +# +# Copyright (c) 2015 Max Woolf +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE + +# +# The Big List of Naughty Strings +# https://github.com/minimaxir/big-list-of-naughty-strings +# + +# Reserved Strings +# +# Strings which may be used elsewhere in code + +undefined +undef +null +NULL +(null) +nil +NIL +true +false +True +False +TRUE +FALSE +None +hasOwnProperty +then +\ +\\ + +# Numeric Strings +# +# Strings which can be interpreted as numeric + +0 +1 +1.00 +$1.00 +1/2 +1E2 +1E02 +1E+02 +-1 +-1.00 +-$1.00 +-1/2 +-1E2 +-1E02 +-1E+02 +1/0 +0/0 +-2147483648/-1 +-9223372036854775808/-1 +-0 +-0.0 ++0 ++0.0 +0.00 +0..0 +. +0.0.0 +0,00 +0,,0 +, +0,0,0 +0.0/0 +1.0/0.0 +0.0/0.0 +1,0/0,0 +0,0/0,0 +--1 +- +-. +-, +999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999 +NaN +Infinity +-Infinity +INF +1#INF +-1#IND +1#QNAN +1#SNAN +1#IND +0x0 +0xffffffff +0xffffffffffffffff +0xabad1dea +123456789012345678901234567890123456789 +1,000.00 +1 000.00 +1'000.00 +1,000,000.00 +1 000 000.00 +1'000'000.00 +1.000,00 +1 000,00 +1'000,00 +1.000.000,00 +1 000 000,00 +1'000'000,00 +01000 +08 +09 +2.2250738585072011e-308 + +# Special Characters +# +# ASCII punctuation. All of these characters may need to be escaped in some +# contexts. Divided into three groups based on (US-layout) keyboard position. + +,./;'[]\-= +<>?:"{}|_+ +!@#$%^&*()`~ + +# Non-whitespace C0 controls: U+0001 through U+0008, U+000E through U+001F, +# and U+007F (DEL) +# Often forbidden to appear in various text-based file formats (e.g. XML), +# or reused for internal delimiters on the theory that they should never +# appear in input. +# The next line may appear to be blank or mojibake in some viewers. + + +# Non-whitespace C1 controls: U+0080 through U+0084 and U+0086 through U+009F. 
+# Commonly misinterpreted as additional graphic characters. +# The next line may appear to be blank, mojibake, or dingbats in some viewers. +€‚ƒ„†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ + +# Whitespace: all of the characters with category Zs, Zl, or Zp (in Unicode +# version 8.0.0), plus U+0009 (HT), U+000B (VT), U+000C (FF), U+0085 (NEL), +# and U+200B (ZERO WIDTH SPACE), which are in the C categories but are often +# treated as whitespace in some contexts. +# This file unfortunately cannot express strings containing +# U+0000, U+000A, or U+000D (NUL, LF, CR). +# The next line may appear to be blank or mojibake in some viewers. +# The next line may be flagged for "trailing whitespace" in some viewers. + …             ​

    + +# Unicode additional control characters: all of the characters with +# general category Cf (in Unicode 8.0.0). +# The next line may appear to be blank or mojibake in some viewers. +­؀؁؂؃؄؅؜۝܏᠎​‌‍‎‏‪‫‬‭‮⁠⁡⁢⁣⁤⁦⁧⁨⁩𑂽𛲠𛲡𛲢𛲣𝅳𝅴𝅵𝅶𝅷𝅸𝅹𝅺󠀁󠀠󠀡󠀢󠀣󠀤󠀥󠀦󠀧󠀨󠀩󠀪󠀫󠀬󠀭󠀮󠀯󠀰󠀱󠀲󠀳󠀴󠀵󠀶󠀷󠀸󠀹󠀺󠀻󠀼󠀽󠀾󠀿󠁀󠁁󠁂󠁃󠁄󠁅󠁆󠁇󠁈󠁉󠁊󠁋󠁌󠁍󠁎󠁏󠁐󠁑󠁒󠁓󠁔󠁕󠁖󠁗󠁘󠁙󠁚󠁛󠁜󠁝󠁞󠁟󠁠󠁡󠁢󠁣󠁤󠁥󠁦󠁧󠁨󠁩󠁪󠁫󠁬󠁭󠁮󠁯󠁰󠁱󠁲󠁳󠁴󠁵󠁶󠁷󠁸󠁹󠁺󠁻󠁼󠁽󠁾󠁿 + +# "Byte order marks", U+FEFF and U+FFFE, each on its own line. +# The next two lines may appear to be blank or mojibake in some viewers. + +￾ + +# Unicode Symbols +# +# Strings which contain common unicode symbols (e.g. smart quotes) + +Ω≈ç√∫˜µ≤≥÷ +åß∂ƒ©˙∆˚¬…æ +œ∑´®†¥¨ˆøπ“‘ +¡™£¢∞§¶•ªº–≠ +¸˛Ç◊ı˜Â¯˘¿ +ÅÍÎÏ˝ÓÔÒÚÆ☃ +Œ„´‰ˇÁ¨ˆØ∏”’ +`⁄€‹›fifl‡°·‚—± +⅛⅜⅝⅞ +ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя +٠١٢٣٤٥٦٧٨٩ + +# Unicode Subscript/Superscript/Accents +# +# Strings which contain unicode subscripts/superscripts; can cause rendering issues + +⁰⁴⁵ +₀₁₂ +⁰⁴⁵₀₁₂ +ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ + +# Quotation Marks +# +# Strings which contain misplaced quotation marks; can cause encoding errors + +' +" +'' +"" +'"' +"''''"'" +"'"'"''''" + + + + + +# Two-Byte Characters +# +# Strings which contain two-byte characters: can cause rendering issues or character-length issues + +田中さんにあげて下さい +パーティーへ行かないか +和製漢語 +部落格 +사회과학원 어학연구소 +찦차를 타고 온 펲시맨과 쑛다리 똠방각하 +社會科學院語學研究所 +울란바토르 +𠜎𠜱𠝹𠱓𠱸𠲖𠳏 + +# Special Unicode Characters Union +# +# A super string recommended by VMware Inc. Globalization Team: can effectively cause rendering issues or character-length issues to validate product globalization readiness. 
+# +# 表 CJK_UNIFIED_IDEOGRAPHS (U+8868) +# ポ KATAKANA LETTER PO (U+30DD) +# あ HIRAGANA LETTER A (U+3042) +# A LATIN CAPITAL LETTER A (U+0041) +# 鷗 CJK_UNIFIED_IDEOGRAPHS (U+9DD7) +# Œ LATIN SMALL LIGATURE OE (U+0153) +# é LATIN SMALL LETTER E WITH ACUTE (U+00E9) +# B FULLWIDTH LATIN CAPITAL LETTER B (U+FF22) +# 逍 CJK_UNIFIED_IDEOGRAPHS (U+900D) +# Ü LATIN SMALL LETTER U WITH DIAERESIS (U+00FC) +# ß LATIN SMALL LETTER SHARP S (U+00DF) +# ª FEMININE ORDINAL INDICATOR (U+00AA) +# ą LATIN SMALL LETTER A WITH OGONEK (U+0105) +# ñ LATIN SMALL LETTER N WITH TILDE (U+00F1) +# 丂 CJK_UNIFIED_IDEOGRAPHS (U+4E02) +# 㐀 CJK Ideograph Extension A, First (U+3400) +# 𠀀 CJK Ideograph Extension B, First (U+20000) + +表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀 + +# Changing length when lowercased +# +# Characters which increase in length (2 to 3 bytes) when lowercased +# Credit: https://twitter.com/jifa/status/625776454479970304 + +Ⱥ +Ⱦ + +# Japanese Emoticons +# +# Strings which consists of Japanese-style emoticons which are popular on the web + +ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ +(。◕ ∀ ◕。) +`ィ(´∀`∩ +__ロ(,_,*) +・( ̄∀ ̄)・:*: +゚・✿ヾ╲(。◕‿◕。)╱✿・゚ +,。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’ +(╯°□°)╯︵ ┻━┻) +(ノಥ益ಥ)ノ ┻━┻ +┬─┬ノ( º _ ºノ) +( ͡° ͜ʖ ͡°) +¯\_(ツ)_/¯ + +# Emoji +# +# Strings which contain Emoji; should be the same behavior as two-byte characters, but not always + +😍 +👩🏽 +👾 🙇 💁 🙅 🙆 🙋 🙎 🙍 +🐵 🙈 🙉 🙊 +❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙 +✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿 +🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧 +0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ 🔟 + +# Regional Indicator Symbols +# +# Regional Indicator Symbols can be displayed differently across +# fonts, and have a number of special behaviors + +🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸 +🇺🇸🇷🇺🇸🇦🇫🇦🇲 +🇺🇸🇷🇺🇸🇦 + +# Unicode Numbers +# +# Strings which contain unicode numbers; if the code is localized, it should see the input as numeric + +123 +١٢٣ + +# Right-To-Left Strings +# +# Strings which contain text that should be rendered RTL if possible (e.g. Arabic, Hebrew) + +ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو. +בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ +הָיְתָהtestالصفحات التّحول +﷽ +ﷺ +مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ، + +# Trick Unicode +# +# Strings which contain unicode with unusual properties (e.g. Right-to-left override) (c.f. http://www.unicode.org/charts/PDF/U2000.pdf) + +‪‪test‪ +‫test‫ +
test
 +test⁠test‫ +⁦test⁧ + +# Zalgo Text +# +# Strings which contain "corrupted" text. The corruption will not appear in non-HTML text, however. (via http://www.eeemo.net) + +Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣ +̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰ +̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟ +̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕ +Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮ + +# Unicode Upsidedown +# +# Strings which contain unicode with an "upsidedown" effect (via http://www.upsidedowntext.com) + +˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥ +00˙Ɩ$- + +# Unicode font +# +# Strings which contain bold/italic/etc. versions of normal characters + +The quick brown fox jumps over the lazy dog +𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠 +𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌 +𝑻𝒉𝒆 𝒒𝒖𝒊𝒄𝒌 𝒃𝒓𝒐𝒘𝒏 𝒇𝒐𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐𝒗𝒆𝒓 𝒕𝒉𝒆 𝒍𝒂𝒛𝒚 𝒅𝒐𝒈 +𝓣𝓱𝓮 𝓺𝓾𝓲𝓬𝓴 𝓫𝓻𝓸𝔀𝓷 𝓯𝓸𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸𝓿𝓮𝓻 𝓽𝓱𝓮 𝓵𝓪𝔃𝔂 𝓭𝓸𝓰 +𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 +𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐 +⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢ + +# Script Injection +# +# Strings which attempt to invoke a benign script injection; shows vulnerability to XSS + + +<script>alert('123');</script> + + +"> +'> +> + +< / script >< script >alert(123)< / script > + onfocus=JaVaSCript:alert(123) autofocus +" onfocus=JaVaSCript:alert(123) autofocus +' onfocus=JaVaSCript:alert(123) autofocus +<script>alert(123)</script> +ript>alert(123)ript> +--> +";alert(123);t=" +';alert(123);t=' +JavaSCript:alert(123) +;alert(123); +src=JaVaSCript:prompt(132) +">javascript:alert(1); +javascript:alert(1); +javascript:alert(1); +javascript:alert(1); +javascript:alert(1); +javascript:alert(1); +javascript:alert(1); +'`"><\x3Cscript>javascript:alert(1) +'`"><\x00script>javascript:alert(1) +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +`"'> +`"'> +`"'> +`"'> +`"'> +`"'> +`"'> +`"'> +`"'> +`"'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +XXX + + + +<a href=http://foo.bar/#x=`y></a><img alt="`><img src=x:x onerror=javascript:alert(1)></a>"> +<!--[if]><script>javascript:alert(1)</script --> +<!--[if<img src=x onerror=javascript:alert(1)//]> --> +<script src="/\%(jscript)s"></script> +<script src="\\%(jscript)s"></script> +<IMG """><SCRIPT>alert("XSS")</SCRIPT>"> +<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))> +<IMG SRC=# onmouseover="alert('xxs')"> +<IMG SRC= onmouseover="alert('xxs')"> +<IMG onmouseover="alert('xxs')"> +<IMG SRC=javascript:alert('XSS')> +<IMG SRC=javascript:alert('XSS')> +<IMG SRC=javascript:alert('XSS')> +<IMG SRC="jav ascript:alert('XSS');"> +<IMG SRC="jav ascript:alert('XSS');"> +<IMG SRC="jav ascript:alert('XSS');"> +<IMG SRC="jav ascript:alert('XSS');"> +perl -e 'print "<IMG SRC=java\0script:alert(\"XSS\")>";' > out +<IMG SRC="  javascript:alert('XSS');"> +<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT> +<BODY onload!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")> +<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT> +<<SCRIPT>alert("XSS");//<</SCRIPT> +<SCRIPT SRC=http://ha.ckers.org/xss.js?< B > +<SCRIPT SRC=//ha.ckers.org/.j> +<IMG SRC="javascript:alert('XSS')" +<iframe src=http://ha.ckers.org/scriptlet.html < +\";alert('XSS');// +<u oncopy=alert()> Copy me</u> +<i onwheel=alert(1)> Scroll over me </i> +<plaintext> +http://a/%%30%30 +</textarea><script>alert(123)</script> + +# SQL Injection +# +# Strings which can cause a SQL injection if inputs are not sanitized + +1;DROP TABLE users +1'; DROP TABLE users-- 1 +' OR 1=1 -- 1 +' OR '1'='1 + +% +_ + +# Server Code Injection +# +# Strings which can cause user to run code on server as a privileged user (c.f. https://news.ycombinator.com/item?id=7665153) + +- +-- +--version +--help +$USER +/dev/null; touch /tmp/blns.fail ; echo +`touch /tmp/blns.fail` +$(touch /tmp/blns.fail) +@{[system "touch /tmp/blns.fail"]} + +# Command Injection (Ruby) +# +# Strings which can call system commands within Ruby/Rails applications + +eval("puts 'hello world'") +System("ls -al /") +`ls -al /` +Kernel.exec("ls -al /") +Kernel.exit(1) +%x('ls -al /') + +# XXE Injection (XML) +# +# String which can reveal system files when parsed by a badly configured XML parser + +<?xml version="1.0" encoding="ISO-8859-1"?><!DOCTYPE foo [ <!ELEMENT foo ANY ><!ENTITY xxe SYSTEM "file:///etc/passwd" >]><foo>&xxe;</foo> + +# Unwanted Interpolation +# +# Strings which can be accidentally expanded into different strings if evaluated in the wrong context, e.g. used as a printf format string or via Perl or shell eval. Might expose sensitive data from the program doing the interpolation, or might just represent the wrong string. 
+ +$HOME +$ENV{'HOME'} +%d +%s%s%s%s%s +{0} +%*.*s +%@ +%n +File:/// + +# File Inclusion +# +# Strings which can cause user to pull in files that should not be a part of a web server + +../../../../../../../../../../../etc/passwd%00 +../../../../../../../../../../../etc/hosts + +# Known CVEs and Vulnerabilities +# +# Strings that test for known vulnerabilities + +() { 0; }; touch /tmp/blns.shellshock1.fail; +() { _; } >_[$($())] { touch /tmp/blns.shellshock2.fail; } +<<< %s(un='%s') = %u ++++ATH0 + +# MSDOS/Windows Special Filenames +# +# Strings which are reserved characters in MSDOS/Windows + +CON +PRN +AUX +CLOCK$ +NUL +A: +ZZ: +COM1 +LPT1 +LPT2 +LPT3 +COM2 +COM3 +COM4 + +# IRC specific strings +# +# Strings that may occur on IRC clients that make security products freak out + +DCC SEND STARTKEYLOGGER 0 0 0 + +# Scunthorpe Problem +# +# Innocuous strings which may be blocked by profanity filters (https://en.wikipedia.org/wiki/Scunthorpe_problem) + +Scunthorpe General Hospital +Penistone Community Church +Lightwater Country Park +Jimmy Clitheroe +Horniman Museum +shitake mushrooms +RomansInSussex.co.uk +http://www.cum.qc.ca/ +Craig Cockburn, Software Specialist +Linda Callahan +Dr. Herman I. Libshitz +magna cum laude +Super Bowl XXX +medieval erection of parapets +evaluate +mocha +expression +Arsenal canal +classic +Tyson Gay +Dick Van Dyke +basement + +# Human injection +# +# Strings which may cause human to reinterpret worldview + +If you're reading this, you've been in a coma for almost 20 years now. We're trying a new technique. We don't know where this message will end up in your dream, but we hope it works. Please wake up, we miss you. + +# Terminal escape codes +# +# Strings which punish the fools who use cat/type on this file + +Roses are red, violets are blue. Hope you enjoy terminal hue +But now...for my greatest trick... +The quick brown fox... 
[Beeeep] + +# iOS Vulnerabilities +# +# Strings which crashed iMessage in various versions of iOS + +Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗 +🏳0🌈️ +జ్ఞ‌ా diff --git a/s3-source-connector/src/test/resources/logback-test.xml b/s3-source-connector/src/test/resources/logback-test.xml new file mode 100644 index 000000000..f1d0b0cb6 --- /dev/null +++ b/s3-source-connector/src/test/resources/logback-test.xml @@ -0,0 +1,12 @@ +<configuration> + + <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> + <encoder> + <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> + </encoder> + </appender> + + <root level="debug"> + <appender-ref ref="STDOUT"/> + </root> +</configuration> diff --git a/settings.gradle.kts b/settings.gradle.kts index 5bded6986..1f4c61c96 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -98,3 +98,5 @@ include("gcs-sink-connector") include("s3-sink-connector") include("azure-sink-connector") + +include("s3-source-connector") From 093a0132dc515b8ff119d1d961818ab28d254385 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 9 Sep 2024 13:55:10 +0200 Subject: [PATCH 02/90] rename props file --- s3-source-connector/README.md | 9 +++---- .../source/s3/config/S3SourceConfig.java | 27 +++---------------- .../source/s3/config/S3SourceConfigDef.java | 2 +- ...ector-for-apache-kafka-version.properties} | 0 4 files changed, 8 insertions(+), 30 deletions(-) rename s3-source-connector/src/main/resources/{source-s3-connector-for-apache-kafka-version.properties => s3-source-connector-for-apache-kafka-version.properties} (100%) diff --git a/s3-source-connector/README.md b/s3-source-connector/README.md index 2ee9caacc..6f72e485a 100644 --- a/s3-source-connector/README.md +++ b/s3-source-connector/README.md @@ -1,13 +1,12 @@ # Aiven's S3 Source Connector for Apache Kafka -This is a source Apache Kafka Connect connector that stores Apache Kafka messages in an AWS S3 bucket. +This is a source Apache Kafka Connect connector that stores AWS S3 bucket objects in Apache Kafka. + +## This connector is in development phase ## **Table of Contents** - [How it works](#how-it-works) -- [Data Format](#data-format) -- [Usage](#usage) -- [Configuration](#configuration) - [Development](#development) @@ -21,7 +20,7 @@ published into the corresponding Kafka topic. The connector requires Java 11 or newer for development and production. 
-## TODO update documentation +### TODO update documentation ## Development diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java index a77caeec4..6dfe3d64b 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java @@ -17,30 +17,23 @@ package io.aiven.kafka.connect.source.s3.config; import java.util.Collections; -import java.util.List; import java.util.Map; import org.apache.kafka.common.config.ConfigDef; -import io.aiven.kafka.connect.common.config.AivenCommonConfig; -import io.aiven.kafka.connect.common.config.CompressionType; -import io.aiven.kafka.connect.common.config.OutputField; -import io.aiven.kafka.connect.common.config.OutputFieldEncodingType; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @SuppressWarnings({ "PMD.TooManyMethods", "PMD.GodClass", "PMD.ExcessiveImports" }) -final public class S3SourceConfig extends AivenCommonConfig { +final public class S3SourceConfig { public static final Logger LOGGER = LoggerFactory.getLogger(S3SourceConfig.class); - public S3SourceConfig(final Map<String, String> properties) { - super(configDef(), preprocessProperties(properties)); + public S3SourceConfig() { validate(); } - static Map<String, String> preprocessProperties(final Map<String, String> properties) { + static Map<String, String> preprocessProperties() { return Collections.emptyMap(); } @@ -51,18 +44,4 @@ public static ConfigDef configDef() { private void validate() { LOGGER.debug("Validating config."); } - @Override - public CompressionType getCompressionType() { - return CompressionType.GZIP; - } - - @Override - public List<OutputField> getOutputFields() { - return Collections.emptyList(); - } - - @Override - public OutputFieldEncodingType getOutputFieldEncodingType() { - return null; - } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java index 12fa37d77..7e549f903 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java @@ -25,6 +25,6 @@ public class S3SourceConfigDef extends ConfigDef { @Override public List<ConfigValue> validate(final Map<String, String> props) { - return super.validate(S3SourceConfig.preprocessProperties(props)); + return super.validate(S3SourceConfig.preprocessProperties()); } } diff --git a/s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties b/s3-source-connector/src/main/resources/s3-source-connector-for-apache-kafka-version.properties similarity index 100% rename from s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties rename to s3-source-connector/src/main/resources/s3-source-connector-for-apache-kafka-version.properties From 0b6b75d5d7732a8c060b8540bda084f30b3845f4 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Tue, 10 Sep 2024 10:38:13 +0200 Subject: [PATCH 03/90] EC-243 Rename package --- .../kafka/connect/{source/s3 => s3/source}/ConnectRunner.java | 2 +- .../connect/{source/s3 => s3/source}/IntegrationBase.java | 2 
+- .../connect/{source/s3 => s3/source}/IntegrationTest.java | 2 +- .../s3 => s3/source}/AivenKafkaConnectS3SourceConnector.java | 4 ++-- .../kafka/connect/{source/s3 => s3/source}/S3SourceTask.java | 2 +- .../aiven/kafka/connect/{source/s3 => s3/source}/Version.java | 2 +- .../{source/s3 => s3/source}/config/S3SourceConfig.java | 2 +- .../{source/s3 => s3/source}/config/S3SourceConfigDef.java | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) rename s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/{source/s3 => s3/source}/ConnectRunner.java (99%) rename s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/{source/s3 => s3/source}/IntegrationBase.java (98%) rename s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/{source/s3 => s3/source}/IntegrationTest.java (98%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/{source/s3 => s3/source}/AivenKafkaConnectS3SourceConnector.java (94%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/{source/s3 => s3/source}/S3SourceTask.java (97%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/{source/s3 => s3/source}/Version.java (97%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/{source/s3 => s3/source}/config/S3SourceConfig.java (96%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/{source/s3 => s3/source}/config/S3SourceConfigDef.java (95%) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java similarity index 99% rename from s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java rename to s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java index 5aab1c99f..593705dd1 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.source.s3; +package io.aiven.kafka.connect.s3.source; import static org.assertj.core.api.Assertions.assertThat; diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java similarity index 98% rename from s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java rename to s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index be21ec8f1..f64bce3ac 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.source.s3; +package io.aiven.kafka.connect.s3.source; import java.io.File; import java.io.IOException; diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java similarity index 98% rename from s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java rename to s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 921f97715..156361e4f 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.source.s3; +package io.aiven.kafka.connect.s3.source; import static org.assertj.core.api.Assertions.assertThat; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java similarity index 94% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java index 16488e130..ddb662cd0 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.source.s3; +package io.aiven.kafka.connect.s3.source; import java.util.Collections; import java.util.List; @@ -24,7 +24,7 @@ import org.apache.kafka.connect.connector.Task; import org.apache.kafka.connect.source.SourceConnector; -import io.aiven.kafka.connect.source.s3.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java similarity index 97% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 704579fba..f158bb2ba 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.source.s3; +package io.aiven.kafka.connect.s3.source; import java.util.Collections; import java.util.List; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/Version.java similarity index 97% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/Version.java index b5e5cdc85..2ee4feb44 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/Version.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.source.s3; +package io.aiven.kafka.connect.s3.source; import java.io.InputStream; import java.util.Properties; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java similarity index 96% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 6dfe3d64b..8f4c1486c 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.source.s3.config; +package io.aiven.kafka.connect.s3.source.config; import java.util.Collections; import java.util.Map; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java similarity index 95% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java index 7e549f903..1847c83bd 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.source.s3.config; +package io.aiven.kafka.connect.s3.source.config; import java.util.List; import java.util.Map; From 0eaa7fb2e75ac49be1a08dab877bdb0ece51213f Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 7 Oct 2024 11:53:47 +0200 Subject: [PATCH 04/90] Fix conflicts --- s3-source-connector/README.md | 9 +- .../connect/source/s3/ConnectRunner.java | 119 ++++++++++++++++++ .../connect/source/s3/IntegrationBase.java | 94 ++++++++++++++ .../connect/source/s3/IntegrationTest.java | 107 ++++++++++++++++ .../AivenKafkaConnectS3SourceConnector.java | 69 ++++++++++ .../kafka/connect/source/s3/S3SourceTask.java | 62 +++++++++ .../kafka/connect/source/s3/Version.java | 43 +++++++ .../source/s3/config/S3SourceConfig.java | 68 ++++++++++ .../source/s3/config/S3SourceConfigDef.java | 30 +++++ ...nector-for-apache-kafka-version.properties | 16 +++ 10 files changed, 613 insertions(+), 4 deletions(-) create mode 100644 s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java create mode 100644 s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java create mode 100644 s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java create mode 100644 s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties diff --git a/s3-source-connector/README.md b/s3-source-connector/README.md index 6f72e485a..2ee9caacc 100644 --- a/s3-source-connector/README.md +++ b/s3-source-connector/README.md @@ -1,12 +1,13 @@ # Aiven's S3 Source Connector for Apache Kafka -This is a source Apache Kafka Connect connector that stores AWS S3 bucket objects in Apache Kafka. - -## This connector is in development phase ## +This is a source Apache Kafka Connect connector that stores Apache Kafka messages in an AWS S3 bucket. **Table of Contents** - [How it works](#how-it-works) +- [Data Format](#data-format) +- [Usage](#usage) +- [Configuration](#configuration) - [Development](#development) @@ -20,7 +21,7 @@ published into the corresponding Kafka topic. The connector requires Java 11 or newer for development and production. -### TODO update documentation +## TODO update documentation ## Development diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java new file mode 100644 index 000000000..5aab1c99f --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java @@ -0,0 +1,119 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.source.s3; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ExecutionException; + +import org.apache.kafka.common.utils.Time; +import org.apache.kafka.connect.runtime.Connect; +import org.apache.kafka.connect.runtime.ConnectorConfig; +import org.apache.kafka.connect.runtime.Herder; +import org.apache.kafka.connect.runtime.Worker; +import org.apache.kafka.connect.runtime.isolation.Plugins; +import org.apache.kafka.connect.runtime.rest.RestServer; +import org.apache.kafka.connect.runtime.rest.entities.ConnectorInfo; +import org.apache.kafka.connect.runtime.standalone.StandaloneConfig; +import org.apache.kafka.connect.runtime.standalone.StandaloneHerder; +import org.apache.kafka.connect.storage.MemoryOffsetBackingStore; +import org.apache.kafka.connect.util.FutureCallback; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +final class ConnectRunner { + private static final Logger LOGGER = LoggerFactory.getLogger(ConnectRunner.class); + + private final File pluginDir; + private final String bootstrapServers; + private final int offsetFlushInterval; + + private Herder herder; + private Connect connect; + + public ConnectRunner(final File pluginDir, final String bootstrapServers, final int offsetFlushIntervalMs) { + this.pluginDir = pluginDir; + this.bootstrapServers = bootstrapServers; + this.offsetFlushInterval = offsetFlushIntervalMs; + } + + void start() { + final Map<String, String> workerProps = new HashMap<>(); + workerProps.put("bootstrap.servers", bootstrapServers); + + workerProps.put("offset.flush.interval.ms", Integer.toString(offsetFlushInterval)); + + // These don't matter much (each connector sets its own converters), but need to be filled with valid classes. + workerProps.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + workerProps.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + workerProps.put("internal.key.converter", "org.apache.kafka.connect.json.JsonConverter"); + workerProps.put("internal.key.converter.schemas.enable", "false"); + workerProps.put("internal.value.converter", "org.apache.kafka.connect.json.JsonConverter"); + workerProps.put("internal.value.converter.schemas.enable", "false"); + + // Don't need it since we'll memory MemoryOffsetBackingStore. 
+ workerProps.put("offset.storage.file.filename", ""); + + workerProps.put("plugin.path", pluginDir.getPath()); + + final Time time = Time.SYSTEM; + final String workerId = "test-worker"; + final String kafkaClusterId = "test-cluster"; + + final Plugins plugins = new Plugins(workerProps); + final StandaloneConfig config = new StandaloneConfig(workerProps); + + final Worker worker = new Worker(workerId, time, plugins, config, new MemoryOffsetBackingStore()); + herder = new StandaloneHerder(worker, kafkaClusterId); + + final RestServer rest = new RestServer(config); + + connect = new Connect(herder, rest); + + connect.start(); + } + + void createConnector(final Map<String, String> config) throws ExecutionException, InterruptedException { + assert herder != null; + + final FutureCallback<Herder.Created<ConnectorInfo>> callback = new FutureCallback<>((error, info) -> { + if (error != null) { + LOGGER.error("Failed to create job"); + } else { + LOGGER.info("Created connector {}", info.result().name()); + } + }); + herder.putConnectorConfig(config.get(ConnectorConfig.NAME_CONFIG), config, false, callback); + + final Herder.Created<ConnectorInfo> connectorInfoCreated = callback.get(); + assert connectorInfoCreated.created(); + assertThat(connectorInfoCreated.result().config().get("connector.class")) + .isEqualTo(AivenKafkaConnectS3SourceConnector.class.getName()); + } + + void stop() { + connect.stop(); + } + + void awaitStop() { + connect.awaitStop(); + } +} diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java new file mode 100644 index 000000000..be21ec8f1 --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java @@ -0,0 +1,94 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.source.s3; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.time.Duration; +import java.util.List; +import java.util.Properties; +import java.util.concurrent.ExecutionException; +import java.util.stream.Collectors; + +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.AdminClientConfig; +import org.apache.kafka.clients.admin.NewTopic; + +import com.github.dockerjava.api.model.Ulimit; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.TestInfo; +import org.testcontainers.containers.Container; +import org.testcontainers.containers.KafkaContainer; +import org.testcontainers.containers.Network; +import org.testcontainers.utility.DockerImageName; + +public interface IntegrationBase { + + String DOCKER_IMAGE_KAFKA = "confluentinc/cp-kafka:7.7.0"; + + default AdminClient newAdminClient(final KafkaContainer kafka) { + final Properties adminClientConfig = new Properties(); + adminClientConfig.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, kafka.getBootstrapServers()); + return AdminClient.create(adminClientConfig); + } + + default ConnectRunner newConnectRunner(final KafkaContainer kafka, final File pluginDir, + final int offsetFlushIntervalMs) { + return new ConnectRunner(pluginDir, kafka.getBootstrapServers(), offsetFlushIntervalMs); + } + + static void extractConnectorPlugin(File pluginDir) throws IOException, InterruptedException { + final File distFile = new File(System.getProperty("integration-test.distribution.file.path")); + assert distFile.exists(); + + final String cmd = String.format("tar -xf %s --strip-components=1 -C %s", distFile, pluginDir.toString()); + final Process process = Runtime.getRuntime().exec(cmd); + assert process.waitFor() == 0; + } + + static File getPluginDir() throws IOException { + final File testDir = Files.createTempDirectory("s3-source-connector-for-apache-kafka-test-").toFile(); + + final File pluginDir = new File(testDir, "plugins/s3-source-connector-for-apache-kafka/"); + assert pluginDir.mkdirs(); + return pluginDir; + } + + static KafkaContainer createKafkaContainer() { + return new KafkaContainer(DockerImageName.parse(DOCKER_IMAGE_KAFKA)) + .withEnv("KAFKA_AUTO_CREATE_TOPICS_ENABLE", "false") + .withNetwork(Network.newNetwork()) + .withExposedPorts(KafkaContainer.KAFKA_PORT, 9092) + .withCreateContainerCmdModifier( + cmd -> cmd.getHostConfig().withUlimits(List.of(new Ulimit("nofile", 30_000L, 30_000L)))); + } + + static String topicName(final TestInfo testInfo) { + return testInfo.getTestMethod().get().getName() + "-" + testInfo.getDisplayName().hashCode(); + } + + static void createTopics(final AdminClient adminClient, final List<String> topicNames) + throws ExecutionException, InterruptedException { + final var newTopics = topicNames.stream().map(s -> new NewTopic(s, 4, (short) 1)).collect(Collectors.toList()); + adminClient.createTopics(newTopics).all().get(); + } + + static void waitForRunningContainer(final Container<?> kafka) { + Awaitility.await().atMost(Duration.ofMinutes(1)).until(kafka::isRunning); + } +} diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java new file mode 100644 index 000000000..921f97715 --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java @@ -0,0 +1,107 @@ +/* + * 
Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.source.s3; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; + +import org.apache.kafka.clients.admin.AdminClient; + +import org.junit.Ignore; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; +import org.testcontainers.containers.KafkaContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +@Ignore +@Testcontainers +final class IntegrationTest implements IntegrationBase { + private static final String CONNECTOR_NAME = "aiven-s3-source-connector"; + private static final int OFFSET_FLUSH_INTERVAL_MS = 5000; + + private static File pluginDir; + + @Container + private static final KafkaContainer KAFKA = IntegrationBase.createKafkaContainer(); + private AdminClient adminClient; + private ConnectRunner connectRunner; + + @BeforeAll + static void setUpAll() throws IOException, InterruptedException { + pluginDir = IntegrationBase.getPluginDir(); + IntegrationBase.extractConnectorPlugin(pluginDir); + IntegrationBase.waitForRunningContainer(KAFKA); + } + + @BeforeEach + void setUp(final TestInfo testInfo) throws ExecutionException, InterruptedException { + adminClient = newAdminClient(KAFKA); + + final var topicName = IntegrationBase.topicName(testInfo); + final var topics = List.of(topicName); + IntegrationBase.createTopics(adminClient, topics); + + connectRunner = newConnectRunner(KAFKA, pluginDir, OFFSET_FLUSH_INTERVAL_MS); + connectRunner.start(); + } + + @AfterEach + void tearDown() { + connectRunner.stop(); + adminClient.close(); + + connectRunner.awaitStop(); + } + + @Test + void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedException { + final var topicName = IntegrationBase.topicName(testInfo); + final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + connectRunner.createConnector(connectorConfig); + + assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); + } + + private Map<String, String> getConfig(final Map<String, String> config, final String topicName) { + return getConfig(config, List.of(topicName)); + } + + private Map<String, String> getConfig(final Map<String, String> config, final List<String> topicNames) { + config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); + config.put("topics", String.join(",", topicNames)); + return config; + } + + private Map<String, String> basicConnectorConfig(final String connectorName) { + final Map<String, String> config = new HashMap<>(); + config.put("name", connectorName); + config.put("key.converter", 
"org.apache.kafka.connect.converters.ByteArrayConverter"); + config.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + config.put("tasks.max", "1"); + return config; + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java new file mode 100644 index 000000000..16488e130 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java @@ -0,0 +1,69 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.source.s3; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.connect.connector.Task; +import org.apache.kafka.connect.source.SourceConnector; + +import io.aiven.kafka.connect.source.s3.config.S3SourceConfig; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * AivenKafkaConnectS3SourceConnector is a Kafka Connect Connector implementation that watches a S3 bucket and generates + * tasks to ingest contents. + */ +public class AivenKafkaConnectS3SourceConnector extends SourceConnector { + + private static final Logger LOGGER = LoggerFactory.getLogger(AivenKafkaConnectS3SourceConnector.class); + + @Override + public ConfigDef config() { + return S3SourceConfig.configDef(); + } + + @Override + public String version() { + return Version.VERSION; + } + + @Override + public Class<? extends Task> taskClass() { + return S3SourceTask.class; + } + + @Override + public List<Map<String, String>> taskConfigs(final int maxTasks) { + return Collections.emptyList(); + } + + @Override + public void start(final Map<String, String> properties) { + LOGGER.info("Start S3 Source connector"); + } + + @Override + public void stop() { + LOGGER.info("Stop S3 Source connector"); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java new file mode 100644 index 000000000..704579fba --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java @@ -0,0 +1,62 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.source.s3; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +import org.apache.kafka.connect.source.SourceRecord; +import org.apache.kafka.connect.source.SourceTask; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * S3SourceTask is a Kafka Connect SourceTask implementation that reads from source-s3 buckets and generates Kafka + * Connect records. + */ +public class S3SourceTask extends SourceTask { + + private static final Logger LOGGER = LoggerFactory.getLogger(AivenKafkaConnectS3SourceConnector.class); + + @SuppressWarnings("PMD.UnnecessaryConstructor") // required by Connect + public S3SourceTask() { + super(); + } + + @Override + public String version() { + return null; + } + + @Override + public void start(final Map<String, String> props) { + LOGGER.info("S3 Source task started."); + Objects.requireNonNull(props, "props hasn't been set"); + } + + @Override + public List<SourceRecord> poll() { + return Collections.emptyList(); + } + + @Override + public void stop() { + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java new file mode 100644 index 000000000..b5e5cdc85 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java @@ -0,0 +1,43 @@ +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.source.s3; + +import java.io.InputStream; +import java.util.Properties; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +final class Version { + private static final Logger LOGGER = LoggerFactory.getLogger(Version.class); + + private static final String PROPERTIES_FILENAME = "s3-source-connector-for-apache-kafka-version.properties"; + + static final String VERSION; // NOPMD AvoidFieldNameMatchingTypeName + + static { + final Properties props = new Properties(); + try (InputStream resourceStream = Thread.currentThread() + .getContextClassLoader() + .getResourceAsStream(PROPERTIES_FILENAME)) { + props.load(resourceStream); + } catch (final Exception e) { // NOPMD AvoidCatchingGenericException + LOGGER.warn("Error while loading {}: {}", PROPERTIES_FILENAME, e.getMessage()); + } + VERSION = props.getProperty("version", "unknown").trim(); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java new file mode 100644 index 000000000..a77caeec4 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java @@ -0,0 +1,68 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.source.s3.config; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.apache.kafka.common.config.ConfigDef; + +import io.aiven.kafka.connect.common.config.AivenCommonConfig; +import io.aiven.kafka.connect.common.config.CompressionType; +import io.aiven.kafka.connect.common.config.OutputField; +import io.aiven.kafka.connect.common.config.OutputFieldEncodingType; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@SuppressWarnings({ "PMD.TooManyMethods", "PMD.GodClass", "PMD.ExcessiveImports" }) +final public class S3SourceConfig extends AivenCommonConfig { + + public static final Logger LOGGER = LoggerFactory.getLogger(S3SourceConfig.class); + + public S3SourceConfig(final Map<String, String> properties) { + super(configDef(), preprocessProperties(properties)); + validate(); + } + + static Map<String, String> preprocessProperties(final Map<String, String> properties) { + return Collections.emptyMap(); + } + + public static ConfigDef configDef() { + return new S3SourceConfigDef(); + } + + private void validate() { + LOGGER.debug("Validating config."); + } + @Override + public CompressionType getCompressionType() { + return CompressionType.GZIP; + } + + @Override + public List<OutputField> getOutputFields() { + return Collections.emptyList(); + } + + @Override + public OutputFieldEncodingType getOutputFieldEncodingType() { + return null; + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java new file mode 100644 index 000000000..12fa37d77 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java @@ -0,0 +1,30 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.source.s3.config; + +import java.util.List; +import java.util.Map; + +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.common.config.ConfigValue; + +public class S3SourceConfigDef extends ConfigDef { + @Override + public List<ConfigValue> validate(final Map<String, String> props) { + return super.validate(S3SourceConfig.preprocessProperties(props)); + } +} diff --git a/s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties b/s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties new file mode 100644 index 000000000..9c2421c8a --- /dev/null +++ b/s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties @@ -0,0 +1,16 @@ +## +# Copyright 2024 Aiven Oy +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +## +version=${version ?: 'unknown'} From a67a559e9a07d7fb473bed44984749b1923fed4d Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Tue, 17 Sep 2024 17:06:09 +0200 Subject: [PATCH 05/90] fix conflicts --- s3-source-connector/build.gradle.kts | 5 + .../AivenKafkaConnectS3SourceConnector.java | 14 ++- .../kafka/connect/s3/source/S3SourceTask.java | 48 ++++++++ .../s3/source/config/AwsAccessSecret.java | 43 +++++++ .../config/AwsCredentialProviderFactory.java | 61 ++++++++++ .../source/config/AwsStsEndpointConfig.java | 43 +++++++ .../connect/s3/source/config/AwsStsRole.java | 62 ++++++++++ .../s3/source/config/S3SourceConfig.java | 114 +++++++++++++++++- .../s3/source/config/S3SourceConfigDef.java | 2 +- 9 files changed, 385 insertions(+), 7 deletions(-) create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsAccessSecret.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsCredentialProviderFactory.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsEndpointConfig.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsRole.java diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts index 57eb9e259..2505715ea 100644 --- a/s3-source-connector/build.gradle.kts +++ b/s3-source-connector/build.gradle.kts @@ -18,6 +18,9 @@ import com.github.spotbugs.snom.SpotBugsTask plugins { id("aiven-apache-kafka-connectors-all.java-conventions") } +val amazonS3Version by extra("1.12.729") +val amazonSTSVersion by extra("1.12.729") + val integrationTest: SourceSet = sourceSets.create("integrationTest") { java { srcDir("src/integration-test/java") } @@ -61,6 +64,8 @@ dependencies { compileOnly(apache.kafka.connect.runtime) implementation(project(":commons")) + implementation("com.amazonaws:aws-java-sdk-s3:$amazonS3Version") + implementation("com.amazonaws:aws-java-sdk-sts:$amazonSTSVersion") implementation(tools.spotbugs.annotations) implementation(logginglibs.slf4j) diff --git 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java index ddb662cd0..308ea39da 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java @@ -16,9 +16,10 @@ package io.aiven.kafka.connect.s3.source; -import java.util.Collections; +import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Objects; import org.apache.kafka.common.config.ConfigDef; import org.apache.kafka.connect.connector.Task; @@ -37,6 +38,8 @@ public class AivenKafkaConnectS3SourceConnector extends SourceConnector { private static final Logger LOGGER = LoggerFactory.getLogger(AivenKafkaConnectS3SourceConnector.class); + private Map<String, String> configProperties; + @Override public ConfigDef config() { return S3SourceConfig.configDef(); @@ -54,11 +57,18 @@ public Class<? extends Task> taskClass() { @Override public List<Map<String, String>> taskConfigs(final int maxTasks) { - return Collections.emptyList(); + final var taskProps = new ArrayList<Map<String, String>>(); + for (int i = 0; i < maxTasks; i++) { + final var props = Map.copyOf(configProperties); + taskProps.add(props); + } + return taskProps; } @Override public void start(final Map<String, String> properties) { + Objects.requireNonNull(properties, "properties haven't been set"); + configProperties = Map.copyOf(properties); LOGGER.info("Start S3 Source connector"); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index f158bb2ba..960110cf1 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -24,6 +24,16 @@ import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.source.SourceTask; +import io.aiven.kafka.connect.s3.source.config.AwsCredentialProviderFactory; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import com.amazonaws.PredefinedClientConfigurations; +import com.amazonaws.client.builder.AwsClientBuilder; +import com.amazonaws.retry.PredefinedBackoffStrategies; +import com.amazonaws.retry.PredefinedRetryPolicies; +import com.amazonaws.retry.RetryPolicy; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.AmazonS3ClientBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,6 +45,12 @@ public class S3SourceTask extends SourceTask { private static final Logger LOGGER = LoggerFactory.getLogger(AivenKafkaConnectS3SourceConnector.class); + private S3SourceConfig config; + + private AmazonS3 s3Client; + + AwsCredentialProviderFactory credentialFactory = new AwsCredentialProviderFactory(); + @SuppressWarnings("PMD.UnnecessaryConstructor") // required by Connect public S3SourceTask() { super(); @@ -49,10 +65,42 @@ public String version() { public void start(final Map<String, String> props) { LOGGER.info("S3 Source task started."); Objects.requireNonNull(props, "props hasn't been set"); + config = new S3SourceConfig(props); + + s3Client = createAmazonS3Client(config); + LOGGER.info("S3 client initialized " + s3Client.getBucketLocation("")); 
+ // prepareReaderFromOffsetStorageReader(); + } + + private AmazonS3 createAmazonS3Client(final S3SourceConfig config) { + final var awsEndpointConfig = newEndpointConfiguration(this.config); + final var clientConfig = PredefinedClientConfigurations.defaultConfig() + .withRetryPolicy(new RetryPolicy(PredefinedRetryPolicies.DEFAULT_RETRY_CONDITION, + new PredefinedBackoffStrategies.FullJitterBackoffStrategy( + Math.toIntExact(config.getS3RetryBackoffDelayMs()), + Math.toIntExact(config.getS3RetryBackoffMaxDelayMs())), + config.getS3RetryBackoffMaxRetries(), false)); + final var s3ClientBuilder = AmazonS3ClientBuilder.standard() + .withCredentials(credentialFactory.getProvider(config)) + .withClientConfiguration(clientConfig); + if (Objects.isNull(awsEndpointConfig)) { + s3ClientBuilder.withRegion(config.getAwsS3Region().getName()); + } else { + s3ClientBuilder.withEndpointConfiguration(awsEndpointConfig).withPathStyleAccessEnabled(true); + } + return s3ClientBuilder.build(); + } + + private AwsClientBuilder.EndpointConfiguration newEndpointConfiguration(final S3SourceConfig config) { + if (Objects.isNull(config.getAwsS3EndPoint())) { + return null; + } + return new AwsClientBuilder.EndpointConfiguration(config.getAwsS3EndPoint(), config.getAwsS3Region().getName()); } @Override public List<SourceRecord> poll() { + LOGGER.info("Using S3 client and poll " + s3Client.getBucketLocation("")); return Collections.emptyList(); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsAccessSecret.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsAccessSecret.java new file mode 100644 index 000000000..2e9d2ac55 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsAccessSecret.java @@ -0,0 +1,43 @@ +/* + * Copyright 2021 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source.config; + +import java.util.Objects; + +import org.apache.kafka.common.config.types.Password; + +final class AwsAccessSecret { + private final Password accessKeyId; + private final Password secretAccessKey; + + public AwsAccessSecret(final Password accessKeyId, final Password secretAccessKey) { + this.accessKeyId = accessKeyId; + this.secretAccessKey = secretAccessKey; + } + + public Password getAccessKeyId() { + return accessKeyId; + } + + public Password getSecretAccessKey() { + return secretAccessKey; + } + + public Boolean isValid() { + return Objects.nonNull(accessKeyId) && Objects.nonNull(secretAccessKey); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsCredentialProviderFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsCredentialProviderFactory.java new file mode 100644 index 000000000..d0fa8f55b --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsCredentialProviderFactory.java @@ -0,0 +1,61 @@ +/* + * Copyright 2021 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.config; + +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.AWSStaticCredentialsProvider; +import com.amazonaws.auth.BasicAWSCredentials; +import com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider; +import com.amazonaws.client.builder.AwsClientBuilder; +import com.amazonaws.services.securitytoken.AWSSecurityTokenService; +import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClientBuilder; + +public class AwsCredentialProviderFactory { + public AWSCredentialsProvider getProvider(final S3SourceConfig config) { + if (config.hasAwsStsRole()) { + return getStsProvider(config); + } + final AwsAccessSecret awsCredentials = config.getAwsCredentials(); + if (!awsCredentials.isValid()) { + return config.getCustomCredentialsProvider(); + } + return new AWSStaticCredentialsProvider(new BasicAWSCredentials(awsCredentials.getAccessKeyId().value(), + awsCredentials.getSecretAccessKey().value())); + } + + private AWSCredentialsProvider getStsProvider(final S3SourceConfig config) { + final AwsStsRole awsstsRole = config.getStsRole(); + final AWSSecurityTokenService sts = securityTokenService(config); + return new STSAssumeRoleSessionCredentialsProvider.Builder(awsstsRole.getArn(), awsstsRole.getSessionName()) + .withStsClient(sts) + .withExternalId(awsstsRole.getExternalId()) + .withRoleSessionDurationSeconds(awsstsRole.getSessionDurationSeconds()) + .build(); + } + + private AWSSecurityTokenService securityTokenService(final S3SourceConfig config) { + if (config.hasStsEndpointConfig()) { + final AwsStsEndpointConfig endpointConfig = config.getStsEndpointConfig(); + final AwsClientBuilder.EndpointConfiguration stsConfig = new AwsClientBuilder.EndpointConfiguration( + endpointConfig.getServiceEndpoint(), endpointConfig.getSigningRegion()); + 
final AWSSecurityTokenServiceClientBuilder stsBuilder = AWSSecurityTokenServiceClientBuilder.standard(); + stsBuilder.setEndpointConfiguration(stsConfig); + return stsBuilder.build(); + } + return AWSSecurityTokenServiceClientBuilder.defaultClient(); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsEndpointConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsEndpointConfig.java new file mode 100644 index 000000000..219db5114 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsEndpointConfig.java @@ -0,0 +1,43 @@ +/* + * Copyright 2021 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.config; + +import java.util.Objects; + +final class AwsStsEndpointConfig { + public static final String AWS_STS_GLOBAL_ENDPOINT = "https://sts.amazonaws.com"; + + private final String serviceEndpoint; + private final String signingRegion; + + public AwsStsEndpointConfig(final String serviceEndpoint, final String signingRegion) { + this.serviceEndpoint = serviceEndpoint; + this.signingRegion = signingRegion; + } + + public String getServiceEndpoint() { + return serviceEndpoint; + } + + public String getSigningRegion() { + return signingRegion; + } + + public Boolean isValid() { + return Objects.nonNull(signingRegion); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsRole.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsRole.java new file mode 100644 index 000000000..aa4adb6da --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsRole.java @@ -0,0 +1,62 @@ +/* + * Copyright 2021 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source.config; + +import java.util.Objects; + +import com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider; + +final class AwsStsRole { + + // AssumeRole request limit details here: + // https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html + public static final int MIN_SESSION_DURATION = STSAssumeRoleSessionCredentialsProvider.DEFAULT_DURATION_SECONDS; + public static final int MAX_SESSION_DURATION = 43_200; + + private final String arn; + private final String externalId; + private final String sessionName; + private final int sessionDurationSeconds; + + public AwsStsRole(final String arn, final String externalId, final String sessionName, + final int sessionDurationSeconds) { + this.arn = arn; + this.externalId = externalId; + this.sessionName = sessionName; + this.sessionDurationSeconds = sessionDurationSeconds; + } + + public String getArn() { + return arn; + } + + public String getExternalId() { + return externalId; + } + + public String getSessionName() { + return sessionName; + } + + public int getSessionDurationSeconds() { + return sessionDurationSeconds; + } + + public Boolean isValid() { + return Objects.nonNull(arn) && Objects.nonNull(sessionName); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 8f4c1486c..c40724890 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -18,22 +18,65 @@ import java.util.Collections; import java.util.Map; +import java.util.Objects; +import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.regions.Region; +import com.amazonaws.regions.RegionUtils; +import com.amazonaws.regions.Regions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @SuppressWarnings({ "PMD.TooManyMethods", "PMD.GodClass", "PMD.ExcessiveImports" }) -final public class S3SourceConfig { +final public class S3SourceConfig extends AbstractConfig { public static final Logger LOGGER = LoggerFactory.getLogger(S3SourceConfig.class); - public S3SourceConfig() { - validate(); + @Deprecated + public static final String AWS_ACCESS_KEY_ID = "aws_access_key_id"; + + public static final String AWS_S3_RETRY_BACKOFF_DELAY_MS_CONFIG = "aws.s3.backoff.delay.ms"; + + public static final String AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_CONFIG = "aws.s3.backoff.max.delay.ms"; + + public static final String AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG = "aws.s3.backoff.max.retries"; + + public static final String AWS_S3_REGION_CONFIG = "aws.s3.region"; + + public static final String AWS_S3_ENDPOINT_CONFIG = "aws.s3.endpoint"; + + @Deprecated + public static final String AWS_S3_ENDPOINT = "aws_s3_endpoint"; + + @Deprecated + public static final String AWS_S3_REGION = "aws_s3_region"; + + public static final String AWS_STS_ROLE_ARN = "aws.sts.role.arn"; + + public static final String AWS_STS_ROLE_EXTERNAL_ID = "aws.sts.role.external.id"; + + public static final String AWS_STS_ROLE_SESSION_NAME = "aws.sts.role.session.name"; + public static final String AWS_STS_ROLE_SESSION_DURATION = "aws.sts.role.session.duration"; + public static final String AWS_STS_CONFIG_ENDPOINT = "aws.sts.config.endpoint"; + + 
@Deprecated + public static final String AWS_SECRET_ACCESS_KEY = "aws_secret_access_key"; + + public static final String AWS_ACCESS_KEY_ID_CONFIG = "aws.access.key.id"; + public static final String AWS_SECRET_ACCESS_KEY_CONFIG = "aws.secret.access.key"; + + public static final String AWS_CREDENTIALS_PROVIDER_CONFIG = "aws.credentials.provider"; + + public S3SourceConfig(final Map<String, String> properties) { + super(configDef(), preprocessProperties(properties)); + validate(); // NOPMD ConstructorCallsOverridableMethod getStsRole is called } - static Map<String, String> preprocessProperties() { + static Map<String, String> preprocessProperties(final Map<String, String> properties) { + LOGGER.info("preprocessProperties " + properties); return Collections.emptyMap(); } @@ -44,4 +87,67 @@ public static ConfigDef configDef() { private void validate() { LOGGER.debug("Validating config."); } + + public long getS3RetryBackoffDelayMs() { + return getLong(AWS_S3_RETRY_BACKOFF_DELAY_MS_CONFIG); + } + + public long getS3RetryBackoffMaxDelayMs() { + return getLong(AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_CONFIG); + } + + public int getS3RetryBackoffMaxRetries() { + return getInt(AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG); + } + + public Region getAwsS3Region() { + // we have priority of properties if old one not set or both old and new one set + // the new property value will be selected + if (Objects.nonNull(getString(AWS_S3_REGION_CONFIG))) { + return RegionUtils.getRegion(getString(AWS_S3_REGION_CONFIG)); + } else if (Objects.nonNull(getString(AWS_S3_REGION))) { + return RegionUtils.getRegion(getString(AWS_S3_REGION)); + } else { + return RegionUtils.getRegion(Regions.US_EAST_1.getName()); + } + } + + public String getAwsS3EndPoint() { + return Objects.nonNull(getString(AWS_S3_ENDPOINT_CONFIG)) + ? getString(AWS_S3_ENDPOINT_CONFIG) + : getString(AWS_S3_ENDPOINT); + } + + public boolean hasAwsStsRole() { + return getStsRole().isValid(); + } + + public AwsStsRole getStsRole() { + return new AwsStsRole(getString(AWS_STS_ROLE_ARN), getString(AWS_STS_ROLE_EXTERNAL_ID), + getString(AWS_STS_ROLE_SESSION_NAME), getInt(AWS_STS_ROLE_SESSION_DURATION)); + } + + public boolean hasStsEndpointConfig() { + return getStsEndpointConfig().isValid(); + } + + public AwsStsEndpointConfig getStsEndpointConfig() { + return new AwsStsEndpointConfig(getString(AWS_STS_CONFIG_ENDPOINT), getString(AWS_S3_REGION_CONFIG)); + } + + public AwsAccessSecret getAwsCredentials() { + return getNewAwsCredentials().isValid() ? 
getNewAwsCredentials() : getOldAwsCredentials(); + } + + public AwsAccessSecret getNewAwsCredentials() { + return new AwsAccessSecret(getPassword(AWS_ACCESS_KEY_ID_CONFIG), getPassword(AWS_SECRET_ACCESS_KEY_CONFIG)); + } + + public AwsAccessSecret getOldAwsCredentials() { + return new AwsAccessSecret(getPassword(AWS_ACCESS_KEY_ID), getPassword(AWS_SECRET_ACCESS_KEY)); + } + + public AWSCredentialsProvider getCustomCredentialsProvider() { + return getConfiguredInstance(AWS_CREDENTIALS_PROVIDER_CONFIG, AWSCredentialsProvider.class); + } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java index 1847c83bd..8153213a2 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java @@ -25,6 +25,6 @@ public class S3SourceConfigDef extends ConfigDef { @Override public List<ConfigValue> validate(final Map<String, String> props) { - return super.validate(S3SourceConfig.preprocessProperties()); + return super.validate(S3SourceConfig.preprocessProperties(props)); } } From 68705694a6de1488f788acaef6bb1c26e540e488 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Thu, 12 Sep 2024 16:42:37 +0200 Subject: [PATCH 06/90] Adding required files --- .../connect/s3/source/IntegrationBase.java | 21 ++ .../connect/s3/source/IntegrationTest.java | 24 +++ .../s3/source/DelimitedRecordReader.java | 152 +++++++++++++++ .../connect/s3/source/S3FilesReader.java | 171 +++++++++++++++++ .../kafka/connect/s3/source/S3Offset.java | 50 +++++ .../kafka/connect/s3/source/S3Partition.java | 101 ++++++++++ .../connect/s3/source/S3SourceRecord.java | 62 ++++++ .../kafka/connect/s3/source/S3SourceTask.java | 150 ++++++++++++++- .../s3/source/config/S3SourceConfig.java | 8 + .../s3/source/testutils/BucketAccessor.java | 179 ++++++++++++++++++ .../s3/source/testutils/IndexesToString.java | 22 +++ .../source/testutils/KeyValueGenerator.java | 62 ++++++ .../s3/source/testutils/KeyValueMessage.java | 33 ++++ 13 files changed, 1026 insertions(+), 9 deletions(-) create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3FilesReader.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Partition.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/IndexesToString.java create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueGenerator.java create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueMessage.java diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index f64bce3ac..c43401457 100644 --- 
a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -29,12 +29,18 @@ import org.apache.kafka.clients.admin.AdminClientConfig; import org.apache.kafka.clients.admin.NewTopic; +import com.amazonaws.auth.AWSStaticCredentialsProvider; +import com.amazonaws.auth.BasicAWSCredentials; +import com.amazonaws.client.builder.AwsClientBuilder; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.AmazonS3ClientBuilder; import com.github.dockerjava.api.model.Ulimit; import org.awaitility.Awaitility; import org.junit.jupiter.api.TestInfo; import org.testcontainers.containers.Container; import org.testcontainers.containers.KafkaContainer; import org.testcontainers.containers.Network; +import org.testcontainers.containers.localstack.LocalStackContainer; import org.testcontainers.utility.DockerImageName; public interface IntegrationBase { @@ -91,4 +97,19 @@ static void createTopics(final AdminClient adminClient, final List<String> topic static void waitForRunningContainer(final Container<?> kafka) { Awaitility.await().atMost(Duration.ofMinutes(1)).until(kafka::isRunning); } + + static AmazonS3 createS3Client(final LocalStackContainer localStackContainer) { + return AmazonS3ClientBuilder.standard() + .withEndpointConfiguration(new AwsClientBuilder.EndpointConfiguration( + localStackContainer.getEndpointOverride(LocalStackContainer.Service.S3).toString(), + localStackContainer.getRegion())) + .withCredentials(new AWSStaticCredentialsProvider(new BasicAWSCredentials( + localStackContainer.getAccessKey(), localStackContainer.getSecretKey()))) + .build(); + } + + static LocalStackContainer createS3Container() { + return new LocalStackContainer(DockerImageName.parse("localstack/localstack:2.0.2")) + .withServices(LocalStackContainer.Service.S3); + } } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 156361e4f..7ae37197a 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -20,6 +20,8 @@ import java.io.File; import java.io.IOException; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -27,6 +29,9 @@ import org.apache.kafka.clients.admin.AdminClient; +import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; + +import com.amazonaws.services.s3.AmazonS3; import org.junit.Ignore; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -34,6 +39,7 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; import org.testcontainers.containers.KafkaContainer; +import org.testcontainers.containers.localstack.LocalStackContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; @@ -41,10 +47,20 @@ @Testcontainers final class IntegrationTest implements IntegrationBase { private static final String CONNECTOR_NAME = "aiven-s3-source-connector"; + private static final String COMMON_PREFIX = "s3-source-connector-for-apache-kafka-test-"; private static final int OFFSET_FLUSH_INTERVAL_MS = 5000; + private 
static final String TEST_BUCKET_NAME = "test-bucket0"; + + private static String s3Endpoint; + private static String s3Prefix; + private static BucketAccessor testBucketAccessor; + private static File pluginDir; + @Container + public static final LocalStackContainer LOCALSTACK = IntegrationBase.createS3Container(); + @Container private static final KafkaContainer KAFKA = IntegrationBase.createKafkaContainer(); private AdminClient adminClient; @@ -52,6 +68,12 @@ final class IntegrationTest implements IntegrationBase { @BeforeAll static void setUpAll() throws IOException, InterruptedException { + s3Prefix = COMMON_PREFIX + ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + "/"; + + final AmazonS3 s3Client = IntegrationBase.createS3Client(LOCALSTACK); + s3Endpoint = LOCALSTACK.getEndpoint().toString(); + testBucketAccessor = new BucketAccessor(s3Client, TEST_BUCKET_NAME); + pluginDir = IntegrationBase.getPluginDir(); IntegrationBase.extractConnectorPlugin(pluginDir); IntegrationBase.waitForRunningContainer(KAFKA); @@ -59,6 +81,7 @@ static void setUpAll() throws IOException, InterruptedException { @BeforeEach void setUp(final TestInfo testInfo) throws ExecutionException, InterruptedException { + testBucketAccessor.createBucket(); adminClient = newAdminClient(KAFKA); final var topicName = IntegrationBase.topicName(testInfo); @@ -71,6 +94,7 @@ void setUp(final TestInfo testInfo) throws ExecutionException, InterruptedExcept @AfterEach void tearDown() { + testBucketAccessor.removeBucket(); connectRunner.stop(); adminClient.close(); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java new file mode 100644 index 000000000..05a7389bb --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java @@ -0,0 +1,152 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source; + +import static java.util.Optional.ofNullable; + +import java.io.BufferedInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Optional; + +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.connect.errors.DataException; + +/** + * Reads records that are followed by byte delimiters. 
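+ * <p>
+ * For example, with a key delimiter of {@code \t} and the default value delimiter {@code \n}, an object
+ * containing {@code key1\tvalue1\nkey2\tvalue2\n} is read as two records, (key1, value1) and (key2, value2);
+ * the delimiters themselves are not included in the returned keys and values.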
+ */ +public class DelimitedRecordReader { + private final byte[] valueDelimiter; + + private final Optional<byte[]> keyDelimiter; + + public DelimitedRecordReader(final byte[] valueDelimiter, final Optional<byte[]> keyDelimiter) { + this.valueDelimiter = Arrays.copyOf(valueDelimiter, valueDelimiter.length); + this.keyDelimiter = keyDelimiter.map(delimiter -> Arrays.copyOf(delimiter, delimiter.length)); + } + + public ConsumerRecord<byte[], byte[]> read(final String topic, final int partition, final long offset, + final BufferedInputStream data) throws IOException { + Optional<byte[]> key = Optional.empty(); + if (keyDelimiter.isPresent()) { + key = ofNullable(readTo(data, keyDelimiter.get())); + if (!key.isPresent()) { + return null; + } + } + final byte[] value = readTo(data, valueDelimiter); + if (value == null) { + if (key.isPresent()) { + throw new IllegalStateException("missing value for key!" + new String(key.get())); + } + return null; + } + return new ConsumerRecord<>(topic, partition, offset, key.orElse(null), value); + } + + // read up to and including the given multi-byte delimeter + private byte[] readTo(final BufferedInputStream data, final byte[] del) throws IOException { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final int lastByte = del[del.length - 1] & 0xff; + int readCount; + while (true) { + readCount = data.read(); + if (readCount == -1) { + break; + } + baos.write(readCount); + if (readCount == lastByte && baos.size() >= del.length) { + final byte[] bytes = baos.toByteArray(); + if (endsWith(bytes, del)) { + final byte[] undelimited = new byte[bytes.length - del.length]; + System.arraycopy(bytes, 0, undelimited, 0, undelimited.length); + return undelimited; + } + } + } + // if we got here, we got EOF before we got the delimiter + return (baos.size() == 0) ? null : baos.toByteArray(); + } + + private boolean endsWith(final byte[] bytes, final byte[] suffix) { + for (int i = 0; i < suffix.length; i++) { + if (bytes[bytes.length - suffix.length + i] != suffix[i]) { + return false; + } + } + return true; + } + + private static byte[] delimiterBytes(final String value, final String encoding) { + return ofNullable(value).orElse("\n") + .getBytes(ofNullable(encoding).map(Charset::forName).orElse(StandardCharsets.UTF_8)); + } + + public static DelimitedRecordReader from(final Map<String, String> taskConfig) { + return new DelimitedRecordReader( + delimiterBytes(taskConfig.get("value.converter.delimiter"), taskConfig.get("value.converter.encoding")), + taskConfig.containsKey("key.converter") + ? 
Optional.of(delimiterBytes(taskConfig.get("key.converter.delimiter"), + taskConfig.get("key.converter.encoding"))) + : Optional.empty()); + } + + Iterator<ConsumerRecord<byte[], byte[]>> readAll(final String topic, final int partition, + final InputStream inputStream, final long startOffset) { + return new Iterator<ConsumerRecord<byte[], byte[]>>() { + ConsumerRecord<byte[], byte[]> nextConsumerRecord; + + final BufferedInputStream buffered = new BufferedInputStream(inputStream); + + long offset = startOffset; + + @Override + public boolean hasNext() { + try { + if (nextConsumerRecord == null) { + nextConsumerRecord = read(topic, partition, offset++, buffered); + } + } catch (IOException e) { + throw new DataException(e); + } + return nextConsumerRecord != null; + } + + @Override + public ConsumerRecord<byte[], byte[]> next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + final ConsumerRecord<byte[], byte[]> record = this.nextConsumerRecord; + nextConsumerRecord = null; + return record; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3FilesReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3FilesReader.java new file mode 100644 index 000000000..9685c951c --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3FilesReader.java @@ -0,0 +1,171 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.START_MARKER_KEY; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.kafka.clients.consumer.ConsumerRecord; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import com.amazonaws.AmazonClientException; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.ListObjectsRequest; +import com.amazonaws.services.s3.model.ObjectListing; +import com.amazonaws.services.s3.model.S3Object; +import com.amazonaws.services.s3.model.S3ObjectSummary; + +/** + * Helpers for reading records out of S3. Not thread safe. Records should be in order since S3 lists files in + * lexicographic order. It is strongly recommended that you use a unique key prefix per topic as there is no option to + * restrict this reader by topic. + * <p> + * NOTE: hasNext() on the returned iterators may throw AmazonClientException if there was a problem communicating with + * S3 or reading an object. 
Your code should catch AmazonClientException and implement back-off and retry as desired. + * <p> + * Any other exception should be considered a permanent failure. + */ +public class S3FilesReader implements Iterable<S3SourceRecord> { + + public static final Pattern DEFAULT_PATTERN = Pattern.compile("(\\/|^)" // match the / or the start of the key so we + // shouldn't have to worry about prefix + + "(?<topic>[^/]+?)-" // assuming no / in topic names + + "(?<partition>\\d{5})-" + "(?<offset>\\d{12})\\.gz$"); + + private final AmazonS3 s3Client; + + private final DelimitedRecordReader makeReader; + + private final Map<S3Partition, S3Offset> offsets; + + private final S3SourceConfig s3SourceConfig; + private final String bucketName; + private final String s3Prefix; + + public S3FilesReader(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, + final String s3Prefix, final Map<S3Partition, S3Offset> offsets, final DelimitedRecordReader recordReader) { + this.s3SourceConfig = s3SourceConfig; + this.offsets = Optional.ofNullable(offsets).orElseGet(HashMap::new); + this.s3Client = s3Client; + this.makeReader = recordReader; + this.bucketName = bucketName; + this.s3Prefix = s3Prefix; + } + + @Override + public Iterator<S3SourceRecord> iterator() { + return readAll(); + } + + public Iterator<S3SourceRecord> readAll() { + return new Iterator<>() { + String currentKey; + Iterator<S3ObjectSummary> nextFile; + Iterator<ConsumerRecord<byte[], byte[]>> iterator = Collections.emptyIterator(); + + // Initialize once by listing all objects matching the criteria + { + // Fetch all objects in one go + final ObjectListing objectListing = s3Client.listObjects(new ListObjectsRequest(bucketName, s3Prefix, + s3SourceConfig.getString(START_MARKER_KEY), null, s3SourceConfig.getInt(FETCH_PAGE_SIZE) * 2)); + + // Filter and collect the relevant object summaries + final List<S3ObjectSummary> chunks = new ArrayList<>(objectListing.getObjectSummaries()); + + // Set up the iterator for files + nextFile = chunks.iterator(); + } + + private void nextObject() { + if (!nextFile.hasNext()) { + iterator = Collections.emptyIterator(); + return; + } + + try { + final S3ObjectSummary file = nextFile.next(); + currentKey = file.getKey(); + try (InputStream content = getContent(s3Client.getObject(bucketName, currentKey))) { + iterator = parseKey(currentKey, (topic, partition, startOffset) -> makeReader.readAll(topic, + partition, content, startOffset)); + } + } catch (IOException e) { + throw new AmazonClientException(e); + } + } + + private InputStream getContent(final S3Object object) throws IOException { + return object.getObjectContent(); + } + + @Override + public boolean hasNext() { + while (!iterator.hasNext() && nextFile.hasNext()) { + nextObject(); + } + return iterator.hasNext(); + } + + @Override + public S3SourceRecord next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + final ConsumerRecord<byte[], byte[]> record = iterator.next(); + return new S3SourceRecord(S3Partition.from(bucketName, s3Prefix, record.topic(), record.partition()), + S3Offset.from(currentKey, record.offset()), record.topic(), record.partition(), record.key(), + record.value()); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + private <T> T parseKey(final String key, final KeyConsumer<T> consumer) throws IOException { + final Matcher matcher = DEFAULT_PATTERN.matcher(key); + if (!matcher.find()) { + throw new IllegalArgumentException("Not a 
valid chunk filename! " + key); + } + final String topic = matcher.group("topic"); + final int partition = Integer.parseInt(matcher.group("partition")); + final long startOffset = Long.parseLong(matcher.group("offset")); + + return consumer.consume(topic, partition, startOffset); + } + + private interface KeyConsumer<T> { + T consume(String topic, int partition, long startOffset) throws IOException; + } + +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java new file mode 100644 index 000000000..4b44e1ab5 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java @@ -0,0 +1,50 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source; + +import java.util.Map; + +public class S3Offset implements Comparable<S3Offset> { + + private final String s3key; + + private final long offset; + + public S3Offset(final String s3key, final long offset) { + this.s3key = s3key; + this.offset = offset; + } + + public static S3Offset from(final String s3key, final long offset) { + return new S3Offset(s3key, offset); + } + + public static S3Offset from(final Map<String, Object> map) { + return from((String) map.get("s3key"), (Long) map.get("originalOffset")); + } + + @Override + public String toString() { + return s3key + "@" + offset; + } + + @Override + public int compareTo(final S3Offset s3Offset) { + final int compareTo = s3key.compareTo(s3Offset.s3key); + return compareTo == 0 ? (int) (offset - s3Offset.offset) : compareTo; + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Partition.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Partition.java new file mode 100644 index 000000000..322344883 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Partition.java @@ -0,0 +1,101 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source; + +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; + +public class S3Partition { + + private final String bucket; + private final String keyPrefix; + private final String topic; + private final int partition; + + public S3Partition(final String bucket, final String keyPrefix, final String topic, final int partition) { + this.bucket = bucket; + this.keyPrefix = normalizePrefix(keyPrefix); + this.topic = topic; + this.partition = partition; + } + + public static S3Partition from(final String bucket, final String keyPrefix, final String topic, + final int partition) { + return new S3Partition(bucket, keyPrefix, topic, partition); + } + + public static S3Partition from(final Map<String, Object> map) { + final String bucket = (String) map.get("bucket"); + final String keyPrefix = (String) map.get("keyPrefix"); + final String topic = (String) map.get("topic"); + final int partition = ((Number) map.get("kafkaPartition")).intValue(); + return from(bucket, keyPrefix, topic, partition); + } + + public static String normalizePrefix(final String keyPrefix) { + return keyPrefix == null ? "" : keyPrefix.endsWith("/") ? keyPrefix : keyPrefix + "/"; + } + + public Map<String, Object> asMap() { + final Map<String, Object> map = new HashMap<>(); + map.put("bucket", bucket); + map.put("keyPrefix", keyPrefix); + map.put("topic", topic); + map.put("kafkaPartition", partition); + return map; + } + + public String getBucket() { + return bucket; + } + + public String getKeyPrefix() { + return keyPrefix; + } + + public String getTopic() { + return topic; + } + + public int getPartition() { + return partition; + } + + @Override + public boolean equals(final Object s3Partition) { + if (this == s3Partition) { + return true; + } + if (s3Partition == null || getClass() != s3Partition.getClass()) { + return false; + } + final S3Partition thatS3Partition = (S3Partition) s3Partition; + return partition == thatS3Partition.partition && Objects.equals(bucket, thatS3Partition.bucket) + && Objects.equals(keyPrefix, thatS3Partition.keyPrefix) && Objects.equals(topic, thatS3Partition.topic); + } + + @Override + public int hashCode() { + return Objects.hash(bucket, keyPrefix, topic, partition); + } + + @Override + public String toString() { + return bucket + "/" + keyPrefix + "/" + topic + "-" + partition; + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java new file mode 100644 index 000000000..cff16264e --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java @@ -0,0 +1,62 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source; + +import java.util.Arrays; + +public class S3SourceRecord { + private final S3Partition s3Partition; + private final S3Offset s3Offset; + private final String toTopic; + private final int topicPartition; + private final byte[] recordKey; + private final byte[] recordValue; + + public S3SourceRecord(final S3Partition s3Partition, final S3Offset s3Offset, final String toTopic, + final int topicPartition, final byte[] recordKey, final byte[] recordValue) { + this.s3Partition = s3Partition; + this.s3Offset = s3Offset; + this.toTopic = toTopic; + this.topicPartition = topicPartition; + this.recordKey = Arrays.copyOf(recordKey, recordKey.length); + this.recordValue = Arrays.copyOf(recordValue, recordValue.length); + } + + public S3Partition file() { + return s3Partition; + } + + public S3Offset offset() { + return s3Offset; + } + + public String getToTopic() { + return toTopic; + } + + public int partition() { + return topicPartition; + } + + public byte[] key() { + return recordKey.clone(); + } + + public byte[] value() { + return recordValue.clone(); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 960110cf1..32bc49bdd 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -16,10 +16,24 @@ package io.aiven.kafka.connect.s3.source; -import java.util.Collections; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TOPICS_KEY; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TOPIC_PARTITIONS_KEY; +import static java.util.stream.Collectors.toList; +import static java.util.stream.Collectors.toMap; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.stream.Collectors; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.source.SourceTask; @@ -34,6 +48,7 @@ import com.amazonaws.retry.RetryPolicy; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3ClientBuilder; +import com.amazonaws.services.s3.model.AmazonS3Exception; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -45,10 +60,20 @@ public class S3SourceTask extends SourceTask { private static final Logger LOGGER = LoggerFactory.getLogger(AivenKafkaConnectS3SourceConnector.class); - private S3SourceConfig config; + private S3SourceConfig s3SourceConfig; + + private Map<S3Partition, S3Offset> offsets; private AmazonS3 s3Client; + Iterator<S3SourceRecord> sourceRecordIterator; + + private final AtomicBoolean stopped = new AtomicBoolean(); + + private final static long S_3_POLL_INTERVAL = 10_000L; + + private final static long ERROR_BACKOFF = 1000L; + AwsCredentialProviderFactory credentialFactory = new AwsCredentialProviderFactory(); @SuppressWarnings("PMD.UnnecessaryConstructor") // required by Connect @@ -65,15 +90,79 @@ public String version() { public void start(final Map<String, String> props) { LOGGER.info("S3 Source 
task started."); Objects.requireNonNull(props, "props hasn't been set"); - config = new S3SourceConfig(props); + s3SourceConfig = new S3SourceConfig(props); - s3Client = createAmazonS3Client(config); + s3Client = createAmazonS3Client(s3SourceConfig); LOGGER.info("S3 client initialized " + s3Client.getBucketLocation("")); - // prepareReaderFromOffsetStorageReader(); + prepareReaderFromOffsetStorageReader(); + } + + private void prepareReaderFromOffsetStorageReader() { + final String s3Prefix = s3SourceConfig.getString("aws.s3.prefix"); + final String s3Bucket = s3SourceConfig.getString("aws.s3.bucket"); + + final Set<Integer> partitionList = getPartitions(); + final Set<String> topics = getTopics(); + + // map to s3 partitions + final List<S3Partition> s3Partitions = partitionList.stream() + .flatMap(p -> topics.stream().map(t -> S3Partition.from(s3Bucket, s3Prefix, t, p))) + .collect(toList()); + + // get partition offsets + final Map<Map<String, Object>, Map<String, Object>> offsetMap = context.offsetStorageReader() + .offsets(s3Partitions.stream().map(S3Partition::asMap).collect(toList())); + + if (offsets == null) { + offsets = offsetMap.entrySet() + .stream() + .filter(e -> e.getValue() != null) + .collect( + toMap(entry -> S3Partition.from(entry.getKey()), entry -> S3Offset.from(entry.getValue()))); + } + + LOGGER.info("{} reading from S3 with offsets {}", s3SourceConfig.getString("name"), offsets); + + final byte[] valueDelimiter = Optional.ofNullable(s3SourceConfig.getString("value.delimiter")) + .map(Object::toString) + .orElse("\n") + .getBytes(parseEncoding(s3SourceConfig, "value.encoding")); + + final Optional<byte[]> keyDelimiter = Optional.ofNullable(s3SourceConfig.getString("key.delimiter")) + .map(Object::toString) + .map(s -> s.getBytes(parseEncoding(s3SourceConfig, "key.encoding"))); + + sourceRecordIterator = new S3FilesReader(s3SourceConfig, s3Client, s3Bucket, s3Prefix, offsets, + new DelimitedRecordReader(valueDelimiter, keyDelimiter)).readAll(); + } + + private Set<Integer> getPartitions() { + final String partitionString = s3SourceConfig.getString(TOPIC_PARTITIONS_KEY); + if (Objects.nonNull(partitionString)) { + return Arrays.stream(partitionString.split(",")).map(Integer::parseInt).collect(Collectors.toSet()); + } else { + throw new IllegalStateException("Partition list is not configured."); + } + } + + private Set<String> getTopics() { + final String topicString = s3SourceConfig.getString(TOPICS_KEY); + if (Objects.nonNull(topicString)) { + return Arrays.stream(topicString.split(",")).collect(Collectors.toSet()); + } else { + throw new IllegalStateException("Topics list is not configured."); + } + } + + private Charset parseEncoding(final S3SourceConfig s3SourceConfig, final String key) { + return Optional.ofNullable(s3SourceConfig.getString(key)) + .map(Object::toString) + .map(Charset::forName) + .orElse(StandardCharsets.UTF_8); } private AmazonS3 createAmazonS3Client(final S3SourceConfig config) { - final var awsEndpointConfig = newEndpointConfiguration(this.config); + final var awsEndpointConfig = newEndpointConfiguration(this.s3SourceConfig); final var clientConfig = PredefinedClientConfigurations.defaultConfig() .withRetryPolicy(new RetryPolicy(PredefinedRetryPolicies.DEFAULT_RETRY_CONDITION, new PredefinedBackoffStrategies.FullJitterBackoffStrategy( @@ -99,9 +188,52 @@ private AwsClientBuilder.EndpointConfiguration newEndpointConfiguration(final S3 } @Override - public List<SourceRecord> poll() { - LOGGER.info("Using S3 client and poll " + 
s3Client.getBucketLocation("")); - return Collections.emptyList(); + public List<SourceRecord> poll() throws InterruptedException { + // read up to the configured poll size + final List<SourceRecord> results = new ArrayList<>(s3SourceConfig.getInt(MAX_POLL_RECORDS)); + + if (stopped.get()) { + return results; + } + + // AWS errors will happen. Nothing to do about it but sleep and try again. + while (!stopped.get()) { + try { + return getSourceRecords(results); + } catch (AmazonS3Exception e) { + if (e.isRetryable()) { + LOGGER.warn("Retryable error while polling. Will sleep and try again.", e); + Thread.sleep(ERROR_BACKOFF); + prepareReaderFromOffsetStorageReader(); + } else { + // die + throw e; + } + } + } + return results; + } + + private List<SourceRecord> getSourceRecords(final List<SourceRecord> results) throws InterruptedException { + while (!sourceRecordIterator.hasNext() && !stopped.get()) { + LOGGER.debug("Blocking until new S3 files are available."); + // sleep and block here until new files are available + Thread.sleep(S_3_POLL_INTERVAL); + prepareReaderFromOffsetStorageReader(); + } + + if (stopped.get()) { + return results; + } + + for (int i = 0; sourceRecordIterator.hasNext() && i < s3SourceConfig.getInt(MAX_POLL_RECORDS) + && !stopped.get(); i++) { + final S3SourceRecord record = sourceRecordIterator.next(); + LOGGER.info(record.offset() + record.getToTopic() + record.partition()); + } + + LOGGER.debug("{} returning {} records.", s3SourceConfig.getString("name"), results.size()); + return results; } @Override diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index c40724890..8bc4c3b5d 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -70,6 +70,14 @@ final public class S3SourceConfig extends AbstractConfig { public static final String AWS_CREDENTIALS_PROVIDER_CONFIG = "aws.credentials.provider"; + public static final String TOPIC_PARTITIONS_KEY = "topic.assigned.partitions"; + public static final String TOPICS_KEY = "topics"; + + public static final String START_MARKER_KEY = "aws.s3.start.marker"; + public static final String FETCH_PAGE_SIZE = "aws.s3.fetch.page.size"; + + public static final String MAX_POLL_RECORDS = "max.poll.records"; + public S3SourceConfig(final Map<String, String> properties) { super(configDef(), preprocessProperties(properties)); validate(); // NOPMD ConstructorCallsOverridableMethod getStsRole is called diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java new file mode 100644 index 000000000..65b914822 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java @@ -0,0 +1,179 @@ +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.testutils; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Base64; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.zip.GZIPInputStream; + +import io.aiven.kafka.connect.common.config.CompressionType; + +import com.amazonaws.AmazonClientException; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.DeleteObjectsRequest; +import com.amazonaws.services.s3.model.MultiObjectDeleteException; +import com.amazonaws.services.s3.model.S3ObjectSummary; +import com.github.luben.zstd.ZstdInputStream; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xerial.snappy.SnappyInputStream; + +public class BucketAccessor { + + private final String bucketName; + private final AmazonS3 s3Client; + + private static final Logger LOGGER = LoggerFactory.getLogger(BucketAccessor.class); + + @SuppressFBWarnings(value = "EI_EXPOSE_REP2", justification = "stores mutable s3Client object") + public BucketAccessor(final AmazonS3 s3Client, final String bucketName) { + this.bucketName = bucketName; + this.s3Client = s3Client; + } + + public final void createBucket() { + s3Client.createBucket(bucketName); + } + + public final void removeBucket() { + final var chunk = s3Client.listObjects(bucketName) + .getObjectSummaries() + .stream() + .map(S3ObjectSummary::getKey) + .toArray(String[]::new); + + final var deleteObjectsRequest = new DeleteObjectsRequest(bucketName).withKeys(chunk); + try { + s3Client.deleteObjects(deleteObjectsRequest); + } catch (final MultiObjectDeleteException e) { + for (final var err : e.getErrors()) { + LOGGER.warn(String.format("Couldn't delete object: %s. Reason: [%s] %s", err.getKey(), err.getCode(), + err.getMessage())); + } + } catch (final AmazonClientException e) { + LOGGER.error( + "Couldn't delete objects: " + Arrays.stream(chunk).reduce(" ", String::concat) + e.getMessage()); + } + s3Client.deleteBucket(bucketName); + } + + public final Boolean doesObjectExist(final String objectName) { + return s3Client.doesObjectExist(bucketName, objectName); + } + + public final List<List<String>> readAndDecodeLines(final String blobName, final String compression, + final int... 
fieldsToDecode) throws IOException { + Objects.requireNonNull(blobName, "blobName cannot be null"); + Objects.requireNonNull(fieldsToDecode, "fieldsToDecode cannot be null"); + + return readAndDecodeLines0(blobName, compression, fieldsToDecode); + } + + private List<List<String>> readAndDecodeLines0(final String blobName, final String compression, + final int[] fieldsToDecode) throws IOException { + return readLines(blobName, compression).stream() + .map(l -> l.split(",")) + .map(fields -> decodeRequiredFields(fields, fieldsToDecode)) + .collect(Collectors.toList()); + } + + public final byte[] readBytes(final String blobName, final String compression) throws IOException { + Objects.requireNonNull(blobName, "blobName cannot be null"); + final byte[] blobBytes = s3Client.getObject(bucketName, blobName).getObjectContent().readAllBytes(); + try (ByteArrayInputStream bais = new ByteArrayInputStream(blobBytes); + InputStream decompressedStream = getDecompressedStream(bais, compression); + ByteArrayOutputStream decompressedBytes = new ByteArrayOutputStream()) { + final byte[] readBuffer = new byte[1024]; + int bytesRead; + while ((bytesRead = decompressedStream.read(readBuffer)) != -1) { // NOPMD AssignmentInOperand + decompressedBytes.write(readBuffer, 0, bytesRead); + } + return decompressedBytes.toByteArray(); + } catch (final IOException e) { + throw new RuntimeException(e); // NOPMD AvoidThrowingRawExceptionTypes + } + } + + public final byte[] readBytes(final String blobName) throws IOException { + return readBytes(blobName, "none"); + } + + public final List<String> readLines(final String blobName, final String compression) throws IOException { + final byte[] blobBytes = readBytes(blobName, compression); + try (ByteArrayInputStream bais = new ByteArrayInputStream(blobBytes); + InputStreamReader reader = new InputStreamReader(bais, StandardCharsets.UTF_8); + BufferedReader bufferedReader = new BufferedReader(reader)) { + return bufferedReader.lines().collect(Collectors.toList()); + } catch (final IOException e) { + throw new RuntimeException(e); // NOPMD AvoidThrowingRawExceptionTypes + } + } + + public final List<String> listObjects() { + return s3Client.listObjects(bucketName) + .getObjectSummaries() + .stream() + .map(S3ObjectSummary::getKey) + .collect(Collectors.toList()); + } + + private InputStream getDecompressedStream(final InputStream inputStream, final String compression) + throws IOException { + Objects.requireNonNull(inputStream, "inputStream cannot be null"); + Objects.requireNonNull(compression, "compression cannot be null"); + + final CompressionType compressionType = CompressionType.forName(compression); + switch (compressionType) { + case ZSTD : + return new ZstdInputStream(inputStream); + case GZIP : + return new GZIPInputStream(inputStream); + case SNAPPY : + return new SnappyInputStream(inputStream); + default : + return inputStream; + } + } + + private List<String> decodeRequiredFields(final String[] originalFields, final int[] fieldsToDecode) { + Objects.requireNonNull(originalFields, "originalFields cannot be null"); + Objects.requireNonNull(fieldsToDecode, "fieldsToDecode cannot be null"); + + final List<String> result = Arrays.asList(originalFields); + for (final int fieldIdx : fieldsToDecode) { + result.set(fieldIdx, b64Decode(result.get(fieldIdx))); + } + return result; + } + + private String b64Decode(final String value) { + Objects.requireNonNull(value, "value cannot be null"); + + return new String(Base64.getDecoder().decode(value), StandardCharsets.UTF_8); + 
} +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/IndexesToString.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/IndexesToString.java new file mode 100644 index 000000000..d54faa941 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/IndexesToString.java @@ -0,0 +1,22 @@ +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.testutils; + +@FunctionalInterface +public interface IndexesToString { + String generate(int partition, int epoch, int currIdx); +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueGenerator.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueGenerator.java new file mode 100644 index 000000000..b02103cb8 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueGenerator.java @@ -0,0 +1,62 @@ +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source.testutils; + +import java.util.Iterator; + +public class KeyValueGenerator implements Iterable<KeyValueMessage> { + + public final int numPartitions; + public final int numEpochs; + public final IndexesToString keyGenerator; + public final IndexesToString valueGenerator; + + public KeyValueGenerator(final int numPartitions, final int numEpochs, final IndexesToString keyGenerator, + final IndexesToString valueGenerator) { + this.numPartitions = numPartitions; + this.numEpochs = numEpochs; + this.keyGenerator = keyGenerator; + this.valueGenerator = valueGenerator; + } + + @Override + public Iterator<KeyValueMessage> iterator() { + return new Iterator<>() { + int partition; + int epoch; + int currIdx; + + @Override + public boolean hasNext() { + return epoch < numEpochs; + } + + @Override + public KeyValueMessage next() { + final KeyValueMessage msg = new KeyValueMessage(keyGenerator.generate(partition, epoch, currIdx), + valueGenerator.generate(partition, epoch, currIdx), partition, currIdx, epoch); + currIdx += 1; + partition += 1; + if (partition >= numPartitions) { + epoch += 1; + partition = 0; + } + return msg; + } + }; + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueMessage.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueMessage.java new file mode 100644 index 000000000..fed5372c8 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueMessage.java @@ -0,0 +1,33 @@ +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source.testutils; + +public class KeyValueMessage { + public final String key; + public final String value; + public final int partition; + public final int idx; + public final int epoch; + + public KeyValueMessage(final String key, final String value, final int partition, final int idx, final int epoch) { + this.key = key; + this.value = value; + this.partition = partition; + this.idx = idx; + this.epoch = epoch; + } +} From 814201414ac12eccfcb1cd4b865dc91c75c4ef90 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Fri, 13 Sep 2024 11:05:12 +0200 Subject: [PATCH 07/90] Adding s3 iterator --- .../s3/source/DelimitedRecordReader.java | 68 ++++--- .../connect/s3/source/S3FilesReader.java | 171 ------------------ .../kafka/connect/s3/source/S3Offset.java | 20 ++ .../s3/source/S3SourceRecordIterator.java | 162 +++++++++++++++++ .../kafka/connect/s3/source/S3SourceTask.java | 45 +---- .../s3/source/config/AwsAccessSecret.java | 2 +- .../s3/source/config/S3ClientFactory.java | 58 ++++++ 7 files changed, 287 insertions(+), 239 deletions(-) delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3FilesReader.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java index 05a7389bb..efbeee68c 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java @@ -19,7 +19,6 @@ import static java.util.Optional.ofNullable; import java.io.BufferedInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; @@ -58,7 +57,8 @@ public ConsumerRecord<byte[], byte[]> read(final String topic, final int partiti final byte[] value = readTo(data, valueDelimiter); if (value == null) { if (key.isPresent()) { - throw new IllegalStateException("missing value for key!" + new String(key.get())); + throw new IllegalStateException( + "missing value for key!" + new String(key.get(), StandardCharsets.UTF_8)); } return null; } @@ -67,35 +67,53 @@ public ConsumerRecord<byte[], byte[]> read(final String topic, final int partiti // read up to and including the given multi-byte delimeter private byte[] readTo(final BufferedInputStream data, final byte[] del) throws IOException { - final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final int lastByte = del[del.length - 1] & 0xff; + byte[] buffer = new byte[1024]; // Buffer for reading data, adjust size as needed + int bufferIndex = 0; // Tracks the current position in the buffer int readCount; + while (true) { readCount = data.read(); if (readCount == -1) { - break; + // Return null if no bytes were read and EOF is reached + return (bufferIndex == 0) ? 
null : Arrays.copyOf(buffer, bufferIndex); } - baos.write(readCount); - if (readCount == lastByte && baos.size() >= del.length) { - final byte[] bytes = baos.toByteArray(); - if (endsWith(bytes, del)) { - final byte[] undelimited = new byte[bytes.length - del.length]; - System.arraycopy(bytes, 0, undelimited, 0, undelimited.length); - return undelimited; - } + + // Write the byte to the buffer + if (bufferIndex >= buffer.length) { + // Resize buffer if needed, avoiding frequent resizing + buffer = Arrays.copyOf(buffer, buffer.length * 2); + } + buffer[bufferIndex++] = (byte) readCount; + + // Check for delimiter match + final Optional<byte[]> optionalBytes = getBytes(del, lastByte, buffer, bufferIndex, readCount); + if (optionalBytes.isPresent()) { + return optionalBytes.get(); } } - // if we got here, we got EOF before we got the delimiter - return (baos.size() == 0) ? null : baos.toByteArray(); } - private boolean endsWith(final byte[] bytes, final byte[] suffix) { - for (int i = 0; i < suffix.length; i++) { - if (bytes[bytes.length - suffix.length + i] != suffix[i]) { - return false; + private static Optional<byte[]> getBytes(final byte[] del, final int lastByte, final byte[] buffer, + final int bufferIndex, final int readCount) { + if (readCount == lastByte && bufferIndex >= del.length) { + boolean matches = true; + for (int i = 0; i < del.length; i++) { + if (buffer[bufferIndex - del.length + i] != del[i]) { + matches = false; + break; + } + } + + if (matches) { + // Return undelimited data without creating new objects inside the loop + final byte[] undelimited = new byte[bufferIndex - del.length]; + System.arraycopy(buffer, 0, undelimited, 0, undelimited.length); + return Optional.of(undelimited); } } - return true; + // Return Optional.empty() to signify no match was found + return Optional.empty(); } private static byte[] delimiterBytes(final String value, final String encoding) { @@ -115,7 +133,7 @@ public static DelimitedRecordReader from(final Map<String, String> taskConfig) { Iterator<ConsumerRecord<byte[], byte[]>> readAll(final String topic, final int partition, final InputStream inputStream, final long startOffset) { return new Iterator<ConsumerRecord<byte[], byte[]>>() { - ConsumerRecord<byte[], byte[]> nextConsumerRecord; + Optional<ConsumerRecord<byte[], byte[]>> nextConsumerRecord; final BufferedInputStream buffered = new BufferedInputStream(inputStream); @@ -124,13 +142,13 @@ Iterator<ConsumerRecord<byte[], byte[]>> readAll(final String topic, final int p @Override public boolean hasNext() { try { - if (nextConsumerRecord == null) { - nextConsumerRecord = read(topic, partition, offset++, buffered); + if (nextConsumerRecord.isPresent()) { + nextConsumerRecord = ofNullable(read(topic, partition, offset++, buffered)); } } catch (IOException e) { throw new DataException(e); } - return nextConsumerRecord != null; + return nextConsumerRecord.isPresent(); } @Override @@ -138,8 +156,8 @@ public ConsumerRecord<byte[], byte[]> next() { if (!hasNext()) { throw new NoSuchElementException(); } - final ConsumerRecord<byte[], byte[]> record = this.nextConsumerRecord; - nextConsumerRecord = null; + final ConsumerRecord<byte[], byte[]> record = this.nextConsumerRecord.get(); + nextConsumerRecord = Optional.empty(); return record; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3FilesReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3FilesReader.java deleted file mode 100644 index 9685c951c..000000000 --- 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3FilesReader.java +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.s3.source; - -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.START_MARKER_KEY; - -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.NoSuchElementException; -import java.util.Optional; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.kafka.clients.consumer.ConsumerRecord; - -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; - -import com.amazonaws.AmazonClientException; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.model.ListObjectsRequest; -import com.amazonaws.services.s3.model.ObjectListing; -import com.amazonaws.services.s3.model.S3Object; -import com.amazonaws.services.s3.model.S3ObjectSummary; - -/** - * Helpers for reading records out of S3. Not thread safe. Records should be in order since S3 lists files in - * lexicographic order. It is strongly recommended that you use a unique key prefix per topic as there is no option to - * restrict this reader by topic. - * <p> - * NOTE: hasNext() on the returned iterators may throw AmazonClientException if there was a problem communicating with - * S3 or reading an object. Your code should catch AmazonClientException and implement back-off and retry as desired. - * <p> - * Any other exception should be considered a permanent failure. 
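The key layout this reader (and the iterator that replaces it below) depends on is the sink connector's `<topic>-<padded partition>-<padded start offset>.gz` naming. A minimal, standalone check of that pattern, using a made-up object key, can help when debugging keys that fail to parse; it is only an illustration and not part of the patch:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class KeyPatternCheck {
    // Same pattern as DEFAULT_PATTERN in the reader/iterator classes in this patch.
    private static final Pattern DEFAULT_PATTERN = Pattern.compile("(\\/|^)" // "/" or start of the key
            + "(?<topic>[^/]+?)-" // topic name (assumes no "/" in topic names)
            + "(?<partition>\\d{5})-" // zero-padded partition
            + "(?<offset>\\d{12})\\.gz$"); // zero-padded start offset, gzip suffix

    public static void main(final String[] args) {
        // Illustrative key only; real keys are produced by the sink connector.
        final Matcher matcher = DEFAULT_PATTERN.matcher("some-prefix/testtopic-00001-000000000042.gz");
        if (matcher.find()) {
            System.out.println(matcher.group("topic")); // testtopic
            System.out.println(Integer.parseInt(matcher.group("partition"))); // 1
            System.out.println(Long.parseLong(matcher.group("offset"))); // 42
        }
    }
}
```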
- */ -public class S3FilesReader implements Iterable<S3SourceRecord> { - - public static final Pattern DEFAULT_PATTERN = Pattern.compile("(\\/|^)" // match the / or the start of the key so we - // shouldn't have to worry about prefix - + "(?<topic>[^/]+?)-" // assuming no / in topic names - + "(?<partition>\\d{5})-" + "(?<offset>\\d{12})\\.gz$"); - - private final AmazonS3 s3Client; - - private final DelimitedRecordReader makeReader; - - private final Map<S3Partition, S3Offset> offsets; - - private final S3SourceConfig s3SourceConfig; - private final String bucketName; - private final String s3Prefix; - - public S3FilesReader(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, - final String s3Prefix, final Map<S3Partition, S3Offset> offsets, final DelimitedRecordReader recordReader) { - this.s3SourceConfig = s3SourceConfig; - this.offsets = Optional.ofNullable(offsets).orElseGet(HashMap::new); - this.s3Client = s3Client; - this.makeReader = recordReader; - this.bucketName = bucketName; - this.s3Prefix = s3Prefix; - } - - @Override - public Iterator<S3SourceRecord> iterator() { - return readAll(); - } - - public Iterator<S3SourceRecord> readAll() { - return new Iterator<>() { - String currentKey; - Iterator<S3ObjectSummary> nextFile; - Iterator<ConsumerRecord<byte[], byte[]>> iterator = Collections.emptyIterator(); - - // Initialize once by listing all objects matching the criteria - { - // Fetch all objects in one go - final ObjectListing objectListing = s3Client.listObjects(new ListObjectsRequest(bucketName, s3Prefix, - s3SourceConfig.getString(START_MARKER_KEY), null, s3SourceConfig.getInt(FETCH_PAGE_SIZE) * 2)); - - // Filter and collect the relevant object summaries - final List<S3ObjectSummary> chunks = new ArrayList<>(objectListing.getObjectSummaries()); - - // Set up the iterator for files - nextFile = chunks.iterator(); - } - - private void nextObject() { - if (!nextFile.hasNext()) { - iterator = Collections.emptyIterator(); - return; - } - - try { - final S3ObjectSummary file = nextFile.next(); - currentKey = file.getKey(); - try (InputStream content = getContent(s3Client.getObject(bucketName, currentKey))) { - iterator = parseKey(currentKey, (topic, partition, startOffset) -> makeReader.readAll(topic, - partition, content, startOffset)); - } - } catch (IOException e) { - throw new AmazonClientException(e); - } - } - - private InputStream getContent(final S3Object object) throws IOException { - return object.getObjectContent(); - } - - @Override - public boolean hasNext() { - while (!iterator.hasNext() && nextFile.hasNext()) { - nextObject(); - } - return iterator.hasNext(); - } - - @Override - public S3SourceRecord next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - final ConsumerRecord<byte[], byte[]> record = iterator.next(); - return new S3SourceRecord(S3Partition.from(bucketName, s3Prefix, record.topic(), record.partition()), - S3Offset.from(currentKey, record.offset()), record.topic(), record.partition(), record.key(), - record.value()); - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - }; - } - - private <T> T parseKey(final String key, final KeyConsumer<T> consumer) throws IOException { - final Matcher matcher = DEFAULT_PATTERN.matcher(key); - if (!matcher.find()) { - throw new IllegalArgumentException("Not a valid chunk filename! 
" + key); - } - final String topic = matcher.group("topic"); - final int partition = Integer.parseInt(matcher.group("partition")); - final long startOffset = Long.parseLong(matcher.group("offset")); - - return consumer.consume(topic, partition, startOffset); - } - - private interface KeyConsumer<T> { - T consume(String topic, int partition, long startOffset) throws IOException; - } - -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java index 4b44e1ab5..bbb8615ae 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java @@ -17,6 +17,7 @@ package io.aiven.kafka.connect.s3.source; import java.util.Map; +import java.util.Objects; public class S3Offset implements Comparable<S3Offset> { @@ -47,4 +48,23 @@ public int compareTo(final S3Offset s3Offset) { final int compareTo = s3key.compareTo(s3Offset.s3key); return compareTo == 0 ? (int) (offset - s3Offset.offset) : compareTo; } + + // Overriding equals to ensure consistency with compareTo + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + final S3Offset other = (S3Offset) obj; + return offset == other.offset && Objects.equals(s3key, other.s3key); + } + + // Overriding hashCode to match equals + @Override + public int hashCode() { + return Objects.hash(s3key, offset); + } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java new file mode 100644 index 000000000..3e68455cd --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java @@ -0,0 +1,162 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.START_MARKER_KEY; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.kafka.clients.consumer.ConsumerRecord; + +import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import com.amazonaws.AmazonClientException; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.ListObjectsRequest; +import com.amazonaws.services.s3.model.ObjectListing; +import com.amazonaws.services.s3.model.S3Object; +import com.amazonaws.services.s3.model.S3ObjectSummary; + +public final class S3SourceRecordIterator implements Iterator<S3SourceRecord> { + + public static final Pattern DEFAULT_PATTERN = Pattern.compile("(\\/|^)" // match the / or the start of the key so we + // shouldn't have to worry about prefix + + "(?<topic>[^/]+?)-" // assuming no / in topic names + + "(?<partition>\\d{5})-" + "(?<offset>\\d{12})\\.gz$"); + private String currentKey; + private Iterator<S3ObjectSummary> nextFileIterator; + private Iterator<ConsumerRecord<byte[], byte[]>> recordIterator = Collections.emptyIterator(); + + private final DelimitedRecordReader makeReader; + + private final Map<S3Partition, S3Offset> offsets; + + private final S3SourceConfig s3SourceConfig; + private final String bucketName; + private final String s3Prefix; + private final AmazonS3 s3Client; + + public S3SourceRecordIterator(final S3SourceConfig s3SourceConfig, final String bucketName, final String s3Prefix, + final Map<S3Partition, S3Offset> offsets, final DelimitedRecordReader recordReader) { + this.s3SourceConfig = s3SourceConfig; + this.offsets = Optional.ofNullable(offsets).orElseGet(HashMap::new); + this.makeReader = recordReader; + final S3ClientFactory s3ClientFactory = new S3ClientFactory(); + this.s3Client = s3ClientFactory.createAmazonS3Client(s3SourceConfig); + this.bucketName = bucketName; + this.s3Prefix = s3Prefix; + try { + final List<S3ObjectSummary> chunks = fetchObjectSummaries(s3Client); + nextFileIterator = chunks.iterator(); + } catch (IOException e) { + throw new AmazonClientException("Failed to initialize S3 file reader", e); + } + } + + private List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOException { + final ObjectListing objectListing = s3Client.listObjects(new ListObjectsRequest().withBucketName(bucketName) + .withPrefix(s3Prefix) + .withMarker(s3SourceConfig.getString(START_MARKER_KEY)) + .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * 2)); + + return new ArrayList<>(objectListing.getObjectSummaries()); + } + + private void nextObject() { + if (!nextFileIterator.hasNext()) { + recordIterator = Collections.emptyIterator(); + return; + } + + try { + final S3ObjectSummary file = nextFileIterator.next(); + currentKey = file.getKey(); + recordIterator = createIteratorForCurrentFile(); + } catch (IOException e) { + throw new AmazonClientException(e); + } + } + + private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() throws IOException { + 
final S3Object s3Object = s3Client.getObject(bucketName, currentKey); + try (InputStream content = getContent(s3Object)) { + return parseKey(currentKey, + (topic, partition, startOffset) -> makeReader.readAll(topic, partition, content, startOffset)); + } + } + + private InputStream getContent(final S3Object object) throws IOException { + return object.getObjectContent(); + } + + private S3Offset offset() { + return offsets.get(S3Partition.from(bucketName, s3Prefix, "", 0)); + } + + @Override + public boolean hasNext() { + while (!recordIterator.hasNext() && nextFileIterator.hasNext()) { + nextObject(); + } + return recordIterator.hasNext(); + } + + @Override + public S3SourceRecord next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + final ConsumerRecord<byte[], byte[]> record = recordIterator.next(); + return new S3SourceRecord(S3Partition.from(bucketName, s3Prefix, record.topic(), record.partition()), + S3Offset.from(currentKey, record.offset()), record.topic(), record.partition(), record.key(), + record.value()); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + private <T> T parseKey(final String key, final KeyConsumer<T> consumer) throws IOException { + final Matcher matcher = DEFAULT_PATTERN.matcher(key); + if (!matcher.find()) { + throw new IllegalArgumentException("Not a valid chunk filename! " + key); + } + final String topic = matcher.group("topic"); + final int partition = Integer.parseInt(matcher.group("partition")); + final long startOffset = Long.parseLong(matcher.group("offset")); + + return consumer.consume(topic, partition, startOffset); + } +} + +interface KeyConsumer<T> { + T consume(String topic, int partition, long startOffset) throws IOException; +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 32bc49bdd..365b71441 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -38,16 +38,8 @@ import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.source.SourceTask; -import io.aiven.kafka.connect.s3.source.config.AwsCredentialProviderFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import com.amazonaws.PredefinedClientConfigurations; -import com.amazonaws.client.builder.AwsClientBuilder; -import com.amazonaws.retry.PredefinedBackoffStrategies; -import com.amazonaws.retry.PredefinedRetryPolicies; -import com.amazonaws.retry.RetryPolicy; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3ClientBuilder; import com.amazonaws.services.s3.model.AmazonS3Exception; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -64,8 +56,6 @@ public class S3SourceTask extends SourceTask { private Map<S3Partition, S3Offset> offsets; - private AmazonS3 s3Client; - Iterator<S3SourceRecord> sourceRecordIterator; private final AtomicBoolean stopped = new AtomicBoolean(); @@ -74,8 +64,6 @@ public class S3SourceTask extends SourceTask { private final static long ERROR_BACKOFF = 1000L; - AwsCredentialProviderFactory credentialFactory = new AwsCredentialProviderFactory(); - @SuppressWarnings("PMD.UnnecessaryConstructor") // required by Connect public S3SourceTask() { super(); @@ -92,8 +80,7 @@ public void start(final Map<String, String> props) { 
Objects.requireNonNull(props, "props hasn't been set"); s3SourceConfig = new S3SourceConfig(props); - s3Client = createAmazonS3Client(s3SourceConfig); - LOGGER.info("S3 client initialized " + s3Client.getBucketLocation("")); + LOGGER.info("S3 client initialized "); prepareReaderFromOffsetStorageReader(); } @@ -132,8 +119,8 @@ private void prepareReaderFromOffsetStorageReader() { .map(Object::toString) .map(s -> s.getBytes(parseEncoding(s3SourceConfig, "key.encoding"))); - sourceRecordIterator = new S3FilesReader(s3SourceConfig, s3Client, s3Bucket, s3Prefix, offsets, - new DelimitedRecordReader(valueDelimiter, keyDelimiter)).readAll(); + sourceRecordIterator = new S3SourceRecordIterator(s3SourceConfig, s3Bucket, s3Prefix, offsets, + new DelimitedRecordReader(valueDelimiter, keyDelimiter)); } private Set<Integer> getPartitions() { @@ -161,32 +148,6 @@ private Charset parseEncoding(final S3SourceConfig s3SourceConfig, final String .orElse(StandardCharsets.UTF_8); } - private AmazonS3 createAmazonS3Client(final S3SourceConfig config) { - final var awsEndpointConfig = newEndpointConfiguration(this.s3SourceConfig); - final var clientConfig = PredefinedClientConfigurations.defaultConfig() - .withRetryPolicy(new RetryPolicy(PredefinedRetryPolicies.DEFAULT_RETRY_CONDITION, - new PredefinedBackoffStrategies.FullJitterBackoffStrategy( - Math.toIntExact(config.getS3RetryBackoffDelayMs()), - Math.toIntExact(config.getS3RetryBackoffMaxDelayMs())), - config.getS3RetryBackoffMaxRetries(), false)); - final var s3ClientBuilder = AmazonS3ClientBuilder.standard() - .withCredentials(credentialFactory.getProvider(config)) - .withClientConfiguration(clientConfig); - if (Objects.isNull(awsEndpointConfig)) { - s3ClientBuilder.withRegion(config.getAwsS3Region().getName()); - } else { - s3ClientBuilder.withEndpointConfiguration(awsEndpointConfig).withPathStyleAccessEnabled(true); - } - return s3ClientBuilder.build(); - } - - private AwsClientBuilder.EndpointConfiguration newEndpointConfiguration(final S3SourceConfig config) { - if (Objects.isNull(config.getAwsS3EndPoint())) { - return null; - } - return new AwsClientBuilder.EndpointConfiguration(config.getAwsS3EndPoint(), config.getAwsS3Region().getName()); - } - @Override public List<SourceRecord> poll() throws InterruptedException { // read up to the configured poll size diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsAccessSecret.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsAccessSecret.java index 2e9d2ac55..503998fc8 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsAccessSecret.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsAccessSecret.java @@ -1,5 +1,5 @@ /* - * Copyright 2021 Aiven Oy + * Copyright 2024 Aiven Oy * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java new file mode 100644 index 000000000..a9edbbc61 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java @@ -0,0 +1,58 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.config; + +import java.util.Objects; + +import com.amazonaws.PredefinedClientConfigurations; +import com.amazonaws.client.builder.AwsClientBuilder; +import com.amazonaws.retry.PredefinedBackoffStrategies; +import com.amazonaws.retry.PredefinedRetryPolicies; +import com.amazonaws.retry.RetryPolicy; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.AmazonS3ClientBuilder; + +public class S3ClientFactory { + + private final AwsCredentialProviderFactory credentialFactory = new AwsCredentialProviderFactory(); + + public AmazonS3 createAmazonS3Client(final S3SourceConfig config) { + final var awsEndpointConfig = newEndpointConfiguration(config); + final var clientConfig = PredefinedClientConfigurations.defaultConfig() + .withRetryPolicy(new RetryPolicy(PredefinedRetryPolicies.DEFAULT_RETRY_CONDITION, + new PredefinedBackoffStrategies.FullJitterBackoffStrategy( + Math.toIntExact(config.getS3RetryBackoffDelayMs()), + Math.toIntExact(config.getS3RetryBackoffMaxDelayMs())), + config.getS3RetryBackoffMaxRetries(), false)); + final var s3ClientBuilder = AmazonS3ClientBuilder.standard() + .withCredentials(credentialFactory.getProvider(config)) + .withClientConfiguration(clientConfig); + if (Objects.isNull(awsEndpointConfig)) { + s3ClientBuilder.withRegion(config.getAwsS3Region().getName()); + } else { + s3ClientBuilder.withEndpointConfiguration(awsEndpointConfig).withPathStyleAccessEnabled(true); + } + return s3ClientBuilder.build(); + } + + private AwsClientBuilder.EndpointConfiguration newEndpointConfiguration(final S3SourceConfig config) { + if (Objects.isNull(config.getAwsS3EndPoint())) { + return null; + } + return new AwsClientBuilder.EndpointConfiguration(config.getAwsS3EndPoint(), config.getAwsS3Region().getName()); + } +} From 4d627909a82e3c0c7998920369c4f1f36372e0e1 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Fri, 13 Sep 2024 12:34:39 +0200 Subject: [PATCH 08/90] Updating test --- .../connect/s3/source/IntegrationBase.java | 35 ++++++++++++++++ .../connect/s3/source/IntegrationTest.java | 42 ++++++++++++++++--- .../kafka/connect/s3/source/S3SourceTask.java | 2 +- 3 files changed, 72 insertions(+), 7 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index c43401457..635d0fcd3 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -18,8 +18,11 @@ import java.io.File; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Properties; import java.util.concurrent.ExecutionException; @@ -28,6 +31,11 @@ import org.apache.kafka.clients.admin.AdminClient; import 
org.apache.kafka.clients.admin.AdminClientConfig; import org.apache.kafka.clients.admin.NewTopic; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; import com.amazonaws.auth.AWSStaticCredentialsProvider; import com.amazonaws.auth.BasicAWSCredentials; @@ -112,4 +120,31 @@ static LocalStackContainer createS3Container() { return new LocalStackContainer(DockerImageName.parse("localstack/localstack:2.0.2")) .withServices(LocalStackContainer.Service.S3); } + + static List<String> consumeMessages(final String topic, final int expectedMessageCount, + final KafkaContainer kafka) { + final Properties props = new Properties(); + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, kafka.getBootstrapServers()); + props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group"); + props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); + props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + + try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) { + consumer.subscribe(Collections.singletonList(topic)); + final List<String> messages = new ArrayList<>(); + + // Poll messages from the topic + while (messages.size() < expectedMessageCount) { + final ConsumerRecords<byte[], byte[]> records = consumer.poll(5L); + for (final ConsumerRecord<byte[], byte[]> record : records) { + messages.add(new String(record.value(), StandardCharsets.UTF_8)); // Convert message from bytes to + // string for easy + // verification + } + } + + return messages; + } + } } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 7ae37197a..4f6145fe8 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -20,6 +20,10 @@ import java.io.File; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.util.HashMap; @@ -32,6 +36,7 @@ import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.PutObjectRequest; import org.junit.Ignore; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -62,33 +67,35 @@ final class IntegrationTest implements IntegrationBase { public static final LocalStackContainer LOCALSTACK = IntegrationBase.createS3Container(); @Container - private static final KafkaContainer KAFKA = IntegrationBase.createKafkaContainer(); + private static final KafkaContainer KAFKA_CONTAINER = IntegrationBase.createKafkaContainer(); private AdminClient adminClient; private ConnectRunner connectRunner; + private static AmazonS3 s3Client; + @BeforeAll static void setUpAll() throws IOException, InterruptedException { s3Prefix = COMMON_PREFIX + 
ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + "/"; - final AmazonS3 s3Client = IntegrationBase.createS3Client(LOCALSTACK); + s3Client = IntegrationBase.createS3Client(LOCALSTACK); s3Endpoint = LOCALSTACK.getEndpoint().toString(); testBucketAccessor = new BucketAccessor(s3Client, TEST_BUCKET_NAME); pluginDir = IntegrationBase.getPluginDir(); IntegrationBase.extractConnectorPlugin(pluginDir); - IntegrationBase.waitForRunningContainer(KAFKA); + IntegrationBase.waitForRunningContainer(KAFKA_CONTAINER); } @BeforeEach void setUp(final TestInfo testInfo) throws ExecutionException, InterruptedException { testBucketAccessor.createBucket(); - adminClient = newAdminClient(KAFKA); + adminClient = newAdminClient(KAFKA_CONTAINER); final var topicName = IntegrationBase.topicName(testInfo); final var topics = List.of(topicName); IntegrationBase.createTopics(adminClient, topics); - connectRunner = newConnectRunner(KAFKA, pluginDir, OFFSET_FLUSH_INTERVAL_MS); + connectRunner = newConnectRunner(KAFKA_CONTAINER, pluginDir, OFFSET_FLUSH_INTERVAL_MS); connectRunner.start(); } @@ -102,12 +109,28 @@ void tearDown() { } @Test - void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedException { + void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectRunner.createConnector(connectorConfig); + // Create a new object on the bucket + // final String testObjectKey = s3Prefix + "test-file.txt"; + final String testData = "Hello, Kafka Connect S3 Source!"; + + final Path testFilePath = Paths.get("/tmp/test-file.txt"); + Files.write(testFilePath, testData.getBytes(StandardCharsets.UTF_8)); + + saveToS3(TEST_BUCKET_NAME, "", "test.txt", testFilePath.toFile()); + + // Verify that the connector is correctly set up assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); + + // Poll messages from the Kafka topic and verify the consumed data + final List<String> records = IntegrationBase.consumeMessages(topicName, 1, KAFKA_CONTAINER); + + // Verify that the correct data is read from the S3 bucket and pushed to Kafka + assertThat(records).containsExactly(testData); } private Map<String, String> getConfig(final Map<String, String> config, final String topicName) { @@ -128,4 +151,11 @@ private Map<String, String> basicConnectorConfig(final String connectorName) { config.put("tasks.max", "1"); return config; } + + public static void saveToS3(final String bucketName, final String folderName, final String fileNameInS3, + final File fileToWrite) { + final PutObjectRequest request = new PutObjectRequest(bucketName, folderName + fileNameInS3, fileToWrite); + s3Client.putObject(request); + // assertThat(putObj.getMetadata() + } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 365b71441..bddb8fea9 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -71,7 +71,7 @@ public S3SourceTask() { @Override public String version() { - return null; + return Version.VERSION; } @Override From f9b5041366a6548829df1d8923e58e9598c82e01 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani 
<muralidhar.basani@aiven.io> Date: Fri, 13 Sep 2024 18:10:18 +0200 Subject: [PATCH 09/90] Update tests --- .../connect/s3/source/ConnectRunner.java | 11 +- .../connect/s3/source/IntegrationTest.java | 25 +- .../kafka/connect/s3/source/S3SourceTask.java | 24 +- .../s3/source/config/S3SourceConfig.java | 259 +++++++++++++++++- 4 files changed, 301 insertions(+), 18 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java index 593705dd1..fbe1ad97a 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java @@ -19,6 +19,7 @@ import static org.assertj.core.api.Assertions.assertThat; import java.io.File; +import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.concurrent.ExecutionException; @@ -55,8 +56,9 @@ public ConnectRunner(final File pluginDir, final String bootstrapServers, final this.offsetFlushInterval = offsetFlushIntervalMs; } - void start() { + void start() throws IOException { final Map<String, String> workerProps = new HashMap<>(); + final File tempFile = File.createTempFile("connect", "offsets"); workerProps.put("bootstrap.servers", bootstrapServers); workerProps.put("offset.flush.interval.ms", Integer.toString(offsetFlushInterval)); @@ -65,12 +67,11 @@ void start() { workerProps.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); workerProps.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); workerProps.put("internal.key.converter", "org.apache.kafka.connect.json.JsonConverter"); - workerProps.put("internal.key.converter.schemas.enable", "false"); + workerProps.put("internal.key.converter.schemas.enable", "true"); workerProps.put("internal.value.converter", "org.apache.kafka.connect.json.JsonConverter"); - workerProps.put("internal.value.converter.schemas.enable", "false"); + workerProps.put("internal.value.converter.schemas.enable", "true"); - // Don't need it since we'll memory MemoryOffsetBackingStore. 
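The runner now points `offset.storage.file.filename` at a real temporary file instead of an empty string, presumably because the embedded worker is expected to persist offsets to a file rather than an in-memory store. A small sketch of that setup, with cleanup on JVM exit, might look like the following; it is not part of the patch and the property values are placeholders:

```java
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class WorkerPropsSketch {
    static Map<String, String> workerProps(final String bootstrapServers, final File pluginDir) throws IOException {
        final File offsets = File.createTempFile("connect", ".offsets");
        offsets.deleteOnExit(); // remove the per-run offsets file when the test JVM exits

        final Map<String, String> props = new HashMap<>();
        props.put("bootstrap.servers", bootstrapServers);
        props.put("offset.flush.interval.ms", "500");
        props.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter");
        props.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter");
        props.put("offset.storage.file.filename", offsets.getCanonicalPath());
        props.put("plugin.path", pluginDir.getPath());
        return props;
    }
}
```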
- workerProps.put("offset.storage.file.filename", ""); + workerProps.put("offset.storage.file.filename", tempFile.getCanonicalPath()); workerProps.put("plugin.path", pluginDir.getPath()); diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 4f6145fe8..a842acf65 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -16,6 +16,9 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.START_MARKER_KEY; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TOPIC_PARTITIONS_KEY; import static org.assertj.core.api.Assertions.assertThat; import java.io.File; @@ -53,7 +56,10 @@ final class IntegrationTest implements IntegrationBase { private static final String CONNECTOR_NAME = "aiven-s3-source-connector"; private static final String COMMON_PREFIX = "s3-source-connector-for-apache-kafka-test-"; - private static final int OFFSET_FLUSH_INTERVAL_MS = 5000; + private static final int OFFSET_FLUSH_INTERVAL_MS = 500; + + private static final String S3_ACCESS_KEY_ID = "test-key-id0"; + private static final String S3_SECRET_ACCESS_KEY = "test_secret_key0"; private static final String TEST_BUCKET_NAME = "test-bucket0"; @@ -87,7 +93,7 @@ static void setUpAll() throws IOException, InterruptedException { } @BeforeEach - void setUp(final TestInfo testInfo) throws ExecutionException, InterruptedException { + void setUp(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { testBucketAccessor.createBucket(); adminClient = newAdminClient(KAFKA_CONTAINER); @@ -112,6 +118,7 @@ void tearDown() { void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + connectRunner.createConnector(connectorConfig); // Create a new object on the bucket @@ -140,6 +147,20 @@ private Map<String, String> getConfig(final Map<String, String> config, final St private Map<String, String> getConfig(final Map<String, String> config, final List<String> topicNames) { config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); config.put("topics", String.join(",", topicNames)); + config.put("aws.access.key.id", S3_ACCESS_KEY_ID); + config.put("aws.secret.access.key", S3_SECRET_ACCESS_KEY); + config.put("aws.s3.endpoint", s3Endpoint); + config.put(AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET_NAME); + config.put("aws.s3.prefix", s3Prefix); + config.put(START_MARKER_KEY, COMMON_PREFIX); + + config.put(TOPIC_PARTITIONS_KEY, "1,2"); + + config.put("key.delimiter", "\\t"); + config.put("key.encoding", "UTF-8"); + config.put("value.delimiter", "\\n"); + config.put("value.encoding", "UTF-8"); + return config; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index bddb8fea9..32b8df68d 100644 --- 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -60,7 +60,7 @@ public class S3SourceTask extends SourceTask { private final AtomicBoolean stopped = new AtomicBoolean(); - private final static long S_3_POLL_INTERVAL = 10_000L; + private final static long S_3_POLL_INTERVAL_MS = 10_000L; private final static long ERROR_BACKOFF = 1000L; @@ -86,7 +86,7 @@ public void start(final Map<String, String> props) { private void prepareReaderFromOffsetStorageReader() { final String s3Prefix = s3SourceConfig.getString("aws.s3.prefix"); - final String s3Bucket = s3SourceConfig.getString("aws.s3.bucket"); + final String s3Bucket = s3SourceConfig.getString("aws.s3.bucket.name"); final Set<Integer> partitionList = getPartitions(); final Set<String> topics = getTopics(); @@ -96,9 +96,22 @@ private void prepareReaderFromOffsetStorageReader() { .flatMap(p -> topics.stream().map(t -> S3Partition.from(s3Bucket, s3Prefix, t, p))) .collect(toList()); + // List<String> topicPartitions = Arrays.asList("1","2"); + // List<Map<String, String>> offsetPartitions = topicPartitions.stream().map( + // tp -> { + // HashMap<String, String> offsetInfo = new HashMap<>(); + // offsetInfo.put("source", tp); + // offsetInfo.put("targetPrefix", "targetTopicPrefix"); + // return offsetInfo; + // } + // ).collect(Collectors.toList()); + // final Map<Map<String, String>, Map<String, Object>> offsetMap = + // context.offsetStorageReader().offsets(offsetPartitions); + // get partition offsets + final List<Map<String, Object>> partitions = s3Partitions.stream().map(S3Partition::asMap).collect(toList()); final Map<Map<String, Object>, Map<String, Object>> offsetMap = context.offsetStorageReader() - .offsets(s3Partitions.stream().map(S3Partition::asMap).collect(toList())); + .offsets(partitions); if (offsets == null) { offsets = offsetMap.entrySet() @@ -108,8 +121,6 @@ private void prepareReaderFromOffsetStorageReader() { toMap(entry -> S3Partition.from(entry.getKey()), entry -> S3Offset.from(entry.getValue()))); } - LOGGER.info("{} reading from S3 with offsets {}", s3SourceConfig.getString("name"), offsets); - final byte[] valueDelimiter = Optional.ofNullable(s3SourceConfig.getString("value.delimiter")) .map(Object::toString) .orElse("\n") @@ -179,7 +190,7 @@ private List<SourceRecord> getSourceRecords(final List<SourceRecord> results) th while (!sourceRecordIterator.hasNext() && !stopped.get()) { LOGGER.debug("Blocking until new S3 files are available."); // sleep and block here until new files are available - Thread.sleep(S_3_POLL_INTERVAL); + Thread.sleep(S_3_POLL_INTERVAL_MS); prepareReaderFromOffsetStorageReader(); } @@ -199,5 +210,6 @@ private List<SourceRecord> getSourceRecords(final List<SourceRecord> results) th @Override public void stop() { + this.stopped.set(true); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 8bc4c3b5d..a8ea2c525 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -16,17 +16,26 @@ package io.aiven.kafka.connect.s3.source.config; -import java.util.Collections; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; import java.util.Map; 
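For orientation, the keys defined in the config class below are the same ones the integration test exercises above. A minimal property map it is meant to parse might look like this sketch; every value is a placeholder (the endpoint and credentials only make sense against LocalStack), and it is not taken from the patch:

```java
import java.util.HashMap;
import java.util.Map;

public class SourceConfigSketch {
    static Map<String, String> exampleProps() {
        final Map<String, String> props = new HashMap<>();
        props.put("aws.access.key.id", "test-key-id0"); // placeholder, LocalStack does not validate it
        props.put("aws.secret.access.key", "test_secret_key0"); // placeholder
        props.put("aws.s3.endpoint", "http://localhost:4566"); // illustrative LocalStack endpoint
        props.put("aws.s3.bucket.name", "test-bucket0");
        props.put("aws.s3.prefix", "s3-source-connector-for-apache-kafka-test-");
        props.put("aws.s3.start.marker", "s3-source-connector-for-apache-kafka-test-");
        props.put("offset.storage.topic.partitions", "1,2");
        props.put("topics", "testtopic");
        props.put("key.delimiter", "\\t");
        props.put("key.encoding", "UTF-8");
        props.put("value.delimiter", "\\n");
        props.put("value.encoding", "UTF-8");
        // new S3SourceConfig(props) would then be handed to the task by the connector.
        return props;
    }
}
```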
import java.util.Objects; +import java.util.regex.Pattern; +import java.util.stream.Collectors; import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.common.config.ConfigException; + +import io.aiven.kafka.connect.common.config.validators.NonEmptyPassword; +import io.aiven.kafka.connect.common.config.validators.UrlValidator; import com.amazonaws.auth.AWSCredentialsProvider; import com.amazonaws.regions.Region; import com.amazonaws.regions.RegionUtils; import com.amazonaws.regions.Regions; +import com.amazonaws.services.s3.internal.BucketNameUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -62,15 +71,40 @@ final public class S3SourceConfig extends AbstractConfig { public static final String AWS_STS_ROLE_SESSION_DURATION = "aws.sts.role.session.duration"; public static final String AWS_STS_CONFIG_ENDPOINT = "aws.sts.config.endpoint"; + private static final String GROUP_AWS = "AWS"; + private static final String GROUP_AWS_STS = "AWS_STS"; + + private static final String GROUP_OFFSET_TOPIC = "OFFSET_TOPIC"; + + private static final String GROUP_FILE = "FILE_SPECIFIC"; + + private static final String GROUP_S3_RETRY_BACKOFF_POLICY = "S3 retry backoff policy"; + + // Default values from AWS SDK, since they are hidden + public static final int AWS_S3_RETRY_BACKOFF_DELAY_MS_DEFAULT = 100; + public static final int AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_DEFAULT = 20_000; + @Deprecated public static final String AWS_SECRET_ACCESS_KEY = "aws_secret_access_key"; + @Deprecated + public static final String AWS_S3_PREFIX_CONFIG = "aws.s3.prefix"; + + @Deprecated + public static final String AWS_S3_PREFIX = "aws_s3_prefix"; + public static final String AWS_ACCESS_KEY_ID_CONFIG = "aws.access.key.id"; public static final String AWS_SECRET_ACCESS_KEY_CONFIG = "aws.secret.access.key"; public static final String AWS_CREDENTIALS_PROVIDER_CONFIG = "aws.credentials.provider"; - public static final String TOPIC_PARTITIONS_KEY = "topic.assigned.partitions"; + public static final String AWS_CREDENTIAL_PROVIDER_DEFAULT = "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"; + + public static final String AWS_S3_BUCKET_NAME_CONFIG = "aws.s3.bucket.name"; + + public static final String AWS_S3_SSE_ALGORITHM_CONFIG = "aws.s3.sse.algorithm"; + + public static final String TOPIC_PARTITIONS_KEY = "offset.storage.topic.partitions"; public static final String TOPICS_KEY = "topics"; public static final String START_MARKER_KEY = "aws.s3.start.marker"; @@ -78,18 +112,233 @@ final public class S3SourceConfig extends AbstractConfig { public static final String MAX_POLL_RECORDS = "max.poll.records"; + public static final String KEY_DELIMITER = "key.delimiter"; + + public static final String KEY_ENCODING = "key.encoding"; + + public static final String VALUE_DELIMITER = "value.delimiter"; + + public static final String VALUE_ENCODING = "value.encoding"; + + public static final int S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT = 3; + public S3SourceConfig(final Map<String, String> properties) { super(configDef(), preprocessProperties(properties)); validate(); // NOPMD ConstructorCallsOverridableMethod getStsRole is called } static Map<String, String> preprocessProperties(final Map<String, String> properties) { - LOGGER.info("preprocessProperties " + properties); - return Collections.emptyMap(); + // Add other preprocessings when needed here. Mind the order. 
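The preprocessing delegated to `handleDeprecatedYyyyUppercase` further below rewrites the deprecated `{{timestamp:unit=YYYY}}` placeholder to its lowercase form. Its effect can be reproduced in isolation with the same regex; the template string here is only an example and not taken from the patch:

```java
import java.util.regex.Pattern;

public class YyyyRewriteCheck {
    public static void main(final String[] args) {
        final Pattern unitYyyyPattern = Pattern.compile("\\{\\{\\s*timestamp\\s*:\\s*unit\\s*=\\s*YYYY\\s*}}");
        final String template = "cluster-1/{{timestamp:unit=YYYY}}/"; // illustrative prefix template
        final String rewritten = unitYyyyPattern.matcher(template)
                .replaceAll(matchResult -> matchResult.group().replace("YYYY", "yyyy"));
        System.out.println(rewritten); // cluster-1/{{timestamp:unit=yyyy}}/
    }
}
```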
+ return handleDeprecatedYyyyUppercase(properties); + } + + private static Map<String, String> handleDeprecatedYyyyUppercase(final Map<String, String> properties) { + if (!properties.containsKey(AWS_S3_PREFIX_CONFIG) && !properties.containsKey(AWS_S3_PREFIX)) { + return properties; + } + + final var result = new HashMap<>(properties); + for (final var prop : List.of(AWS_S3_PREFIX_CONFIG, AWS_S3_PREFIX)) { + if (properties.containsKey(prop)) { + String template = properties.get(prop); + final String originalTemplate = template; + + final var unitYyyyPattern = Pattern.compile("\\{\\{\\s*timestamp\\s*:\\s*unit\\s*=\\s*YYYY\\s*}}"); + template = unitYyyyPattern.matcher(template) + .replaceAll(matchResult -> matchResult.group().replace("YYYY", "yyyy")); + + if (!template.equals(originalTemplate)) { + LOGGER.warn("{{timestamp:unit=YYYY}} is no longer supported, " + + "please use {{timestamp:unit=yyyy}} instead. " + "It was automatically replaced: {}", + template); + } + + result.put(prop, template); + } + } + return result; } public static ConfigDef configDef() { - return new S3SourceConfigDef(); + final var configDef = new S3SourceConfigDef(); + addOffsetStorageConfig(configDef); + addAwsStsConfigGroup(configDef); + addAwsConfigGroup(configDef); + addDeprecatedConfiguration(configDef); + addFileConfiguration(configDef); + addS3RetryPolicies(configDef); + addOtherConfig(configDef); + return configDef; + } + + private static void addOtherConfig(final S3SourceConfigDef configDef) { + int awsOtherGroupCounter = 0; + configDef.define(FETCH_PAGE_SIZE, ConfigDef.Type.INT, 10, ConfigDef.Range.atLeast(1), + ConfigDef.Importance.MEDIUM, "Fetch page size", GROUP_AWS_STS, awsOtherGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, FETCH_PAGE_SIZE); + configDef.define(MAX_POLL_RECORDS, ConfigDef.Type.INT, 500, ConfigDef.Range.atLeast(1), + ConfigDef.Importance.MEDIUM, "Max poll records", GROUP_AWS_STS, awsOtherGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, MAX_POLL_RECORDS); + } + + private static void addAwsStsConfigGroup(final ConfigDef configDef) { + int awsStsGroupCounter = 0; + configDef.define(AWS_STS_ROLE_ARN, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "AWS STS Role", GROUP_AWS_STS, awsStsGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, AWS_STS_ROLE_ARN); + + configDef.define(AWS_STS_ROLE_SESSION_NAME, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "AWS STS Session name", GROUP_AWS_STS, awsStsGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, AWS_STS_ROLE_SESSION_NAME); + + configDef.define(AWS_STS_ROLE_SESSION_DURATION, ConfigDef.Type.INT, 3600, + ConfigDef.Range.between(AwsStsRole.MIN_SESSION_DURATION, AwsStsRole.MAX_SESSION_DURATION), + ConfigDef.Importance.MEDIUM, "AWS STS Session duration", GROUP_AWS_STS, awsStsGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, AWS_STS_ROLE_SESSION_DURATION); + + configDef.define(AWS_STS_ROLE_EXTERNAL_ID, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "AWS STS External Id", GROUP_AWS_STS, awsStsGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, AWS_STS_ROLE_EXTERNAL_ID); + + configDef.define(AWS_STS_CONFIG_ENDPOINT, ConfigDef.Type.STRING, AwsStsEndpointConfig.AWS_STS_GLOBAL_ENDPOINT, + new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "AWS STS Config Endpoint", GROUP_AWS_STS, + 
awsStsGroupCounter++, // NOPMD UnusedAssignment + ConfigDef.Width.NONE, AWS_STS_CONFIG_ENDPOINT); + } + + private static void addS3RetryPolicies(final ConfigDef configDef) { + var retryPolicyGroupCounter = 0; + configDef.define(AWS_S3_RETRY_BACKOFF_DELAY_MS_CONFIG, ConfigDef.Type.LONG, + AWS_S3_RETRY_BACKOFF_DELAY_MS_DEFAULT, ConfigDef.Range.atLeast(1L), ConfigDef.Importance.MEDIUM, + "S3 default base sleep time for non-throttled exceptions in milliseconds. " + "Default is " + + AWS_S3_RETRY_BACKOFF_DELAY_MS_DEFAULT + ".", + GROUP_S3_RETRY_BACKOFF_POLICY, retryPolicyGroupCounter++, // NOPMD UnusedAssignment + ConfigDef.Width.NONE, AWS_S3_RETRY_BACKOFF_DELAY_MS_CONFIG); + configDef.define(AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_CONFIG, ConfigDef.Type.LONG, + AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_DEFAULT, ConfigDef.Range.atLeast(1L), ConfigDef.Importance.MEDIUM, + "S3 maximum back-off time before retrying a request in milliseconds. " + "Default is " + + AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_DEFAULT + ".", + GROUP_S3_RETRY_BACKOFF_POLICY, retryPolicyGroupCounter++, // NOPMD UnusedAssignment + ConfigDef.Width.NONE, AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_CONFIG); + configDef.define(AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG, ConfigDef.Type.INT, + S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT, ConfigDef.Range.between(1L, 30), ConfigDef.Importance.MEDIUM, + "Maximum retry limit " + "(if the value is greater than 30, " + + "there can be integer overflow issues during delay calculation). " + "Default is " + + S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT + ".", + GROUP_S3_RETRY_BACKOFF_POLICY, retryPolicyGroupCounter++, // NOPMD UnusedAssignment + ConfigDef.Width.NONE, AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG); + } + + private static void addFileConfiguration(final S3SourceConfigDef configDef) { + configDef.define(KEY_DELIMITER, ConfigDef.Type.STRING, "\\t", new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "eg : \n", GROUP_FILE, 0, ConfigDef.Width.NONE, KEY_DELIMITER); + configDef.define(KEY_ENCODING, ConfigDef.Type.STRING, "UTF-8", new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "eg : UTF-8", GROUP_FILE, 1, ConfigDef.Width.NONE, KEY_ENCODING); + configDef.define(VALUE_DELIMITER, ConfigDef.Type.STRING, "\\n", new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "eg : \t", GROUP_FILE, 2, ConfigDef.Width.NONE, VALUE_DELIMITER); + configDef.define(VALUE_ENCODING, ConfigDef.Type.STRING, "UTF-8", new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "eg : UTF-8", GROUP_FILE, 3, ConfigDef.Width.NONE, VALUE_ENCODING); + } + + private static void addOffsetStorageConfig(final ConfigDef configDef) { + configDef.define(TOPIC_PARTITIONS_KEY, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "eg : 1,2", GROUP_OFFSET_TOPIC, 0, ConfigDef.Width.NONE, + TOPIC_PARTITIONS_KEY); + configDef.define(TOPICS_KEY, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "eg : testtopic", GROUP_OFFSET_TOPIC, 0, ConfigDef.Width.NONE, TOPICS_KEY); + } + + private static void addDeprecatedConfiguration(final ConfigDef configDef) { + configDef.define(AWS_S3_PREFIX_CONFIG, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, + "[Deprecated] Use `file.name.template` instead. Prefix for stored objects, e.g. 
cluster-1/", GROUP_AWS, + 0, ConfigDef.Width.NONE, AWS_S3_PREFIX_CONFIG); + } + + private static void addAwsConfigGroup(final ConfigDef configDef) { + int awsGroupCounter = 0; + + configDef.define(AWS_ACCESS_KEY_ID_CONFIG, ConfigDef.Type.PASSWORD, null, new NonEmptyPassword(), + ConfigDef.Importance.MEDIUM, "AWS Access Key ID", GROUP_AWS, awsGroupCounter++, ConfigDef.Width.NONE, + AWS_ACCESS_KEY_ID_CONFIG); + + configDef.define(AWS_SECRET_ACCESS_KEY_CONFIG, ConfigDef.Type.PASSWORD, null, new NonEmptyPassword(), + ConfigDef.Importance.MEDIUM, "AWS Secret Access Key", GROUP_AWS, awsGroupCounter++, + ConfigDef.Width.NONE, AWS_SECRET_ACCESS_KEY_CONFIG); + + configDef.define(AWS_CREDENTIALS_PROVIDER_CONFIG, ConfigDef.Type.CLASS, AWS_CREDENTIAL_PROVIDER_DEFAULT, + ConfigDef.Importance.MEDIUM, + "When you initialize a new " + "service client without supplying any arguments, " + + "the AWS SDK for Java attempts to find temporary " + + "credentials by using the default credential " + "provider chain implemented by the " + + "DefaultAWSCredentialsProviderChain class.", + + GROUP_AWS, awsGroupCounter++, ConfigDef.Width.NONE, AWS_CREDENTIALS_PROVIDER_CONFIG); + + configDef.define(AWS_S3_BUCKET_NAME_CONFIG, ConfigDef.Type.STRING, null, new BucketNameValidator(), + ConfigDef.Importance.MEDIUM, "AWS S3 Bucket name", GROUP_AWS, awsGroupCounter++, ConfigDef.Width.NONE, + AWS_S3_BUCKET_NAME_CONFIG); + + // AWS S3 Server Side Encryption Algorithm configuration + // Example values: 'AES256' for S3-managed keys, 'aws:kms' for AWS KMS-managed keys + configDef.define(AWS_S3_SSE_ALGORITHM_CONFIG, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, + "AWS S3 Server Side Encryption Algorithm. Example values: 'AES256', 'aws:kms'.", GROUP_AWS, + awsGroupCounter++, ConfigDef.Width.NONE, AWS_S3_SSE_ALGORITHM_CONFIG); + + configDef.define(AWS_S3_ENDPOINT_CONFIG, ConfigDef.Type.STRING, null, new UrlValidator(), + ConfigDef.Importance.LOW, "Explicit AWS S3 Endpoint Address, mainly for testing", GROUP_AWS, + awsGroupCounter++, ConfigDef.Width.NONE, AWS_S3_ENDPOINT_CONFIG); + + configDef.define(AWS_S3_REGION_CONFIG, ConfigDef.Type.STRING, null, new AwsRegionValidator(), + ConfigDef.Importance.MEDIUM, "AWS S3 Region, e.g. us-east-1", GROUP_AWS, awsGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, AWS_S3_REGION_CONFIG); + configDef.define(AWS_S3_REGION, ConfigDef.Type.STRING, null, new AwsRegionValidator(), + ConfigDef.Importance.MEDIUM, "AWS S3 Region, e.g. us-east-1", GROUP_AWS, awsGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, AWS_S3_REGION); + + configDef.define(START_MARKER_KEY, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "AWS S3 Start marker, e.g. 
prefix", GROUP_AWS, awsGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, START_MARKER_KEY); + } + + protected static class AwsRegionValidator implements ConfigDef.Validator { + private static final String SUPPORTED_AWS_REGIONS = Arrays.stream(Regions.values()) + .map(Regions::getName) + .collect(Collectors.joining(", ")); + + @Override + public void ensureValid(final String name, final Object value) { + if (Objects.nonNull(value)) { + final String valueStr = (String) value; + final Region region = RegionUtils.getRegion(valueStr); + if (!RegionUtils.getRegions().contains(region)) { + throw new ConfigException(name, valueStr, "supported values are: " + SUPPORTED_AWS_REGIONS); + } + } + } + } + + private static class BucketNameValidator implements ConfigDef.Validator { + @Override + public void ensureValid(final String name, final Object value) { + try { + if (value != null) { + BucketNameUtils.validateBucketName((String) value); + } + } catch (final IllegalArgumentException e) { + throw new ConfigException("Illegal bucket name: " + e.getMessage()); + } + } } private void validate() { From 495f5ccc735d98294452eb322724f77dd0c3b769 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Sat, 14 Sep 2024 23:38:07 +0200 Subject: [PATCH 10/90] Adding integration test --- .../connect/s3/source/IntegrationBase.java | 3 +- .../connect/s3/source/IntegrationTest.java | 21 ++- .../s3/source/DelimitedRecordReader.java | 131 ++++++------------ .../kafka/connect/s3/source/S3Offset.java | 8 ++ .../s3/source/S3SourceRecordIterator.java | 70 ++++++---- .../kafka/connect/s3/source/S3SourceTask.java | 33 ++++- .../s3/source/config/S3SourceConfig.java | 15 +- 7 files changed, 156 insertions(+), 125 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index 635d0fcd3..6116bcccf 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -93,7 +93,8 @@ static KafkaContainer createKafkaContainer() { } static String topicName(final TestInfo testInfo) { - return testInfo.getTestMethod().get().getName() + "-" + testInfo.getDisplayName().hashCode(); + return "testtopic"; +// return testInfo.getTestMethod().get().getName();// + "-" + testInfo.getDisplayName().hashCode(); } static void createTopics(final AdminClient adminClient, final List<String> topicNames) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index a842acf65..15a8d3022 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -29,17 +29,24 @@ import java.nio.file.Paths; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.ExecutionException; +import org.apache.avro.generic.GenericRecord; import org.apache.kafka.clients.admin.AdminClient; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; 
import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.PutObjectRequest; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.common.TopicPartition; import org.junit.Ignore; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -79,6 +86,8 @@ final class IntegrationTest implements IntegrationBase { private static AmazonS3 s3Client; + private String topicName ; + @BeforeAll static void setUpAll() throws IOException, InterruptedException { s3Prefix = COMMON_PREFIX + ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + "/"; @@ -97,7 +106,7 @@ void setUp(final TestInfo testInfo) throws ExecutionException, InterruptedExcept testBucketAccessor.createBucket(); adminClient = newAdminClient(KAFKA_CONTAINER); - final var topicName = IntegrationBase.topicName(testInfo); + topicName = IntegrationBase.topicName(testInfo); final var topics = List.of(topicName); IntegrationBase.createTopics(adminClient, topics); @@ -125,10 +134,13 @@ void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx // final String testObjectKey = s3Prefix + "test-file.txt"; final String testData = "Hello, Kafka Connect S3 Source!"; - final Path testFilePath = Paths.get("/tmp/test-file.txt"); + String fileName = topicName + "-0-0001.txt"; + + final Path testFilePath = Paths.get("/tmp/" + fileName); +// final Path testFilePath = Paths.get("/tmp/test-file.txt"); Files.write(testFilePath, testData.getBytes(StandardCharsets.UTF_8)); - saveToS3(TEST_BUCKET_NAME, "", "test.txt", testFilePath.toFile()); + saveToS3(TEST_BUCKET_NAME, "", fileName, testFilePath.toFile()); // Verify that the connector is correctly set up assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); @@ -177,6 +189,7 @@ public static void saveToS3(final String bucketName, final String folderName, fi final File fileToWrite) { final PutObjectRequest request = new PutObjectRequest(bucketName, folderName + fileNameInS3, fileToWrite); s3Client.putObject(request); - // assertThat(putObj.getMetadata() + List<String> objects = testBucketAccessor.listObjects(); + assertThat(objects.size()).isEqualTo(1); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java index efbeee68c..6990c351a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java @@ -19,6 +19,7 @@ import static java.util.Optional.ofNullable; import java.io.BufferedInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; @@ -45,75 +46,59 @@ public DelimitedRecordReader(final byte[] valueDelimiter, final Optional<byte[]> this.keyDelimiter = keyDelimiter.map(delimiter -> Arrays.copyOf(delimiter, delimiter.length)); } - public ConsumerRecord<byte[], byte[]> read(final String topic, final int partition, final long offset, - final BufferedInputStream data) throws IOException { + public ConsumerRecord<byte[], byte[]> read(String topic, int partition, long offset, + BufferedInputStream data, String keyData) throws IOException { + 
Optional<byte[]> key = Optional.empty(); - if (keyDelimiter.isPresent()) { - key = ofNullable(readTo(data, keyDelimiter.get())); - if (!key.isPresent()) { - return null; - } + if (keyData != null){ + key = Optional.of(keyData.getBytes()); } - final byte[] value = readTo(data, valueDelimiter); +// Optional<byte[]> key = Optional.empty(); +// if (keyDelimiter.isPresent()) { +// key = Optional.ofNullable(readTo(data, keyDelimiter.get())); +// if (!key.isPresent()) { +// return null; +// } +// } + byte[] value = readTo(data, valueDelimiter); if (value == null) { - if (key.isPresent()) { - throw new IllegalStateException( - "missing value for key!" + new String(key.get(), StandardCharsets.UTF_8)); + if(key.isPresent()) { + throw new IllegalStateException("missing value for key!" + key); } return null; } - return new ConsumerRecord<>(topic, partition, offset, key.orElse(null), value); + return new ConsumerRecord<>( + topic, partition, offset, key.orElse(null), value + ); } // read up to and including the given multi-byte delimeter - private byte[] readTo(final BufferedInputStream data, final byte[] del) throws IOException { - final int lastByte = del[del.length - 1] & 0xff; - byte[] buffer = new byte[1024]; // Buffer for reading data, adjust size as needed - int bufferIndex = 0; // Tracks the current position in the buffer - int readCount; - - while (true) { - readCount = data.read(); - if (readCount == -1) { - // Return null if no bytes were read and EOF is reached - return (bufferIndex == 0) ? null : Arrays.copyOf(buffer, bufferIndex); - } - - // Write the byte to the buffer - if (bufferIndex >= buffer.length) { - // Resize buffer if needed, avoiding frequent resizing - buffer = Arrays.copyOf(buffer, buffer.length * 2); - } - buffer[bufferIndex++] = (byte) readCount; - - // Check for delimiter match - final Optional<byte[]> optionalBytes = getBytes(del, lastByte, buffer, bufferIndex, readCount); - if (optionalBytes.isPresent()) { - return optionalBytes.get(); + private byte[] readTo(BufferedInputStream data, byte[] del) throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + int lastByte = del[del.length - 1] & 0xff; + int b; + while((b = data.read()) != -1) { + baos.write(b); + if (b == lastByte && baos.size() >= del.length) { + byte[] bytes = baos.toByteArray(); + if (endsWith(bytes, del)) { + byte[] undelimited = new byte[bytes.length - del.length]; + System.arraycopy(bytes, 0, undelimited, 0, undelimited.length); + return undelimited; + } } } + // if we got here, we got EOF before we got the delimiter + return (baos.size() == 0) ? 
null : baos.toByteArray(); } - private static Optional<byte[]> getBytes(final byte[] del, final int lastByte, final byte[] buffer, - final int bufferIndex, final int readCount) { - if (readCount == lastByte && bufferIndex >= del.length) { - boolean matches = true; - for (int i = 0; i < del.length; i++) { - if (buffer[bufferIndex - del.length + i] != del[i]) { - matches = false; - break; - } - } - - if (matches) { - // Return undelimited data without creating new objects inside the loop - final byte[] undelimited = new byte[bufferIndex - del.length]; - System.arraycopy(buffer, 0, undelimited, 0, undelimited.length); - return Optional.of(undelimited); + private boolean endsWith(byte[] bytes, byte[] suffix) { + for (int i = 0; i < suffix.length; i++) { + if (bytes[bytes.length - suffix.length + i] != suffix[i]) { + return false; } } - // Return Optional.empty() to signify no match was found - return Optional.empty(); + return true; } private static byte[] delimiterBytes(final String value, final String encoding) { @@ -129,42 +114,4 @@ public static DelimitedRecordReader from(final Map<String, String> taskConfig) { taskConfig.get("key.converter.encoding"))) : Optional.empty()); } - - Iterator<ConsumerRecord<byte[], byte[]>> readAll(final String topic, final int partition, - final InputStream inputStream, final long startOffset) { - return new Iterator<ConsumerRecord<byte[], byte[]>>() { - Optional<ConsumerRecord<byte[], byte[]>> nextConsumerRecord; - - final BufferedInputStream buffered = new BufferedInputStream(inputStream); - - long offset = startOffset; - - @Override - public boolean hasNext() { - try { - if (nextConsumerRecord.isPresent()) { - nextConsumerRecord = ofNullable(read(topic, partition, offset++, buffered)); - } - } catch (IOException e) { - throw new DataException(e); - } - return nextConsumerRecord.isPresent(); - } - - @Override - public ConsumerRecord<byte[], byte[]> next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - final ConsumerRecord<byte[], byte[]> record = this.nextConsumerRecord.get(); - nextConsumerRecord = Optional.empty(); - return record; - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - }; - } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java index bbb8615ae..fdb0c41bf 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java @@ -16,6 +16,7 @@ package io.aiven.kafka.connect.s3.source; +import java.util.HashMap; import java.util.Map; import java.util.Objects; @@ -49,6 +50,13 @@ public int compareTo(final S3Offset s3Offset) { return compareTo == 0 ? 
(int) (offset - s3Offset.offset) : compareTo; } + public Map<String, ?> asMap() { + final Map<String, Object> map = new HashMap<>(); + map.put("s3key", s3key); + map.put("originalOffset", offset); + return map; + } + // Overriding equals to ensure consistency with compareTo @Override public boolean equals(final Object obj) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java index 3e68455cd..634821171 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java @@ -19,6 +19,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.START_MARKER_KEY; +import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; @@ -63,13 +64,12 @@ public final class S3SourceRecordIterator implements Iterator<S3SourceRecord> { private final String s3Prefix; private final AmazonS3 s3Client; - public S3SourceRecordIterator(final S3SourceConfig s3SourceConfig, final String bucketName, final String s3Prefix, + public S3SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, final String s3Prefix, final Map<S3Partition, S3Offset> offsets, final DelimitedRecordReader recordReader) { this.s3SourceConfig = s3SourceConfig; this.offsets = Optional.ofNullable(offsets).orElseGet(HashMap::new); this.makeReader = recordReader; - final S3ClientFactory s3ClientFactory = new S3ClientFactory(); - this.s3Client = s3ClientFactory.createAmazonS3Client(s3SourceConfig); + this.s3Client = s3Client; this.bucketName = bucketName; this.s3Prefix = s3Prefix; try { @@ -82,8 +82,8 @@ public S3SourceRecordIterator(final S3SourceConfig s3SourceConfig, final String private List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOException { final ObjectListing objectListing = s3Client.listObjects(new ListObjectsRequest().withBucketName(bucketName) - .withPrefix(s3Prefix) - .withMarker(s3SourceConfig.getString(START_MARKER_KEY)) +// .withPrefix(s3Prefix) +// .withMarker(s3SourceConfig.getString(START_MARKER_KEY)) .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * 2)); return new ArrayList<>(objectListing.getObjectSummaries()); @@ -106,18 +106,55 @@ private void nextObject() { private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() throws IOException { final S3Object s3Object = s3Client.getObject(bucketName, currentKey); - try (InputStream content = getContent(s3Object)) { - return parseKey(currentKey, - (topic, partition, startOffset) -> makeReader.readAll(topic, partition, content, startOffset)); + try (InputStream content = getContent(s3Object); + BufferedInputStream bufferedContent = new BufferedInputStream(content)) { + + // Extract the topic, partition, and startOffset from the key +// Matcher matcher = DEFAULT_PATTERN.matcher(currentKey); +// if (!matcher.find()) { +// throw new IllegalArgumentException("Invalid file key format: " + currentKey); +// } + final String topic = "testtopic";//matcher.group("topic"); + final int partition = 0;//Integer.parseInt(matcher.group("partition")); + final long startOffset = 0l;//Long.parseLong(matcher.group("offset")); + + return new Iterator<>() { + 
private ConsumerRecord<byte[], byte[]> nextRecord = readNext(); + + private ConsumerRecord<byte[], byte[]> readNext() { + try { + return makeReader.read(topic, partition, startOffset, bufferedContent, currentKey); + } catch (IOException e) { + throw new RuntimeException("Failed to read record from file", e); + } + } + + @Override + public boolean hasNext() { + // Check if there's another record + return nextRecord != null; + } + + @Override + public ConsumerRecord<byte[], byte[]> next() { + if (nextRecord == null) { + throw new NoSuchElementException(); + } + ConsumerRecord<byte[], byte[]> currentRecord = nextRecord; + nextRecord = null; + return currentRecord; + } + }; } } + private InputStream getContent(final S3Object object) throws IOException { return object.getObjectContent(); } private S3Offset offset() { - return offsets.get(S3Partition.from(bucketName, s3Prefix, "", 0)); + return offsets.get(S3Partition.from(bucketName, s3Prefix, "testtopic", 0)); } @Override @@ -144,19 +181,4 @@ public void remove() { throw new UnsupportedOperationException(); } - private <T> T parseKey(final String key, final KeyConsumer<T> consumer) throws IOException { - final Matcher matcher = DEFAULT_PATTERN.matcher(key); - if (!matcher.find()) { - throw new IllegalArgumentException("Not a valid chunk filename! " + key); - } - final String topic = matcher.group("topic"); - final int partition = Integer.parseInt(matcher.group("partition")); - final long startOffset = Long.parseLong(matcher.group("offset")); - - return consumer.consume(topic, partition, startOffset); - } -} - -interface KeyConsumer<T> { - T consume(String topic, int partition, long startOffset) throws IOException; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 32b8df68d..327116d49 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -35,12 +35,16 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; +import com.amazonaws.services.s3.AmazonS3; +import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; +import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.source.SourceTask; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import com.amazonaws.services.s3.model.AmazonS3Exception; +import org.apache.kafka.connect.storage.Converter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -58,12 +62,18 @@ public class S3SourceTask extends SourceTask { Iterator<S3SourceRecord> sourceRecordIterator; + private Optional<Converter> keyConverter; + private Converter valueConverter; + private final AtomicBoolean stopped = new AtomicBoolean(); private final static long S_3_POLL_INTERVAL_MS = 10_000L; private final static long ERROR_BACKOFF = 1000L; + final S3ClientFactory s3ClientFactory = new S3ClientFactory(); + private AmazonS3 s3Client; + @SuppressWarnings("PMD.UnnecessaryConstructor") // required by Connect public S3SourceTask() { super(); @@ -80,6 +90,18 @@ public void start(final Map<String, String> props) { Objects.requireNonNull(props, "props hasn't been set"); s3SourceConfig = new S3SourceConfig(props); + try { + keyConverter = Optional.of((Converter) s3SourceConfig.getClass("key.converter").newInstance()); + valueConverter = 
(Converter) s3SourceConfig.getClass("value.converter").newInstance(); + } catch (InstantiationException | IllegalAccessException e) { + throw new RuntimeException(e); + }; +// keyConverter = Optional.ofNullable(Configure.buildConverter(taskConfig, "key.converter", true, null)); +// valueConverter = Configure.buildConverter(taskConfig, "value.converter", false, AlreadyBytesConverter.class); + + + this.s3Client = s3ClientFactory.createAmazonS3Client(s3SourceConfig); + LOGGER.info("S3 client initialized "); prepareReaderFromOffsetStorageReader(); } @@ -130,7 +152,7 @@ private void prepareReaderFromOffsetStorageReader() { .map(Object::toString) .map(s -> s.getBytes(parseEncoding(s3SourceConfig, "key.encoding"))); - sourceRecordIterator = new S3SourceRecordIterator(s3SourceConfig, s3Bucket, s3Prefix, offsets, + sourceRecordIterator = new S3SourceRecordIterator(s3SourceConfig, s3Client, s3Bucket, s3Prefix, offsets, new DelimitedRecordReader(valueDelimiter, keyDelimiter)); } @@ -202,9 +224,16 @@ private List<SourceRecord> getSourceRecords(final List<SourceRecord> results) th && !stopped.get(); i++) { final S3SourceRecord record = sourceRecordIterator.next(); LOGGER.info(record.offset() + record.getToTopic() + record.partition()); + String topic = "testtopic"; + Optional<SchemaAndValue> key = keyConverter.map(c -> c.toConnectData(topic, record.key())); + SchemaAndValue value = valueConverter.toConnectData(topic, record.value()); + results.add(new SourceRecord(record.file().asMap(), record.offset().asMap(), topic, + record.partition(), + key.map(SchemaAndValue::schema).orElse(null), key.map(SchemaAndValue::value).orElse(null), + value.schema(), value.value())); } - LOGGER.debug("{} returning {} records.", s3SourceConfig.getString("name"), results.size()); + LOGGER.debug("{} records.", results.size()); return results; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index a8ea2c525..9c941a227 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -73,6 +73,7 @@ final public class S3SourceConfig extends AbstractConfig { private static final String GROUP_AWS = "AWS"; private static final String GROUP_AWS_STS = "AWS_STS"; + private static final String GROUP_OTHER = "OTHER_CFG"; private static final String GROUP_OFFSET_TOPIC = "OFFSET_TOPIC"; @@ -112,6 +113,8 @@ final public class S3SourceConfig extends AbstractConfig { public static final String MAX_POLL_RECORDS = "max.poll.records"; + public static final String KEY_CONVERTER = "key.converter"; + public static final String VALUE_CONVERTER = "value.converter"; public static final String KEY_DELIMITER = "key.delimiter"; public static final String KEY_ENCODING = "key.encoding"; @@ -174,13 +177,21 @@ public static ConfigDef configDef() { private static void addOtherConfig(final S3SourceConfigDef configDef) { int awsOtherGroupCounter = 0; configDef.define(FETCH_PAGE_SIZE, ConfigDef.Type.INT, 10, ConfigDef.Range.atLeast(1), - ConfigDef.Importance.MEDIUM, "Fetch page size", GROUP_AWS_STS, awsOtherGroupCounter++, // NOPMD + ConfigDef.Importance.MEDIUM, "Fetch page size", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD // UnusedAssignment ConfigDef.Width.NONE, FETCH_PAGE_SIZE); configDef.define(MAX_POLL_RECORDS, ConfigDef.Type.INT, 500, 
ConfigDef.Range.atLeast(1), - ConfigDef.Importance.MEDIUM, "Max poll records", GROUP_AWS_STS, awsOtherGroupCounter++, // NOPMD + ConfigDef.Importance.MEDIUM, "Max poll records", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD // UnusedAssignment ConfigDef.Width.NONE, MAX_POLL_RECORDS); + configDef.define(KEY_CONVERTER, ConfigDef.Type.CLASS, "org.apache.kafka.connect.converters.ByteArrayConverter", + ConfigDef.Importance.MEDIUM, "Key converter", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, KEY_CONVERTER); + configDef.define(VALUE_CONVERTER, ConfigDef.Type.CLASS, "org.apache.kafka.connect.converters.ByteArrayConverter", + ConfigDef.Importance.MEDIUM, "Value converter", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, VALUE_CONVERTER); } private static void addAwsStsConfigGroup(final ConfigDef configDef) { From a378fb448cd50cb4acc530b392058dab46e1d1ee Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Sun, 22 Sep 2024 20:08:39 +0200 Subject: [PATCH 11/90] Deleted unused package --- .../connect/s3/source/IntegrationBase.java | 2 +- .../connect/s3/source/IntegrationTest.java | 11 +- .../connect/source/s3/ConnectRunner.java | 119 ------------------ .../connect/source/s3/IntegrationBase.java | 94 -------------- .../connect/source/s3/IntegrationTest.java | 107 ---------------- .../s3/source/DelimitedRecordReader.java | 32 ++--- .../s3/source/S3SourceRecordIterator.java | 28 ++--- .../kafka/connect/s3/source/S3SourceTask.java | 20 ++- .../s3/source/config/S3SourceConfig.java | 19 +-- .../AivenKafkaConnectS3SourceConnector.java | 69 ---------- .../kafka/connect/source/s3/S3SourceTask.java | 62 --------- .../kafka/connect/source/s3/Version.java | 43 ------- .../source/s3/config/S3SourceConfig.java | 68 ---------- .../source/s3/config/S3SourceConfigDef.java | 30 ----- 14 files changed, 47 insertions(+), 657 deletions(-) delete mode 100644 s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java delete mode 100644 s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java delete mode 100644 s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index 6116bcccf..a1cdb5ce2 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -94,7 +94,7 @@ static KafkaContainer createKafkaContainer() { static String topicName(final TestInfo testInfo) { return "testtopic"; -// return testInfo.getTestMethod().get().getName();// + "-" + 
testInfo.getDisplayName().hashCode(); + // return testInfo.getTestMethod().get().getName();// + "-" + testInfo.getDisplayName().hashCode(); } static void createTopics(final AdminClient adminClient, final List<String> topicNames) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 15a8d3022..a9d16815f 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -29,24 +29,17 @@ import java.nio.file.Paths; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; -import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.ExecutionException; -import org.apache.avro.generic.GenericRecord; import org.apache.kafka.clients.admin.AdminClient; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.PutObjectRequest; -import org.apache.kafka.clients.consumer.ConsumerConfig; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.clients.producer.KafkaProducer; -import org.apache.kafka.clients.producer.ProducerConfig; -import org.apache.kafka.common.TopicPartition; import org.junit.Ignore; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -86,7 +79,7 @@ final class IntegrationTest implements IntegrationBase { private static AmazonS3 s3Client; - private String topicName ; + private String topicName; @BeforeAll static void setUpAll() throws IOException, InterruptedException { @@ -137,7 +130,7 @@ void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx String fileName = topicName + "-0-0001.txt"; final Path testFilePath = Paths.get("/tmp/" + fileName); -// final Path testFilePath = Paths.get("/tmp/test-file.txt"); + // final Path testFilePath = Paths.get("/tmp/test-file.txt"); Files.write(testFilePath, testData.getBytes(StandardCharsets.UTF_8)); saveToS3(TEST_BUCKET_NAME, "", fileName, testFilePath.toFile()); diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java deleted file mode 100644 index 5aab1c99f..000000000 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/ConnectRunner.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.aiven.kafka.connect.source.s3; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.ExecutionException; - -import org.apache.kafka.common.utils.Time; -import org.apache.kafka.connect.runtime.Connect; -import org.apache.kafka.connect.runtime.ConnectorConfig; -import org.apache.kafka.connect.runtime.Herder; -import org.apache.kafka.connect.runtime.Worker; -import org.apache.kafka.connect.runtime.isolation.Plugins; -import org.apache.kafka.connect.runtime.rest.RestServer; -import org.apache.kafka.connect.runtime.rest.entities.ConnectorInfo; -import org.apache.kafka.connect.runtime.standalone.StandaloneConfig; -import org.apache.kafka.connect.runtime.standalone.StandaloneHerder; -import org.apache.kafka.connect.storage.MemoryOffsetBackingStore; -import org.apache.kafka.connect.util.FutureCallback; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -final class ConnectRunner { - private static final Logger LOGGER = LoggerFactory.getLogger(ConnectRunner.class); - - private final File pluginDir; - private final String bootstrapServers; - private final int offsetFlushInterval; - - private Herder herder; - private Connect connect; - - public ConnectRunner(final File pluginDir, final String bootstrapServers, final int offsetFlushIntervalMs) { - this.pluginDir = pluginDir; - this.bootstrapServers = bootstrapServers; - this.offsetFlushInterval = offsetFlushIntervalMs; - } - - void start() { - final Map<String, String> workerProps = new HashMap<>(); - workerProps.put("bootstrap.servers", bootstrapServers); - - workerProps.put("offset.flush.interval.ms", Integer.toString(offsetFlushInterval)); - - // These don't matter much (each connector sets its own converters), but need to be filled with valid classes. - workerProps.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); - workerProps.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); - workerProps.put("internal.key.converter", "org.apache.kafka.connect.json.JsonConverter"); - workerProps.put("internal.key.converter.schemas.enable", "false"); - workerProps.put("internal.value.converter", "org.apache.kafka.connect.json.JsonConverter"); - workerProps.put("internal.value.converter.schemas.enable", "false"); - - // Don't need it since we'll memory MemoryOffsetBackingStore. 
- workerProps.put("offset.storage.file.filename", ""); - - workerProps.put("plugin.path", pluginDir.getPath()); - - final Time time = Time.SYSTEM; - final String workerId = "test-worker"; - final String kafkaClusterId = "test-cluster"; - - final Plugins plugins = new Plugins(workerProps); - final StandaloneConfig config = new StandaloneConfig(workerProps); - - final Worker worker = new Worker(workerId, time, plugins, config, new MemoryOffsetBackingStore()); - herder = new StandaloneHerder(worker, kafkaClusterId); - - final RestServer rest = new RestServer(config); - - connect = new Connect(herder, rest); - - connect.start(); - } - - void createConnector(final Map<String, String> config) throws ExecutionException, InterruptedException { - assert herder != null; - - final FutureCallback<Herder.Created<ConnectorInfo>> callback = new FutureCallback<>((error, info) -> { - if (error != null) { - LOGGER.error("Failed to create job"); - } else { - LOGGER.info("Created connector {}", info.result().name()); - } - }); - herder.putConnectorConfig(config.get(ConnectorConfig.NAME_CONFIG), config, false, callback); - - final Herder.Created<ConnectorInfo> connectorInfoCreated = callback.get(); - assert connectorInfoCreated.created(); - assertThat(connectorInfoCreated.result().config().get("connector.class")) - .isEqualTo(AivenKafkaConnectS3SourceConnector.class.getName()); - } - - void stop() { - connect.stop(); - } - - void awaitStop() { - connect.awaitStop(); - } -} diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java deleted file mode 100644 index be21ec8f1..000000000 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationBase.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.aiven.kafka.connect.source.s3; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.time.Duration; -import java.util.List; -import java.util.Properties; -import java.util.concurrent.ExecutionException; -import java.util.stream.Collectors; - -import org.apache.kafka.clients.admin.AdminClient; -import org.apache.kafka.clients.admin.AdminClientConfig; -import org.apache.kafka.clients.admin.NewTopic; - -import com.github.dockerjava.api.model.Ulimit; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.TestInfo; -import org.testcontainers.containers.Container; -import org.testcontainers.containers.KafkaContainer; -import org.testcontainers.containers.Network; -import org.testcontainers.utility.DockerImageName; - -public interface IntegrationBase { - - String DOCKER_IMAGE_KAFKA = "confluentinc/cp-kafka:7.7.0"; - - default AdminClient newAdminClient(final KafkaContainer kafka) { - final Properties adminClientConfig = new Properties(); - adminClientConfig.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, kafka.getBootstrapServers()); - return AdminClient.create(adminClientConfig); - } - - default ConnectRunner newConnectRunner(final KafkaContainer kafka, final File pluginDir, - final int offsetFlushIntervalMs) { - return new ConnectRunner(pluginDir, kafka.getBootstrapServers(), offsetFlushIntervalMs); - } - - static void extractConnectorPlugin(File pluginDir) throws IOException, InterruptedException { - final File distFile = new File(System.getProperty("integration-test.distribution.file.path")); - assert distFile.exists(); - - final String cmd = String.format("tar -xf %s --strip-components=1 -C %s", distFile, pluginDir.toString()); - final Process process = Runtime.getRuntime().exec(cmd); - assert process.waitFor() == 0; - } - - static File getPluginDir() throws IOException { - final File testDir = Files.createTempDirectory("s3-source-connector-for-apache-kafka-test-").toFile(); - - final File pluginDir = new File(testDir, "plugins/s3-source-connector-for-apache-kafka/"); - assert pluginDir.mkdirs(); - return pluginDir; - } - - static KafkaContainer createKafkaContainer() { - return new KafkaContainer(DockerImageName.parse(DOCKER_IMAGE_KAFKA)) - .withEnv("KAFKA_AUTO_CREATE_TOPICS_ENABLE", "false") - .withNetwork(Network.newNetwork()) - .withExposedPorts(KafkaContainer.KAFKA_PORT, 9092) - .withCreateContainerCmdModifier( - cmd -> cmd.getHostConfig().withUlimits(List.of(new Ulimit("nofile", 30_000L, 30_000L)))); - } - - static String topicName(final TestInfo testInfo) { - return testInfo.getTestMethod().get().getName() + "-" + testInfo.getDisplayName().hashCode(); - } - - static void createTopics(final AdminClient adminClient, final List<String> topicNames) - throws ExecutionException, InterruptedException { - final var newTopics = topicNames.stream().map(s -> new NewTopic(s, 4, (short) 1)).collect(Collectors.toList()); - adminClient.createTopics(newTopics).all().get(); - } - - static void waitForRunningContainer(final Container<?> kafka) { - Awaitility.await().atMost(Duration.ofMinutes(1)).until(kafka::isRunning); - } -} diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java deleted file mode 100644 index 921f97715..000000000 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/source/s3/IntegrationTest.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * 
Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.source.s3; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ExecutionException; - -import org.apache.kafka.clients.admin.AdminClient; - -import org.junit.Ignore; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.TestInfo; -import org.testcontainers.containers.KafkaContainer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -@Ignore -@Testcontainers -final class IntegrationTest implements IntegrationBase { - private static final String CONNECTOR_NAME = "aiven-s3-source-connector"; - private static final int OFFSET_FLUSH_INTERVAL_MS = 5000; - - private static File pluginDir; - - @Container - private static final KafkaContainer KAFKA = IntegrationBase.createKafkaContainer(); - private AdminClient adminClient; - private ConnectRunner connectRunner; - - @BeforeAll - static void setUpAll() throws IOException, InterruptedException { - pluginDir = IntegrationBase.getPluginDir(); - IntegrationBase.extractConnectorPlugin(pluginDir); - IntegrationBase.waitForRunningContainer(KAFKA); - } - - @BeforeEach - void setUp(final TestInfo testInfo) throws ExecutionException, InterruptedException { - adminClient = newAdminClient(KAFKA); - - final var topicName = IntegrationBase.topicName(testInfo); - final var topics = List.of(topicName); - IntegrationBase.createTopics(adminClient, topics); - - connectRunner = newConnectRunner(KAFKA, pluginDir, OFFSET_FLUSH_INTERVAL_MS); - connectRunner.start(); - } - - @AfterEach - void tearDown() { - connectRunner.stop(); - adminClient.close(); - - connectRunner.awaitStop(); - } - - @Test - void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedException { - final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectRunner.createConnector(connectorConfig); - - assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); - } - - private Map<String, String> getConfig(final Map<String, String> config, final String topicName) { - return getConfig(config, List.of(topicName)); - } - - private Map<String, String> getConfig(final Map<String, String> config, final List<String> topicNames) { - config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); - config.put("topics", String.join(",", topicNames)); - return config; - } - - private Map<String, String> basicConnectorConfig(final String connectorName) { - final Map<String, String> config = new HashMap<>(); - config.put("name", connectorName); - config.put("key.converter", 
"org.apache.kafka.connect.converters.ByteArrayConverter"); - config.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); - config.put("tasks.max", "1"); - return config; - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java index 6990c351a..7082305ab 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java @@ -21,17 +21,13 @@ import java.io.BufferedInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.io.InputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.Arrays; -import java.util.Iterator; import java.util.Map; -import java.util.NoSuchElementException; import java.util.Optional; import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.connect.errors.DataException; /** * Reads records that are followed by byte delimiters. @@ -46,30 +42,28 @@ public DelimitedRecordReader(final byte[] valueDelimiter, final Optional<byte[]> this.keyDelimiter = keyDelimiter.map(delimiter -> Arrays.copyOf(delimiter, delimiter.length)); } - public ConsumerRecord<byte[], byte[]> read(String topic, int partition, long offset, - BufferedInputStream data, String keyData) throws IOException { + public ConsumerRecord<byte[], byte[]> read(String topic, int partition, long offset, BufferedInputStream data, + String keyData) throws IOException { Optional<byte[]> key = Optional.empty(); - if (keyData != null){ + if (keyData != null) { key = Optional.of(keyData.getBytes()); } -// Optional<byte[]> key = Optional.empty(); -// if (keyDelimiter.isPresent()) { -// key = Optional.ofNullable(readTo(data, keyDelimiter.get())); -// if (!key.isPresent()) { -// return null; -// } -// } + // Optional<byte[]> key = Optional.empty(); + // if (keyDelimiter.isPresent()) { + // key = Optional.ofNullable(readTo(data, keyDelimiter.get())); + // if (!key.isPresent()) { + // return null; + // } + // } byte[] value = readTo(data, valueDelimiter); if (value == null) { - if(key.isPresent()) { + if (key.isPresent()) { throw new IllegalStateException("missing value for key!" 
+ key); } return null; } - return new ConsumerRecord<>( - topic, partition, offset, key.orElse(null), value - ); + return new ConsumerRecord<>(topic, partition, offset, key.orElse(null), value); } // read up to and including the given multi-byte delimeter @@ -77,7 +71,7 @@ private byte[] readTo(BufferedInputStream data, byte[] del) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); int lastByte = del[del.length - 1] & 0xff; int b; - while((b = data.read()) != -1) { + while ((b = data.read()) != -1) { baos.write(b); if (b == lastByte && baos.size() >= del.length) { byte[] bytes = baos.toByteArray(); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java index 634821171..09e18a955 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java @@ -17,7 +17,6 @@ package io.aiven.kafka.connect.s3.source; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.START_MARKER_KEY; import java.io.BufferedInputStream; import java.io.IOException; @@ -30,12 +29,10 @@ import java.util.Map; import java.util.NoSuchElementException; import java.util.Optional; -import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.kafka.clients.consumer.ConsumerRecord; -import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import com.amazonaws.AmazonClientException; @@ -64,8 +61,8 @@ public final class S3SourceRecordIterator implements Iterator<S3SourceRecord> { private final String s3Prefix; private final AmazonS3 s3Client; - public S3SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, final String s3Prefix, - final Map<S3Partition, S3Offset> offsets, final DelimitedRecordReader recordReader) { + public S3SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, + final String s3Prefix, final Map<S3Partition, S3Offset> offsets, final DelimitedRecordReader recordReader) { this.s3SourceConfig = s3SourceConfig; this.offsets = Optional.ofNullable(offsets).orElseGet(HashMap::new); this.makeReader = recordReader; @@ -82,8 +79,8 @@ public S3SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS private List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOException { final ObjectListing objectListing = s3Client.listObjects(new ListObjectsRequest().withBucketName(bucketName) -// .withPrefix(s3Prefix) -// .withMarker(s3SourceConfig.getString(START_MARKER_KEY)) + // .withPrefix(s3Prefix) + // .withMarker(s3SourceConfig.getString(START_MARKER_KEY)) .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * 2)); return new ArrayList<>(objectListing.getObjectSummaries()); @@ -107,16 +104,16 @@ private void nextObject() { private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() throws IOException { final S3Object s3Object = s3Client.getObject(bucketName, currentKey); try (InputStream content = getContent(s3Object); - BufferedInputStream bufferedContent = new BufferedInputStream(content)) { + BufferedInputStream bufferedContent = new 
BufferedInputStream(content)) { // Extract the topic, partition, and startOffset from the key -// Matcher matcher = DEFAULT_PATTERN.matcher(currentKey); -// if (!matcher.find()) { -// throw new IllegalArgumentException("Invalid file key format: " + currentKey); -// } - final String topic = "testtopic";//matcher.group("topic"); - final int partition = 0;//Integer.parseInt(matcher.group("partition")); - final long startOffset = 0l;//Long.parseLong(matcher.group("offset")); + // Matcher matcher = DEFAULT_PATTERN.matcher(currentKey); + // if (!matcher.find()) { + // throw new IllegalArgumentException("Invalid file key format: " + currentKey); + // } + final String topic = "testtopic";// matcher.group("topic"); + final int partition = 0;// Integer.parseInt(matcher.group("partition")); + final long startOffset = 0l;// Long.parseLong(matcher.group("offset")); return new Iterator<>() { private ConsumerRecord<byte[], byte[]> nextRecord = readNext(); @@ -148,7 +145,6 @@ public ConsumerRecord<byte[], byte[]> next() { } } - private InputStream getContent(final S3Object object) throws IOException { return object.getObjectContent(); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 327116d49..b3418cd62 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -35,16 +35,16 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; -import com.amazonaws.services.s3.AmazonS3; -import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.source.SourceTask; +import org.apache.kafka.connect.storage.Converter; +import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.AmazonS3Exception; -import org.apache.kafka.connect.storage.Converter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -95,10 +95,9 @@ public void start(final Map<String, String> props) { valueConverter = (Converter) s3SourceConfig.getClass("value.converter").newInstance(); } catch (InstantiationException | IllegalAccessException e) { throw new RuntimeException(e); - }; -// keyConverter = Optional.ofNullable(Configure.buildConverter(taskConfig, "key.converter", true, null)); -// valueConverter = Configure.buildConverter(taskConfig, "value.converter", false, AlreadyBytesConverter.class); - + } ; + // keyConverter = Optional.ofNullable(Configure.buildConverter(taskConfig, "key.converter", true, null)); + // valueConverter = Configure.buildConverter(taskConfig, "value.converter", false, AlreadyBytesConverter.class); this.s3Client = s3ClientFactory.createAmazonS3Client(s3SourceConfig); @@ -227,10 +226,9 @@ private List<SourceRecord> getSourceRecords(final List<SourceRecord> results) th String topic = "testtopic"; Optional<SchemaAndValue> key = keyConverter.map(c -> c.toConnectData(topic, record.key())); SchemaAndValue value = valueConverter.toConnectData(topic, record.value()); - results.add(new SourceRecord(record.file().asMap(), record.offset().asMap(), topic, - record.partition(), - key.map(SchemaAndValue::schema).orElse(null), 
key.map(SchemaAndValue::value).orElse(null), - value.schema(), value.value())); + results.add(new SourceRecord(record.file().asMap(), record.offset().asMap(), topic, record.partition(), + key.map(SchemaAndValue::schema).orElse(null), key.map(SchemaAndValue::value).orElse(null), + value.schema(), value.value())); } LOGGER.debug("{} records.", results.size()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 9c941a227..37fe8406d 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -178,20 +178,21 @@ private static void addOtherConfig(final S3SourceConfigDef configDef) { int awsOtherGroupCounter = 0; configDef.define(FETCH_PAGE_SIZE, ConfigDef.Type.INT, 10, ConfigDef.Range.atLeast(1), ConfigDef.Importance.MEDIUM, "Fetch page size", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD - // UnusedAssignment + // UnusedAssignment ConfigDef.Width.NONE, FETCH_PAGE_SIZE); configDef.define(MAX_POLL_RECORDS, ConfigDef.Type.INT, 500, ConfigDef.Range.atLeast(1), ConfigDef.Importance.MEDIUM, "Max poll records", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD - // UnusedAssignment + // UnusedAssignment ConfigDef.Width.NONE, MAX_POLL_RECORDS); configDef.define(KEY_CONVERTER, ConfigDef.Type.CLASS, "org.apache.kafka.connect.converters.ByteArrayConverter", - ConfigDef.Importance.MEDIUM, "Key converter", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, KEY_CONVERTER); - configDef.define(VALUE_CONVERTER, ConfigDef.Type.CLASS, "org.apache.kafka.connect.converters.ByteArrayConverter", - ConfigDef.Importance.MEDIUM, "Value converter", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, VALUE_CONVERTER); + ConfigDef.Importance.MEDIUM, "Key converter", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, KEY_CONVERTER); + configDef.define(VALUE_CONVERTER, ConfigDef.Type.CLASS, + "org.apache.kafka.connect.converters.ByteArrayConverter", ConfigDef.Importance.MEDIUM, + "Value converter", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, VALUE_CONVERTER); } private static void addAwsStsConfigGroup(final ConfigDef configDef) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java deleted file mode 100644 index 16488e130..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/AivenKafkaConnectS3SourceConnector.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.aiven.kafka.connect.source.s3; - -import java.util.Collections; -import java.util.List; -import java.util.Map; - -import org.apache.kafka.common.config.ConfigDef; -import org.apache.kafka.connect.connector.Task; -import org.apache.kafka.connect.source.SourceConnector; - -import io.aiven.kafka.connect.source.s3.config.S3SourceConfig; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * AivenKafkaConnectS3SourceConnector is a Kafka Connect Connector implementation that watches a S3 bucket and generates - * tasks to ingest contents. - */ -public class AivenKafkaConnectS3SourceConnector extends SourceConnector { - - private static final Logger LOGGER = LoggerFactory.getLogger(AivenKafkaConnectS3SourceConnector.class); - - @Override - public ConfigDef config() { - return S3SourceConfig.configDef(); - } - - @Override - public String version() { - return Version.VERSION; - } - - @Override - public Class<? extends Task> taskClass() { - return S3SourceTask.class; - } - - @Override - public List<Map<String, String>> taskConfigs(final int maxTasks) { - return Collections.emptyList(); - } - - @Override - public void start(final Map<String, String> properties) { - LOGGER.info("Start S3 Source connector"); - } - - @Override - public void stop() { - LOGGER.info("Stop S3 Source connector"); - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java deleted file mode 100644 index 704579fba..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/S3SourceTask.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.source.s3; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Objects; - -import org.apache.kafka.connect.source.SourceRecord; -import org.apache.kafka.connect.source.SourceTask; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * S3SourceTask is a Kafka Connect SourceTask implementation that reads from source-s3 buckets and generates Kafka - * Connect records. 
- */ -public class S3SourceTask extends SourceTask { - - private static final Logger LOGGER = LoggerFactory.getLogger(AivenKafkaConnectS3SourceConnector.class); - - @SuppressWarnings("PMD.UnnecessaryConstructor") // required by Connect - public S3SourceTask() { - super(); - } - - @Override - public String version() { - return null; - } - - @Override - public void start(final Map<String, String> props) { - LOGGER.info("S3 Source task started."); - Objects.requireNonNull(props, "props hasn't been set"); - } - - @Override - public List<SourceRecord> poll() { - return Collections.emptyList(); - } - - @Override - public void stop() { - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java deleted file mode 100644 index b5e5cdc85..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/Version.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 2020 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.source.s3; - -import java.io.InputStream; -import java.util.Properties; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -final class Version { - private static final Logger LOGGER = LoggerFactory.getLogger(Version.class); - - private static final String PROPERTIES_FILENAME = "s3-source-connector-for-apache-kafka-version.properties"; - - static final String VERSION; // NOPMD AvoidFieldNameMatchingTypeName - - static { - final Properties props = new Properties(); - try (InputStream resourceStream = Thread.currentThread() - .getContextClassLoader() - .getResourceAsStream(PROPERTIES_FILENAME)) { - props.load(resourceStream); - } catch (final Exception e) { // NOPMD AvoidCatchingGenericException - LOGGER.warn("Error while loading {}: {}", PROPERTIES_FILENAME, e.getMessage()); - } - VERSION = props.getProperty("version", "unknown").trim(); - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java deleted file mode 100644 index a77caeec4..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfig.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.aiven.kafka.connect.source.s3.config; - -import java.util.Collections; -import java.util.List; -import java.util.Map; - -import org.apache.kafka.common.config.ConfigDef; - -import io.aiven.kafka.connect.common.config.AivenCommonConfig; -import io.aiven.kafka.connect.common.config.CompressionType; -import io.aiven.kafka.connect.common.config.OutputField; -import io.aiven.kafka.connect.common.config.OutputFieldEncodingType; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -@SuppressWarnings({ "PMD.TooManyMethods", "PMD.GodClass", "PMD.ExcessiveImports" }) -final public class S3SourceConfig extends AivenCommonConfig { - - public static final Logger LOGGER = LoggerFactory.getLogger(S3SourceConfig.class); - - public S3SourceConfig(final Map<String, String> properties) { - super(configDef(), preprocessProperties(properties)); - validate(); - } - - static Map<String, String> preprocessProperties(final Map<String, String> properties) { - return Collections.emptyMap(); - } - - public static ConfigDef configDef() { - return new S3SourceConfigDef(); - } - - private void validate() { - LOGGER.debug("Validating config."); - } - @Override - public CompressionType getCompressionType() { - return CompressionType.GZIP; - } - - @Override - public List<OutputField> getOutputFields() { - return Collections.emptyList(); - } - - @Override - public OutputFieldEncodingType getOutputFieldEncodingType() { - return null; - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java deleted file mode 100644 index 12fa37d77..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/source/s3/config/S3SourceConfigDef.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.aiven.kafka.connect.source.s3.config; - -import java.util.List; -import java.util.Map; - -import org.apache.kafka.common.config.ConfigDef; -import org.apache.kafka.common.config.ConfigValue; - -public class S3SourceConfigDef extends ConfigDef { - @Override - public List<ConfigValue> validate(final Map<String, String> props) { - return super.validate(S3SourceConfig.preprocessProperties(props)); - } -} From e408c1937ef43c6b14c26884552a8617d1d6b545 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 23 Sep 2024 09:07:05 +0200 Subject: [PATCH 12/90] byte array reader --- .../connect/s3/source/IntegrationTest.java | 33 ++++ .../s3/source/DelimitedRecordReader.java | 111 ----------- .../s3/source/S3SourceRecordIterator.java | 27 ++- .../kafka/connect/s3/source/S3SourceTask.java | 13 +- .../s3/source/testutils/S3OutputStream.java | 181 ++++++++++++++++++ 5 files changed, 234 insertions(+), 131 deletions(-) delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index a9d16815f..d9b189bdd 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -23,6 +23,7 @@ import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; @@ -34,6 +35,8 @@ import java.util.Map; import java.util.concurrent.ExecutionException; +import io.aiven.kafka.connect.s3.source.testutils.S3OutputStream; +import org.apache.commons.io.IOUtils; import org.apache.kafka.clients.admin.AdminClient; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; @@ -46,6 +49,8 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.testcontainers.containers.KafkaContainer; import org.testcontainers.containers.localstack.LocalStackContainer; import org.testcontainers.junit.jupiter.Container; @@ -54,6 +59,9 @@ @Ignore @Testcontainers final class IntegrationTest implements IntegrationBase { + + private static final Logger LOGGER = LoggerFactory.getLogger(IntegrationTest.class); + private static final String S3_FILE_NAME = "testtopic-0-0001.txt"; private static final String CONNECTOR_NAME = "aiven-s3-source-connector"; private static final String COMMON_PREFIX = "s3-source-connector-for-apache-kafka-test-"; private static final int OFFSET_FLUSH_INTERVAL_MS = 500; @@ -145,6 +153,17 @@ void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx assertThat(records).containsExactly(testData); } + @Test + void multiUploadTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { + final var topicName = IntegrationBase.topicName(testInfo); + final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + + connectRunner.createConnector(connectorConfig); + multipartUpload(TEST_BUCKET_NAME, "testkey"); + // Poll 
messages from the Kafka topic and verify the consumed data + final List<String> records = IntegrationBase.consumeMessages(topicName, 1, KAFKA_CONTAINER); + } + private Map<String, String> getConfig(final Map<String, String> config, final String topicName) { return getConfig(config, List.of(topicName)); } @@ -185,4 +204,18 @@ public static void saveToS3(final String bucketName, final String folderName, fi List<String> objects = testBucketAccessor.listObjects(); assertThat(objects.size()).isEqualTo(1); } + + public void multipartUpload(String bucketName, String key) { + try (S3OutputStream s3OutputStream = new S3OutputStream(bucketName, key, S3OutputStream.DEFAULT_PART_SIZE, s3Client)) { + InputStream resourceStream = Thread.currentThread() + .getContextClassLoader() + .getResourceAsStream(S3_FILE_NAME); + assert resourceStream != null; + byte [] fileBytes = IOUtils.toByteArray(resourceStream); + s3OutputStream.write(fileBytes); + } catch (IOException e) { + LOGGER.error(e.getMessage()); + } + } + } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java deleted file mode 100644 index 7082305ab..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/DelimitedRecordReader.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.s3.source; - -import static java.util.Optional.ofNullable; - -import java.io.BufferedInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.Map; -import java.util.Optional; - -import org.apache.kafka.clients.consumer.ConsumerRecord; - -/** - * Reads records that are followed by byte delimiters. - */ -public class DelimitedRecordReader { - private final byte[] valueDelimiter; - - private final Optional<byte[]> keyDelimiter; - - public DelimitedRecordReader(final byte[] valueDelimiter, final Optional<byte[]> keyDelimiter) { - this.valueDelimiter = Arrays.copyOf(valueDelimiter, valueDelimiter.length); - this.keyDelimiter = keyDelimiter.map(delimiter -> Arrays.copyOf(delimiter, delimiter.length)); - } - - public ConsumerRecord<byte[], byte[]> read(String topic, int partition, long offset, BufferedInputStream data, - String keyData) throws IOException { - - Optional<byte[]> key = Optional.empty(); - if (keyData != null) { - key = Optional.of(keyData.getBytes()); - } - // Optional<byte[]> key = Optional.empty(); - // if (keyDelimiter.isPresent()) { - // key = Optional.ofNullable(readTo(data, keyDelimiter.get())); - // if (!key.isPresent()) { - // return null; - // } - // } - byte[] value = readTo(data, valueDelimiter); - if (value == null) { - if (key.isPresent()) { - throw new IllegalStateException("missing value for key!" 
+ key); - } - return null; - } - return new ConsumerRecord<>(topic, partition, offset, key.orElse(null), value); - } - - // read up to and including the given multi-byte delimeter - private byte[] readTo(BufferedInputStream data, byte[] del) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - int lastByte = del[del.length - 1] & 0xff; - int b; - while ((b = data.read()) != -1) { - baos.write(b); - if (b == lastByte && baos.size() >= del.length) { - byte[] bytes = baos.toByteArray(); - if (endsWith(bytes, del)) { - byte[] undelimited = new byte[bytes.length - del.length]; - System.arraycopy(bytes, 0, undelimited, 0, undelimited.length); - return undelimited; - } - } - } - // if we got here, we got EOF before we got the delimiter - return (baos.size() == 0) ? null : baos.toByteArray(); - } - - private boolean endsWith(byte[] bytes, byte[] suffix) { - for (int i = 0; i < suffix.length; i++) { - if (bytes[bytes.length - suffix.length + i] != suffix[i]) { - return false; - } - } - return true; - } - - private static byte[] delimiterBytes(final String value, final String encoding) { - return ofNullable(value).orElse("\n") - .getBytes(ofNullable(encoding).map(Charset::forName).orElse(StandardCharsets.UTF_8)); - } - - public static DelimitedRecordReader from(final Map<String, String> taskConfig) { - return new DelimitedRecordReader( - delimiterBytes(taskConfig.get("value.converter.delimiter"), taskConfig.get("value.converter.encoding")), - taskConfig.containsKey("key.converter") - ? Optional.of(delimiterBytes(taskConfig.get("key.converter.delimiter"), - taskConfig.get("key.converter.encoding"))) - : Optional.empty()); - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java index 09e18a955..5d94b4040 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java @@ -18,7 +18,6 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; -import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; @@ -31,6 +30,7 @@ import java.util.Optional; import java.util.regex.Pattern; +import com.amazonaws.util.IOUtils; import org.apache.kafka.clients.consumer.ConsumerRecord; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; @@ -41,6 +41,9 @@ import com.amazonaws.services.s3.model.ObjectListing; import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectSummary; +//import software.amazon.awssdk.core.sync.ResponseTransformer; +//import software.amazon.awssdk.services.s3.model.GetObjectRequest; +//import software.amazon.awssdk.services.s3.model.GetObjectResponse; public final class S3SourceRecordIterator implements Iterator<S3SourceRecord> { @@ -52,8 +55,6 @@ public final class S3SourceRecordIterator implements Iterator<S3SourceRecord> { private Iterator<S3ObjectSummary> nextFileIterator; private Iterator<ConsumerRecord<byte[], byte[]>> recordIterator = Collections.emptyIterator(); - private final DelimitedRecordReader makeReader; - private final Map<S3Partition, S3Offset> offsets; private final S3SourceConfig s3SourceConfig; @@ -62,10 +63,9 @@ public final class S3SourceRecordIterator implements Iterator<S3SourceRecord> { private final AmazonS3 
s3Client; public S3SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, - final String s3Prefix, final Map<S3Partition, S3Offset> offsets, final DelimitedRecordReader recordReader) { + final String s3Prefix, final Map<S3Partition, S3Offset> offsets) { this.s3SourceConfig = s3SourceConfig; this.offsets = Optional.ofNullable(offsets).orElseGet(HashMap::new); - this.makeReader = recordReader; this.s3Client = s3Client; this.bucketName = bucketName; this.s3Prefix = s3Prefix; @@ -103,8 +103,7 @@ private void nextObject() { private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() throws IOException { final S3Object s3Object = s3Client.getObject(bucketName, currentKey); - try (InputStream content = getContent(s3Object); - BufferedInputStream bufferedContent = new BufferedInputStream(content)) { + try (InputStream content = getContent(s3Object)) { // Extract the topic, partition, and startOffset from the key // Matcher matcher = DEFAULT_PATTERN.matcher(currentKey); @@ -120,7 +119,19 @@ private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() private ConsumerRecord<byte[], byte[]> readNext() { try { - return makeReader.read(topic, partition, startOffset, bufferedContent, currentKey); + Optional<byte[]> key = Optional.empty(); + if (currentKey != null) { + key = Optional.of(currentKey.getBytes()); + } + byte[] value = IOUtils.toByteArray(content); + + if (value == null) { + if (key.isPresent()) { + throw new IllegalStateException("missing value for key!" + key); + } + return null; + } + return new ConsumerRecord<>(topic, partition, startOffset, key.orElse(null), value); } catch (IOException e) { throw new RuntimeException("Failed to read record from file", e); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index b3418cd62..06a54fe37 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -141,18 +141,7 @@ private void prepareReaderFromOffsetStorageReader() { .collect( toMap(entry -> S3Partition.from(entry.getKey()), entry -> S3Offset.from(entry.getValue()))); } - - final byte[] valueDelimiter = Optional.ofNullable(s3SourceConfig.getString("value.delimiter")) - .map(Object::toString) - .orElse("\n") - .getBytes(parseEncoding(s3SourceConfig, "value.encoding")); - - final Optional<byte[]> keyDelimiter = Optional.ofNullable(s3SourceConfig.getString("key.delimiter")) - .map(Object::toString) - .map(s -> s.getBytes(parseEncoding(s3SourceConfig, "key.encoding"))); - - sourceRecordIterator = new S3SourceRecordIterator(s3SourceConfig, s3Client, s3Bucket, s3Prefix, offsets, - new DelimitedRecordReader(valueDelimiter, keyDelimiter)); + sourceRecordIterator = new S3SourceRecordIterator(s3SourceConfig, s3Client, s3Bucket, s3Prefix, offsets); } private Set<Integer> getPartitions() { diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java new file mode 100644 index 000000000..8f138f293 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java @@ -0,0 +1,181 @@ +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.testutils; + +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.AbortMultipartUploadRequest; +import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest; +import com.amazonaws.services.s3.model.InitiateMultipartUploadRequest; +import com.amazonaws.services.s3.model.ObjectMetadata; +import com.amazonaws.services.s3.model.PartETag; +import com.amazonaws.services.s3.model.UploadPartRequest; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +public class S3OutputStream extends OutputStream { + + private final Logger logger = LoggerFactory.getLogger(S3OutputStream.class); + + public static final int DEFAULT_PART_SIZE = 5 * 1024 * 1024; // 1 MB + + private final AmazonS3 client; + + private final ByteBuffer byteBuffer; + + private final String bucketName; + + private final String key; + + private MultipartUpload multipartUpload; + + private final int partSize; + + private final String serverSideEncryptionAlgorithm; + + private boolean closed; + + @SuppressFBWarnings(value = "EI_EXPOSE_REP2", justification = "AmazonS3 client is mutable") + public S3OutputStream(final String bucketName, final String key, final int partSize, final AmazonS3 client) { + this(bucketName, key, partSize, client, null); + } + + @SuppressFBWarnings(value = "EI_EXPOSE_REP2", justification = "AmazonS3 client is mutable") + public S3OutputStream(final String bucketName, final String key, final int partSize, final AmazonS3 client, + final String serverSideEncryptionAlgorithm) { + super(); + this.bucketName = bucketName; + this.key = key; + this.client = client; + this.partSize = partSize; + this.byteBuffer = ByteBuffer.allocate(partSize); + this.serverSideEncryptionAlgorithm = serverSideEncryptionAlgorithm; + } + + @Override + public void write(final int singleByte) throws IOException { + write(new byte[] { (byte) singleByte }, 0, 1); + } + + @Override + public void write(final byte[] bytes, final int off, final int len) throws IOException { + if (Objects.isNull(bytes) || bytes.length == 0) { + return; + } + if (Objects.isNull(multipartUpload)) { + multipartUpload = newMultipartUpload(); + } + final var source = ByteBuffer.wrap(bytes, off, len); + while (source.hasRemaining()) { + final var transferred = Math.min(byteBuffer.remaining(), source.remaining()); + final var offset = source.arrayOffset() + source.position(); + byteBuffer.put(source.array(), offset, transferred); + source.position(source.position() + transferred); + if (!byteBuffer.hasRemaining()) { + flushBuffer(0, partSize, partSize); + } + } + } + + private MultipartUpload newMultipartUpload() throws IOException { + logger.debug("Create new multipart upload 
request"); + final var initialRequest = new InitiateMultipartUploadRequest(bucketName, key); + initialRequest.setObjectMetadata(this.buildObjectMetadata()); + final var initiateResult = client.initiateMultipartUpload(initialRequest); + logger.debug("Upload ID: {}", initiateResult.getUploadId()); + return new MultipartUpload(initiateResult.getUploadId()); + } + + private ObjectMetadata buildObjectMetadata() { + final ObjectMetadata metadata = new ObjectMetadata(); + + if (this.serverSideEncryptionAlgorithm != null) { + metadata.setSSEAlgorithm(this.serverSideEncryptionAlgorithm); + } + + return metadata; + } + + @Override + public void close() throws IOException { + if (closed) { + return; + } + if (byteBuffer.position() > 0 && Objects.nonNull(multipartUpload)) { + flushBuffer(byteBuffer.arrayOffset(), byteBuffer.position(), byteBuffer.position()); + } + if (Objects.nonNull(multipartUpload)) { + multipartUpload.complete(); + multipartUpload = null; // NOPMD NullAssignment + } + closed = true; + super.close(); + } + + private void flushBuffer(final int offset, final int length, final int partSize) throws IOException { + try { + multipartUpload.uploadPart(new ByteArrayInputStream(byteBuffer.array(), offset, length), partSize); + byteBuffer.clear(); + } catch (final Exception e) { // NOPMD AvoidCatchingGenericException + multipartUpload.abort(); + multipartUpload = null; // NOPMD NullAssignment + throw new IOException(e); + } + } + + private class MultipartUpload { + + private final String uploadId; + + private final List<PartETag> partETags = new ArrayList<>(); + + public MultipartUpload(final String uploadId) { + this.uploadId = uploadId; + } + + public void uploadPart(final InputStream inputStream, final int partSize) throws IOException { + final var partNumber = partETags.size() + 1; + final var uploadPartRequest = new UploadPartRequest().withBucketName(bucketName) + .withKey(key) + .withUploadId(uploadId) + .withPartSize(partSize) + .withPartNumber(partNumber) + .withInputStream(inputStream); + final var uploadResult = client.uploadPart(uploadPartRequest); + partETags.add(uploadResult.getPartETag()); + } + + public void complete() { + client.completeMultipartUpload(new CompleteMultipartUploadRequest(bucketName, key, uploadId, partETags)); + } + + public void abort() { + client.abortMultipartUpload(new AbortMultipartUploadRequest(bucketName, key, uploadId)); + } + + } + +} From fd65b38dca8dc0fef427c22daa230de1b86e5d0f Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 23 Sep 2024 11:34:08 +0200 Subject: [PATCH 13/90] update offset topic, pattern for topic --- .../connect/s3/source/IntegrationBase.java | 3 +- .../connect/s3/source/IntegrationTest.java | 63 +++++++++---------- .../s3/source/S3SourceRecordIterator.java | 37 +++++------ .../kafka/connect/s3/source/S3SourceTask.java | 24 ++----- .../s3/source/config/S3SourceConfig.java | 24 ++----- .../s3/source/testutils/S3OutputStream.java | 20 +++--- 6 files changed, 71 insertions(+), 100 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index a1cdb5ce2..95d4c6415 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -93,8 +93,7 @@ static KafkaContainer 
createKafkaContainer() { } static String topicName(final TestInfo testInfo) { - return "testtopic"; - // return testInfo.getTestMethod().get().getName();// + "-" + testInfo.getDisplayName().hashCode(); + return testInfo.getTestMethod().get().getName();// + "-" + testInfo.getDisplayName().hashCode(); } static void createTopics(final AdminClient adminClient, final List<String> topicNames) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index d9b189bdd..cbc3e7e32 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -17,6 +17,7 @@ package io.aiven.kafka.connect.s3.source; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OFFSET_STORAGE_TOPIC; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.START_MARKER_KEY; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TOPIC_PARTITIONS_KEY; import static org.assertj.core.api.Assertions.assertThat; @@ -35,14 +36,14 @@ import java.util.Map; import java.util.concurrent.ExecutionException; -import io.aiven.kafka.connect.s3.source.testutils.S3OutputStream; -import org.apache.commons.io.IOUtils; import org.apache.kafka.clients.admin.AdminClient; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; +import io.aiven.kafka.connect.s3.source.testutils.S3OutputStream; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.PutObjectRequest; +import org.apache.commons.io.IOUtils; import org.junit.Ignore; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -127,36 +128,42 @@ void tearDown() { @Test void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME)); connectRunner.createConnector(connectorConfig); - // Create a new object on the bucket - // final String testObjectKey = s3Prefix + "test-file.txt"; - final String testData = "Hello, Kafka Connect S3 Source!"; - - String fileName = topicName + "-0-0001.txt"; + final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; + final String testData2 = "Hello, Kafka Connect S3 Source! 
object 2"; - final Path testFilePath = Paths.get("/tmp/" + fileName); - // final Path testFilePath = Paths.get("/tmp/test-file.txt"); - Files.write(testFilePath, testData.getBytes(StandardCharsets.UTF_8)); + // write 2 objects to s3 + writeToS3(topicName, testData1, 1); + writeToS3(topicName, testData2, 2); - saveToS3(TEST_BUCKET_NAME, "", fileName, testFilePath.toFile()); + List<String> objects = testBucketAccessor.listObjects(); + assertThat(objects.size()).isEqualTo(2); // Verify that the connector is correctly set up assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); // Poll messages from the Kafka topic and verify the consumed data - final List<String> records = IntegrationBase.consumeMessages(topicName, 1, KAFKA_CONTAINER); + final List<String> records = IntegrationBase.consumeMessages(topicName, 2, KAFKA_CONTAINER); // Verify that the correct data is read from the S3 bucket and pushed to Kafka - assertThat(records).containsExactly(testData); + assertThat(records).contains(testData1).contains(testData2); + } + + private static void writeToS3(String topicName, String testData1, int id) throws IOException { + String fileName = topicName + "-00000-00000000012" + id + ".txt"; + final Path testFilePath = Paths.get("/tmp/" + fileName); + Files.write(testFilePath, testData1.getBytes(StandardCharsets.UTF_8)); + + saveToS3(TEST_BUCKET_NAME, "", fileName, testFilePath.toFile()); } @Test void multiUploadTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME)); connectRunner.createConnector(connectorConfig); multipartUpload(TEST_BUCKET_NAME, "testkey"); @@ -164,27 +171,16 @@ void multiUploadTest(final TestInfo testInfo) throws ExecutionException, Interru final List<String> records = IntegrationBase.consumeMessages(topicName, 1, KAFKA_CONTAINER); } - private Map<String, String> getConfig(final Map<String, String> config, final String topicName) { - return getConfig(config, List.of(topicName)); - } - - private Map<String, String> getConfig(final Map<String, String> config, final List<String> topicNames) { + private Map<String, String> getConfig(final Map<String, String> config) { config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); - config.put("topics", String.join(",", topicNames)); config.put("aws.access.key.id", S3_ACCESS_KEY_ID); config.put("aws.secret.access.key", S3_SECRET_ACCESS_KEY); config.put("aws.s3.endpoint", s3Endpoint); config.put(AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET_NAME); config.put("aws.s3.prefix", s3Prefix); config.put(START_MARKER_KEY, COMMON_PREFIX); - config.put(TOPIC_PARTITIONS_KEY, "1,2"); - - config.put("key.delimiter", "\\t"); - config.put("key.encoding", "UTF-8"); - config.put("value.delimiter", "\\n"); - config.put("value.encoding", "UTF-8"); - + config.put(OFFSET_STORAGE_TOPIC, "connect-offsets"); return config; } @@ -201,17 +197,16 @@ public static void saveToS3(final String bucketName, final String folderName, fi final File fileToWrite) { final PutObjectRequest request = new PutObjectRequest(bucketName, folderName + fileNameInS3, fileToWrite); s3Client.putObject(request); - List<String> objects = testBucketAccessor.listObjects(); - assertThat(objects.size()).isEqualTo(1); } public void multipartUpload(String bucketName, String 
key) { - try (S3OutputStream s3OutputStream = new S3OutputStream(bucketName, key, S3OutputStream.DEFAULT_PART_SIZE, s3Client)) { + try (S3OutputStream s3OutputStream = new S3OutputStream(bucketName, key, S3OutputStream.DEFAULT_PART_SIZE, + s3Client)) { InputStream resourceStream = Thread.currentThread() - .getContextClassLoader() - .getResourceAsStream(S3_FILE_NAME); + .getContextClassLoader() + .getResourceAsStream(S3_FILE_NAME); assert resourceStream != null; - byte [] fileBytes = IOUtils.toByteArray(resourceStream); + byte[] fileBytes = IOUtils.toByteArray(resourceStream); s3OutputStream.write(fileBytes); } catch (IOException e) { LOGGER.error(e.getMessage()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java index 5d94b4040..573313d09 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java @@ -28,9 +28,9 @@ import java.util.Map; import java.util.NoSuchElementException; import java.util.Optional; +import java.util.regex.Matcher; import java.util.regex.Pattern; -import com.amazonaws.util.IOUtils; import org.apache.kafka.clients.consumer.ConsumerRecord; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; @@ -41,16 +41,12 @@ import com.amazonaws.services.s3.model.ObjectListing; import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectSummary; -//import software.amazon.awssdk.core.sync.ResponseTransformer; -//import software.amazon.awssdk.services.s3.model.GetObjectRequest; -//import software.amazon.awssdk.services.s3.model.GetObjectResponse; +import com.amazonaws.util.IOUtils; public final class S3SourceRecordIterator implements Iterator<S3SourceRecord> { - public static final Pattern DEFAULT_PATTERN = Pattern.compile("(\\/|^)" // match the / or the start of the key so we - // shouldn't have to worry about prefix - + "(?<topic>[^/]+?)-" // assuming no / in topic names - + "(?<partition>\\d{5})-" + "(?<offset>\\d{12})\\.gz$"); + public static final Pattern DEFAULT_PATTERN = Pattern + .compile("(?<topic>[^/]+?)-" + "(?<partition>\\d{5})-" + "(?<offset>\\d{12})" + "\\.(?<extension>[^.]+)$"); private String currentKey; private Iterator<S3ObjectSummary> nextFileIterator; private Iterator<ConsumerRecord<byte[], byte[]>> recordIterator = Collections.emptyIterator(); @@ -105,15 +101,19 @@ private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() final S3Object s3Object = s3Client.getObject(bucketName, currentKey); try (InputStream content = getContent(s3Object)) { - // Extract the topic, partition, and startOffset from the key - // Matcher matcher = DEFAULT_PATTERN.matcher(currentKey); - // if (!matcher.find()) { - // throw new IllegalArgumentException("Invalid file key format: " + currentKey); - // } - final String topic = "testtopic";// matcher.group("topic"); - final int partition = 0;// Integer.parseInt(matcher.group("partition")); - final long startOffset = 0l;// Long.parseLong(matcher.group("offset")); - + final Matcher matcher = DEFAULT_PATTERN.matcher(currentKey); + String topic = null; + int partition = 0; + long startOffset = 0l; + if (matcher.find()) { + topic = matcher.group("topic"); + partition = Integer.parseInt(matcher.group("partition")); + startOffset = Long.parseLong(matcher.group("offset")); + } + + final 
String finalTopic = topic; + final int finalPartition = partition; + final long finalStartOffset = startOffset; return new Iterator<>() { private ConsumerRecord<byte[], byte[]> nextRecord = readNext(); @@ -131,7 +131,8 @@ private ConsumerRecord<byte[], byte[]> readNext() { } return null; } - return new ConsumerRecord<>(topic, partition, startOffset, key.orElse(null), value); + return new ConsumerRecord<>(finalTopic, finalPartition, finalStartOffset, key.orElse(null), + value); } catch (IOException e) { throw new RuntimeException("Failed to read record from file", e); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 06a54fe37..0408b6a3f 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -17,7 +17,7 @@ package io.aiven.kafka.connect.s3.source; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TOPICS_KEY; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OFFSET_STORAGE_TOPIC; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TOPIC_PARTITIONS_KEY; import static java.util.stream.Collectors.toList; import static java.util.stream.Collectors.toMap; @@ -110,25 +110,13 @@ private void prepareReaderFromOffsetStorageReader() { final String s3Bucket = s3SourceConfig.getString("aws.s3.bucket.name"); final Set<Integer> partitionList = getPartitions(); - final Set<String> topics = getTopics(); + final Set<String> topics = getOffsetStorageTopic(); // map to s3 partitions final List<S3Partition> s3Partitions = partitionList.stream() .flatMap(p -> topics.stream().map(t -> S3Partition.from(s3Bucket, s3Prefix, t, p))) .collect(toList()); - // List<String> topicPartitions = Arrays.asList("1","2"); - // List<Map<String, String>> offsetPartitions = topicPartitions.stream().map( - // tp -> { - // HashMap<String, String> offsetInfo = new HashMap<>(); - // offsetInfo.put("source", tp); - // offsetInfo.put("targetPrefix", "targetTopicPrefix"); - // return offsetInfo; - // } - // ).collect(Collectors.toList()); - // final Map<Map<String, String>, Map<String, Object>> offsetMap = - // context.offsetStorageReader().offsets(offsetPartitions); - // get partition offsets final List<Map<String, Object>> partitions = s3Partitions.stream().map(S3Partition::asMap).collect(toList()); final Map<Map<String, Object>, Map<String, Object>> offsetMap = context.offsetStorageReader() @@ -153,12 +141,12 @@ private Set<Integer> getPartitions() { } } - private Set<String> getTopics() { - final String topicString = s3SourceConfig.getString(TOPICS_KEY); + private Set<String> getOffsetStorageTopic() { + final String topicString = s3SourceConfig.getString(OFFSET_STORAGE_TOPIC); if (Objects.nonNull(topicString)) { return Arrays.stream(topicString.split(",")).collect(Collectors.toSet()); } else { - throw new IllegalStateException("Topics list is not configured."); + throw new IllegalStateException("Offset storage topics list is not configured."); } } @@ -212,7 +200,7 @@ private List<SourceRecord> getSourceRecords(final List<SourceRecord> results) th && !stopped.get(); i++) { final S3SourceRecord record = sourceRecordIterator.next(); LOGGER.info(record.offset() + record.getToTopic() + record.partition()); - String topic = 
"testtopic"; + String topic = record.getToTopic(); Optional<SchemaAndValue> key = keyConverter.map(c -> c.toConnectData(topic, record.key())); SchemaAndValue value = valueConverter.toConnectData(topic, record.value()); results.add(new SourceRecord(record.file().asMap(), record.offset().asMap(), topic, record.partition(), diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 37fe8406d..4be66364d 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -106,7 +106,9 @@ final public class S3SourceConfig extends AbstractConfig { public static final String AWS_S3_SSE_ALGORITHM_CONFIG = "aws.s3.sse.algorithm"; public static final String TOPIC_PARTITIONS_KEY = "offset.storage.topic.partitions"; - public static final String TOPICS_KEY = "topics"; + public static final String OFFSET_STORAGE_TOPIC = "offset.storage.topic"; + + public static final String OFFSET_STORAGE_TOPIC_DEFAULT = "connect-offsets"; public static final String START_MARKER_KEY = "aws.s3.start.marker"; public static final String FETCH_PAGE_SIZE = "aws.s3.fetch.page.size"; @@ -115,13 +117,6 @@ final public class S3SourceConfig extends AbstractConfig { public static final String KEY_CONVERTER = "key.converter"; public static final String VALUE_CONVERTER = "value.converter"; - public static final String KEY_DELIMITER = "key.delimiter"; - - public static final String KEY_ENCODING = "key.encoding"; - - public static final String VALUE_DELIMITER = "value.delimiter"; - - public static final String VALUE_ENCODING = "value.encoding"; public static final int S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT = 3; @@ -248,22 +243,15 @@ private static void addS3RetryPolicies(final ConfigDef configDef) { } private static void addFileConfiguration(final S3SourceConfigDef configDef) { - configDef.define(KEY_DELIMITER, ConfigDef.Type.STRING, "\\t", new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "eg : \n", GROUP_FILE, 0, ConfigDef.Width.NONE, KEY_DELIMITER); - configDef.define(KEY_ENCODING, ConfigDef.Type.STRING, "UTF-8", new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "eg : UTF-8", GROUP_FILE, 1, ConfigDef.Width.NONE, KEY_ENCODING); - configDef.define(VALUE_DELIMITER, ConfigDef.Type.STRING, "\\n", new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "eg : \t", GROUP_FILE, 2, ConfigDef.Width.NONE, VALUE_DELIMITER); - configDef.define(VALUE_ENCODING, ConfigDef.Type.STRING, "UTF-8", new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "eg : UTF-8", GROUP_FILE, 3, ConfigDef.Width.NONE, VALUE_ENCODING); } private static void addOffsetStorageConfig(final ConfigDef configDef) { configDef.define(TOPIC_PARTITIONS_KEY, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "eg : 1,2", GROUP_OFFSET_TOPIC, 0, ConfigDef.Width.NONE, TOPIC_PARTITIONS_KEY); - configDef.define(TOPICS_KEY, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "eg : testtopic", GROUP_OFFSET_TOPIC, 0, ConfigDef.Width.NONE, TOPICS_KEY); + configDef.define(OFFSET_STORAGE_TOPIC, ConfigDef.Type.STRING, OFFSET_STORAGE_TOPIC_DEFAULT, + new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "eg : connect-offsets", GROUP_OFFSET_TOPIC, + 0, ConfigDef.Width.NONE, 
OFFSET_STORAGE_TOPIC); } private static void addDeprecatedConfiguration(final ConfigDef configDef) { diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java index 8f138f293..5194cf398 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java @@ -16,6 +16,15 @@ package io.aiven.kafka.connect.s3.source.testutils; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.AbortMultipartUploadRequest; import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest; @@ -27,15 +36,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; - public class S3OutputStream extends OutputStream { private final Logger logger = LoggerFactory.getLogger(S3OutputStream.class); @@ -65,7 +65,7 @@ public S3OutputStream(final String bucketName, final String key, final int partS @SuppressFBWarnings(value = "EI_EXPOSE_REP2", justification = "AmazonS3 client is mutable") public S3OutputStream(final String bucketName, final String key, final int partSize, final AmazonS3 client, - final String serverSideEncryptionAlgorithm) { + final String serverSideEncryptionAlgorithm) { super(); this.bucketName = bucketName; this.key = key; From b7c6ef5ac0ae69fa583d73881ae8190ef9e60458 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 23 Sep 2024 13:27:26 +0200 Subject: [PATCH 14/90] Fix offsets, topic details from key --- .../connect/s3/source/IntegrationTest.java | 30 +++++++++++-------- .../kafka/connect/s3/source/S3Offset.java | 4 +++ .../s3/source/S3SourceRecordIterator.java | 22 +++++++++++--- .../kafka/connect/s3/source/S3SourceTask.java | 20 ++++++------- .../s3/source/config/S3SourceConfig.java | 8 ++--- .../s3/source/testutils/S3OutputStream.java | 2 +- .../src/test/resources/testtopic-0-0001.txt | 6 ++++ 7 files changed, 60 insertions(+), 32 deletions(-) create mode 100644 s3-source-connector/src/test/resources/testtopic-0-0001.txt diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index cbc3e7e32..ab498130e 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -18,8 +18,8 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OFFSET_STORAGE_TOPIC; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OFFSET_STORAGE_TOPIC_PARTITIONS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.START_MARKER_KEY; -import 
static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TOPIC_PARTITIONS_KEY; import static org.assertj.core.api.Assertions.assertThat; import java.io.File; @@ -152,23 +152,29 @@ void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx assertThat(records).contains(testData1).contains(testData2); } - private static void writeToS3(String topicName, String testData1, int id) throws IOException { - String fileName = topicName + "-00000-00000000012" + id + ".txt"; - final Path testFilePath = Paths.get("/tmp/" + fileName); - Files.write(testFilePath, testData1.getBytes(StandardCharsets.UTF_8)); - - saveToS3(TEST_BUCKET_NAME, "", fileName, testFilePath.toFile()); - } - @Test - void multiUploadTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { + void multiPartUploadTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME)); connectRunner.createConnector(connectorConfig); - multipartUpload(TEST_BUCKET_NAME, "testkey"); + String partition = "00001"; + String offset = "000000000121"; + String key = topicName + "-" + partition + "-" + offset + ".txt"; + multipartUpload(TEST_BUCKET_NAME, key); // Poll messages from the Kafka topic and verify the consumed data final List<String> records = IntegrationBase.consumeMessages(topicName, 1, KAFKA_CONTAINER); + assertThat(records.get(0)).contains("performanceeeqjz"); + } + + private static void writeToS3(String topicName, String testData1, int id) throws IOException { + String partition = "00000"; + String offset = "00000000012" + id; + String fileName = topicName + "-" + partition + "-" + offset + ".txt"; + final Path testFilePath = Paths.get("/tmp/" + fileName); + Files.write(testFilePath, testData1.getBytes(StandardCharsets.UTF_8)); + + saveToS3(TEST_BUCKET_NAME, "", fileName, testFilePath.toFile()); } private Map<String, String> getConfig(final Map<String, String> config) { @@ -179,7 +185,7 @@ private Map<String, String> getConfig(final Map<String, String> config) { config.put(AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET_NAME); config.put("aws.s3.prefix", s3Prefix); config.put(START_MARKER_KEY, COMMON_PREFIX); - config.put(TOPIC_PARTITIONS_KEY, "1,2"); + config.put(OFFSET_STORAGE_TOPIC_PARTITIONS, "1,2"); config.put(OFFSET_STORAGE_TOPIC, "connect-offsets"); return config; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java index fdb0c41bf..3b44eb805 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java @@ -57,6 +57,10 @@ public int compareTo(final S3Offset s3Offset) { return map; } + public long getOffset() { + return offset; + } + // Overriding equals to ensure consistency with compareTo @Override public boolean equals(final Object obj) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java index 573313d09..ff5f6b998 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java +++ 
b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java @@ -115,6 +115,8 @@ private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() final int finalPartition = partition; final long finalStartOffset = startOffset; return new Iterator<>() { + private Map<S3Partition, Long> currentOffsets = new HashMap<>(); // Track offsets for each + // topic-partition private ConsumerRecord<byte[], byte[]> nextRecord = readNext(); private ConsumerRecord<byte[], byte[]> readNext() { @@ -131,8 +133,20 @@ private ConsumerRecord<byte[], byte[]> readNext() { } return null; } - return new ConsumerRecord<>(finalTopic, finalPartition, finalStartOffset, key.orElse(null), - value); + S3Partition s3Partition = S3Partition.from(bucketName, s3Prefix, finalTopic, finalPartition); + + long currentOffset; + if (offsets.containsKey(s3Partition)) { + S3Offset s3Offset = offsets.get(s3Partition); + currentOffset = s3Offset.getOffset() + 1; + } else { + currentOffset = currentOffsets.getOrDefault(s3Partition, finalStartOffset); + } + ConsumerRecord<byte[], byte[]> record = new ConsumerRecord<>(finalTopic, finalPartition, + currentOffset, key.orElse(null), value); + currentOffsets.put(s3Partition, currentOffset + 1); + + return record; } catch (IOException e) { throw new RuntimeException("Failed to read record from file", e); } @@ -161,8 +175,8 @@ private InputStream getContent(final S3Object object) throws IOException { return object.getObjectContent(); } - private S3Offset offset() { - return offsets.get(S3Partition.from(bucketName, s3Prefix, "testtopic", 0)); + private S3Offset offset(String topic, int partition) { + return offsets.get(S3Partition.from(bucketName, s3Prefix, topic, partition)); } @Override diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 0408b6a3f..3c9ee3bda 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -18,7 +18,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OFFSET_STORAGE_TOPIC; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TOPIC_PARTITIONS_KEY; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OFFSET_STORAGE_TOPIC_PARTITIONS; import static java.util.stream.Collectors.toList; import static java.util.stream.Collectors.toMap; @@ -95,9 +95,7 @@ public void start(final Map<String, String> props) { valueConverter = (Converter) s3SourceConfig.getClass("value.converter").newInstance(); } catch (InstantiationException | IllegalAccessException e) { throw new RuntimeException(e); - } ; - // keyConverter = Optional.ofNullable(Configure.buildConverter(taskConfig, "key.converter", true, null)); - // valueConverter = Configure.buildConverter(taskConfig, "value.converter", false, AlreadyBytesConverter.class); + } this.s3Client = s3ClientFactory.createAmazonS3Client(s3SourceConfig); @@ -109,12 +107,12 @@ private void prepareReaderFromOffsetStorageReader() { final String s3Prefix = s3SourceConfig.getString("aws.s3.prefix"); final String s3Bucket = s3SourceConfig.getString("aws.s3.bucket.name"); - final Set<Integer> partitionList = getPartitions(); - final Set<String> topics = getOffsetStorageTopic(); + final Set<Integer> 
offsetStorageTopicPartitions = getOffsetStorageTopicPartitions(); + final Set<String> offsetStorageTopic = getOffsetStorageTopic(); // map to s3 partitions - final List<S3Partition> s3Partitions = partitionList.stream() - .flatMap(p -> topics.stream().map(t -> S3Partition.from(s3Bucket, s3Prefix, t, p))) + final List<S3Partition> s3Partitions = offsetStorageTopicPartitions.stream() + .flatMap(p -> offsetStorageTopic.stream().map(t -> S3Partition.from(s3Bucket, s3Prefix, t, p))) .collect(toList()); // get partition offsets @@ -132,12 +130,12 @@ private void prepareReaderFromOffsetStorageReader() { sourceRecordIterator = new S3SourceRecordIterator(s3SourceConfig, s3Client, s3Bucket, s3Prefix, offsets); } - private Set<Integer> getPartitions() { - final String partitionString = s3SourceConfig.getString(TOPIC_PARTITIONS_KEY); + private Set<Integer> getOffsetStorageTopicPartitions() { + final String partitionString = s3SourceConfig.getString(OFFSET_STORAGE_TOPIC_PARTITIONS); if (Objects.nonNull(partitionString)) { return Arrays.stream(partitionString.split(",")).map(Integer::parseInt).collect(Collectors.toSet()); } else { - throw new IllegalStateException("Partition list is not configured."); + throw new IllegalStateException("Offset storage topics partition list is not configured."); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 4be66364d..b40446373 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -105,7 +105,7 @@ final public class S3SourceConfig extends AbstractConfig { public static final String AWS_S3_SSE_ALGORITHM_CONFIG = "aws.s3.sse.algorithm"; - public static final String TOPIC_PARTITIONS_KEY = "offset.storage.topic.partitions"; + public static final String OFFSET_STORAGE_TOPIC_PARTITIONS = "offset.storage.topic.partitions"; public static final String OFFSET_STORAGE_TOPIC = "offset.storage.topic"; public static final String OFFSET_STORAGE_TOPIC_DEFAULT = "connect-offsets"; @@ -246,9 +246,9 @@ private static void addFileConfiguration(final S3SourceConfigDef configDef) { } private static void addOffsetStorageConfig(final ConfigDef configDef) { - configDef.define(TOPIC_PARTITIONS_KEY, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "eg : 1,2", GROUP_OFFSET_TOPIC, 0, ConfigDef.Width.NONE, - TOPIC_PARTITIONS_KEY); + configDef.define(OFFSET_STORAGE_TOPIC_PARTITIONS, ConfigDef.Type.STRING, "0", new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "eg : 0,1", GROUP_OFFSET_TOPIC, 0, ConfigDef.Width.NONE, + OFFSET_STORAGE_TOPIC_PARTITIONS); configDef.define(OFFSET_STORAGE_TOPIC, ConfigDef.Type.STRING, OFFSET_STORAGE_TOPIC_DEFAULT, new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "eg : connect-offsets", GROUP_OFFSET_TOPIC, 0, ConfigDef.Width.NONE, OFFSET_STORAGE_TOPIC); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java index 5194cf398..4d33e46c5 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java 
@@ -40,7 +40,7 @@ public class S3OutputStream extends OutputStream { private final Logger logger = LoggerFactory.getLogger(S3OutputStream.class); - public static final int DEFAULT_PART_SIZE = 5 * 1024 * 1024; // 1 MB + public static final int DEFAULT_PART_SIZE = 5 * 1024 * 1024; // 5 MB private final AmazonS3 client; diff --git a/s3-source-connector/src/test/resources/testtopic-0-0001.txt b/s3-source-connector/src/test/resources/testtopic-0-0001.txt new file mode 100644 index 000000000..c857053a9 --- /dev/null +++ b/s3-source-connector/src/test/resources/testtopic-0-0001.txt @@ -0,0 +1,6 @@ +performanceeeqjz fileajbzt reliabilityrtbxg Amazonyomxx S3jsukk S3jicqd multipartudsma Amazonlboqk contentepuod Amazonijbif filepbqji performancepsszv dataouuzw S3rdwof Amazonsyzgo filehhija reliabilitykccrg performanceefrfz Amazonzptcv multipartvpkxv. +contentgsrgr multipartqjaov filezqtro fileaunmb filerfjrx S3xkjoj filevlhez filernzty Amazondkzpv Amazonxdspv filebmwri Amazonxzqxz multipartggoaf fileqrpzo contentoporo filehsvga filehysoz Amazongqhtq multipartcqmwp S3tjaxu. +uploadupvpo performancewuoyl multipartbjjuk uploadpuecx dataqsdrc reliabilityusrbn S3wxsqo uploadmjczp Amazonulvpp datawahgl uploadghuib contentxvwoh contentvgtbd contentsttlw performancemnkib S3jdffr datasxzfy filevktta contentuewkr dataakciu. +S3kiqqs S3xmlbh reliabilitynrjhd Amazongbico S3honxh performancekwcyf performancehemxu contentzfktk filemuxvv uploadzcgqj reliabilitysdkwz filemzbbt performancezmfkb datazknlk Amazonkssri performancexklrb S3pfajq filekhldu reliabilitylixgd contenthqooz. +Amazonlljev datalbwgf fileimhqf multipartejavv Amazonsqfyd contentlfytq datapsrpi contentzotzk contentpfauu reliabilitysgqdc dataeiwnu filekyhlx contentcoomf performancetsxwq datacgjjl Amazoncrptx filekpsqv dataujipy performanceqjzow uploaddzryh. +Amazontpxgu reliabilitycapks fileiqyhi reliabilityxlxvs filepejwa contenttgbtb contentknony fileacpga datadqnqt S3erclt performancennoll reliabilityadyxe contentxutca contentjcoec multipartnjaef contentkcowq performancedzidj Amazonrwoaj dataogmoh performancewmtpn. 
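For readers following the patches above: both the integration tests and the record iterator assume S3 object keys of the form `<topic>-<five-digit partition>-<twelve-digit offset>.<extension>`, for example `testtopic-00001-000000000121.txt`. The standalone sketch below is not part of the patch series; it only illustrates how such a key can be parsed with the same regular expression the iterator defines (the class and method names here are invented for the example).

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Minimal sketch: parse an S3 object key using the key layout assumed by the connector patches.
public class S3KeyPatternSketch {
    // Same pattern as DEFAULT_PATTERN in the iterator: topic, 5-digit partition, 12-digit offset.
    private static final Pattern KEY_PATTERN = Pattern.compile(
            "(?<topic>[^/]+?)-(?<partition>\\d{5})-(?<offset>\\d{12})\\.(?<extension>[^.]+)$");

    public static void main(final String[] args) {
        final String key = "testtopic-00001-000000000121.txt"; // shape used by multiPartUploadTest
        final Matcher matcher = KEY_PATTERN.matcher(key);
        if (matcher.find()) {
            System.out.println("topic=" + matcher.group("topic")
                    + " partition=" + Integer.parseInt(matcher.group("partition"))
                    + " startOffset=" + Long.parseLong(matcher.group("offset")));
        }
    }
}
```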
From 179f26f9fc4f3e8de66204a0fc7802bb9e1e6dc7 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 23 Sep 2024 14:36:25 +0200 Subject: [PATCH 15/90] Fix spotbugs --- gradle-config/spotbugs-exclude.xml | 8 ++ .../connect/s3/source/IntegrationBase.java | 4 +- .../connect/s3/source/IntegrationTest.java | 33 +++-- .../s3/source/S3SourceRecordIterator.java | 116 +++++++++--------- .../kafka/connect/s3/source/S3SourceTask.java | 19 +-- .../s3/source/config/S3SourceConfig.java | 4 - 6 files changed, 92 insertions(+), 92 deletions(-) diff --git a/gradle-config/spotbugs-exclude.xml b/gradle-config/spotbugs-exclude.xml index 3f007d6b8..8e6fe7126 100644 --- a/gradle-config/spotbugs-exclude.xml +++ b/gradle-config/spotbugs-exclude.xml @@ -19,6 +19,14 @@ <Class name="io.aiven.kafka.connect.common.output.parquet.ParquetOutputWriterTest$ParquetInputFile" /> <Bug pattern="CT_CONSTRUCTOR_THROW" /> </Match> + <Match> + <Class name="io.aiven.kafka.connect.s3.source.S3SourceRecordIterator$1" /> + <Bug pattern="CT_CONSTRUCTOR_THROW" /> + </Match> + <Match> + <Class name="io.aiven.kafka.connect.s3.source.S3SourceRecordIterator" /> + <Bug pattern="EI_EXPOSE_REP2" /> + </Match> diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index 95d4c6415..b01adbeed 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -138,9 +138,7 @@ static List<String> consumeMessages(final String topic, final int expectedMessag while (messages.size() < expectedMessageCount) { final ConsumerRecords<byte[], byte[]> records = consumer.poll(5L); for (final ConsumerRecord<byte[], byte[]> record : records) { - messages.add(new String(record.value(), StandardCharsets.UTF_8)); // Convert message from bytes to - // string for easy - // verification + messages.add(new String(record.value(), StandardCharsets.UTF_8)); } } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index ab498130e..ee09fe3ae 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -88,8 +88,6 @@ final class IntegrationTest implements IntegrationBase { private static AmazonS3 s3Client; - private String topicName; - @BeforeAll static void setUpAll() throws IOException, InterruptedException { s3Prefix = COMMON_PREFIX + ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + "/"; @@ -108,7 +106,7 @@ void setUp(final TestInfo testInfo) throws ExecutionException, InterruptedExcept testBucketAccessor.createBucket(); adminClient = newAdminClient(KAFKA_CONTAINER); - topicName = IntegrationBase.topicName(testInfo); + final String topicName = IntegrationBase.topicName(testInfo); final var topics = List.of(topicName); IntegrationBase.createTopics(adminClient, topics); @@ -139,7 +137,7 @@ void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx writeToS3(topicName, testData1, 1); writeToS3(topicName, testData2, 2); - List<String> objects = 
testBucketAccessor.listObjects(); + final List<String> objects = testBucketAccessor.listObjects(); assertThat(objects.size()).isEqualTo(2); // Verify that the connector is correctly set up @@ -158,19 +156,20 @@ void multiPartUploadTest(final TestInfo testInfo) throws ExecutionException, Int final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME)); connectRunner.createConnector(connectorConfig); - String partition = "00001"; - String offset = "000000000121"; - String key = topicName + "-" + partition + "-" + offset + ".txt"; + final String partition = "00001"; + final String offset = "000000000121"; + final String key = topicName + "-" + partition + "-" + offset + ".txt"; multipartUpload(TEST_BUCKET_NAME, key); // Poll messages from the Kafka topic and verify the consumed data final List<String> records = IntegrationBase.consumeMessages(topicName, 1, KAFKA_CONTAINER); assertThat(records.get(0)).contains("performanceeeqjz"); } - private static void writeToS3(String topicName, String testData1, int id) throws IOException { - String partition = "00000"; - String offset = "00000000012" + id; - String fileName = topicName + "-" + partition + "-" + offset + ".txt"; + private static void writeToS3(final String topicName, final String testData1, final int offsetId) + throws IOException { + final String partition = "00000"; + final String offset = "00000000012" + offsetId; + final String fileName = topicName + "-" + partition + "-" + offset + ".txt"; final Path testFilePath = Paths.get("/tmp/" + fileName); Files.write(testFilePath, testData1.getBytes(StandardCharsets.UTF_8)); @@ -205,14 +204,14 @@ public static void saveToS3(final String bucketName, final String folderName, fi s3Client.putObject(request); } - public void multipartUpload(String bucketName, String key) { + public void multipartUpload(final String bucketName, final String key) { try (S3OutputStream s3OutputStream = new S3OutputStream(bucketName, key, S3OutputStream.DEFAULT_PART_SIZE, - s3Client)) { - InputStream resourceStream = Thread.currentThread() - .getContextClassLoader() - .getResourceAsStream(S3_FILE_NAME); + s3Client); + InputStream resourceStream = Thread.currentThread() + .getContextClassLoader() + .getResourceAsStream(S3_FILE_NAME);) { assert resourceStream != null; - byte[] fileBytes = IOUtils.toByteArray(resourceStream); + final byte[] fileBytes = IOUtils.toByteArray(resourceStream); s3OutputStream.write(fileBytes); } catch (IOException e) { LOGGER.error(e.getMessage()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java index ff5f6b998..1de32ac9a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -49,7 +50,7 @@ public final class S3SourceRecordIterator implements Iterator<S3SourceRecord> { .compile("(?<topic>[^/]+?)-" + "(?<partition>\\d{5})-" + "(?<offset>\\d{12})" + "\\.(?<extension>[^.]+)$"); private String currentKey; private Iterator<S3ObjectSummary> nextFileIterator; - private Iterator<ConsumerRecord<byte[], byte[]>> recordIterator = Collections.emptyIterator(); + private 
Iterator<Optional<ConsumerRecord<byte[], byte[]>>> recordIterator = Collections.emptyIterator(); private final Map<S3Partition, S3Offset> offsets; @@ -97,10 +98,9 @@ private void nextObject() { } } - private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() throws IOException { + private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentFile() throws IOException { final S3Object s3Object = s3Client.getObject(bucketName, currentKey); try (InputStream content = getContent(s3Object)) { - final Matcher matcher = DEFAULT_PATTERN.matcher(currentKey); String topic = null; int partition = 0; @@ -114,71 +114,77 @@ private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() final String finalTopic = topic; final int finalPartition = partition; final long finalStartOffset = startOffset; - return new Iterator<>() { - private Map<S3Partition, Long> currentOffsets = new HashMap<>(); // Track offsets for each - // topic-partition - private ConsumerRecord<byte[], byte[]> nextRecord = readNext(); - - private ConsumerRecord<byte[], byte[]> readNext() { - try { - Optional<byte[]> key = Optional.empty(); - if (currentKey != null) { - key = Optional.of(currentKey.getBytes()); - } - byte[] value = IOUtils.toByteArray(content); + return getIterator(content, finalTopic, finalPartition, finalStartOffset); + } + } - if (value == null) { - if (key.isPresent()) { - throw new IllegalStateException("missing value for key!" + key); - } - return null; - } - S3Partition s3Partition = S3Partition.from(bucketName, s3Prefix, finalTopic, finalPartition); - - long currentOffset; - if (offsets.containsKey(s3Partition)) { - S3Offset s3Offset = offsets.get(s3Partition); - currentOffset = s3Offset.getOffset() + 1; - } else { - currentOffset = currentOffsets.getOrDefault(s3Partition, finalStartOffset); - } - ConsumerRecord<byte[], byte[]> record = new ConsumerRecord<>(finalTopic, finalPartition, - currentOffset, key.orElse(null), value); - currentOffsets.put(s3Partition, currentOffset + 1); + private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> getIterator(final InputStream content, + final String finalTopic, final int finalPartition, final long finalStartOffset) { + return new Iterator<>() { + private Map<S3Partition, Long> currentOffsets = new HashMap<>(); + private Optional<ConsumerRecord<byte[], byte[]>> nextRecord = readNext(); + + private Optional<ConsumerRecord<byte[], byte[]>> readNext() { + try { + Optional<byte[]> key = Optional.empty(); + if (currentKey != null) { + key = Optional.of(currentKey.getBytes(StandardCharsets.UTF_8)); + } + final byte[] value = IOUtils.toByteArray(content); - return record; - } catch (IOException e) { - throw new RuntimeException("Failed to read record from file", e); + if (value == null) { + if (key.isPresent()) { + throw new IllegalStateException("missing value for key!" 
+ key); + } + return Optional.empty(); } + + return getConsumerRecord(key, value); + } catch (IOException e) { + throw new org.apache.kafka.connect.errors.ConnectException( + "Connect converters could not be instantiated.", e); } + } - @Override - public boolean hasNext() { - // Check if there's another record - return nextRecord != null; + private Optional<ConsumerRecord<byte[], byte[]>> getConsumerRecord(final Optional<byte[]> key, + final byte[] value) { + final S3Partition s3Partition = S3Partition.from(bucketName, s3Prefix, finalTopic, finalPartition); + + long currentOffset; + if (offsets.containsKey(s3Partition)) { + final S3Offset s3Offset = offsets.get(s3Partition); + currentOffset = s3Offset.getOffset() + 1; + } else { + currentOffset = currentOffsets.getOrDefault(s3Partition, finalStartOffset); } + final Optional<ConsumerRecord<byte[], byte[]>> record = Optional + .of(new ConsumerRecord<>(finalTopic, finalPartition, currentOffset, key.orElse(null), value)); + currentOffsets.put(s3Partition, currentOffset + 1); + return record; + } - @Override - public ConsumerRecord<byte[], byte[]> next() { - if (nextRecord == null) { - throw new NoSuchElementException(); - } - ConsumerRecord<byte[], byte[]> currentRecord = nextRecord; - nextRecord = null; - return currentRecord; + @Override + public boolean hasNext() { + // Check if there's another record + return nextRecord.isPresent(); + } + + @Override + public Optional<ConsumerRecord<byte[], byte[]>> next() { + if (nextRecord.isEmpty()) { + throw new NoSuchElementException(); } - }; - } + final Optional<ConsumerRecord<byte[], byte[]>> currentRecord = nextRecord; + nextRecord = Optional.empty(); + return currentRecord; + } + }; } private InputStream getContent(final S3Object object) throws IOException { return object.getObjectContent(); } - private S3Offset offset(String topic, int partition) { - return offsets.get(S3Partition.from(bucketName, s3Prefix, topic, partition)); - } - @Override public boolean hasNext() { while (!recordIterator.hasNext() && nextFileIterator.hasNext()) { @@ -192,7 +198,7 @@ public S3SourceRecord next() { if (!hasNext()) { throw new NoSuchElementException(); } - final ConsumerRecord<byte[], byte[]> record = recordIterator.next(); + final ConsumerRecord<byte[], byte[]> record = recordIterator.next().get(); return new S3SourceRecord(S3Partition.from(bucketName, s3Prefix, record.topic(), record.partition()), S3Offset.from(currentKey, record.offset()), record.topic(), record.partition(), record.key(), record.value()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 3c9ee3bda..a1fa54fcd 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -22,8 +22,6 @@ import static java.util.stream.Collectors.toList; import static java.util.stream.Collectors.toMap; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; @@ -36,6 +34,7 @@ import java.util.stream.Collectors; import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.source.SourceTask; import org.apache.kafka.connect.storage.Converter; @@ -84,6 +83,7 @@ public 
String version() { return Version.VERSION; } + @Deprecated @Override public void start(final Map<String, String> props) { LOGGER.info("S3 Source task started."); @@ -94,7 +94,7 @@ public void start(final Map<String, String> props) { keyConverter = Optional.of((Converter) s3SourceConfig.getClass("key.converter").newInstance()); valueConverter = (Converter) s3SourceConfig.getClass("value.converter").newInstance(); } catch (InstantiationException | IllegalAccessException e) { - throw new RuntimeException(e); + throw new ConnectException("Connect converters could not be instantiated.", e); } this.s3Client = s3ClientFactory.createAmazonS3Client(s3SourceConfig); @@ -148,13 +148,6 @@ private Set<String> getOffsetStorageTopic() { } } - private Charset parseEncoding(final S3SourceConfig s3SourceConfig, final String key) { - return Optional.ofNullable(s3SourceConfig.getString(key)) - .map(Object::toString) - .map(Charset::forName) - .orElse(StandardCharsets.UTF_8); - } - @Override public List<SourceRecord> poll() throws InterruptedException { // read up to the configured poll size @@ -198,9 +191,9 @@ private List<SourceRecord> getSourceRecords(final List<SourceRecord> results) th && !stopped.get(); i++) { final S3SourceRecord record = sourceRecordIterator.next(); LOGGER.info(record.offset() + record.getToTopic() + record.partition()); - String topic = record.getToTopic(); - Optional<SchemaAndValue> key = keyConverter.map(c -> c.toConnectData(topic, record.key())); - SchemaAndValue value = valueConverter.toConnectData(topic, record.value()); + final String topic = record.getToTopic(); + final Optional<SchemaAndValue> key = keyConverter.map(c -> c.toConnectData(topic, record.key())); + final SchemaAndValue value = valueConverter.toConnectData(topic, record.value()); results.add(new SourceRecord(record.file().asMap(), record.offset().asMap(), topic, record.partition(), key.map(SchemaAndValue::schema).orElse(null), key.map(SchemaAndValue::value).orElse(null), value.schema(), value.value())); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index b40446373..c620b9184 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -163,7 +163,6 @@ public static ConfigDef configDef() { addAwsStsConfigGroup(configDef); addAwsConfigGroup(configDef); addDeprecatedConfiguration(configDef); - addFileConfiguration(configDef); addS3RetryPolicies(configDef); addOtherConfig(configDef); return configDef; @@ -242,9 +241,6 @@ private static void addS3RetryPolicies(final ConfigDef configDef) { ConfigDef.Width.NONE, AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG); } - private static void addFileConfiguration(final S3SourceConfigDef configDef) { - } - private static void addOffsetStorageConfig(final ConfigDef configDef) { configDef.define(OFFSET_STORAGE_TOPIC_PARTITIONS, ConfigDef.Type.STRING, "0", new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "eg : 0,1", GROUP_OFFSET_TOPIC, 0, ConfigDef.Width.NONE, From 31fb337a1a8d634556470dabd518ee77cfdbea85 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 23 Sep 2024 15:54:01 +0200 Subject: [PATCH 16/90] Update properties --- .../kafka/connect/s3/source/IntegrationTest.java | 15 ++++++++++----- .../connect/s3/source/config/S3SourceConfig.java 
| 8 ++++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index ee09fe3ae..7684b94a4 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -16,7 +16,11 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_ACCESS_KEY_ID_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_ENDPOINT_CONFIG; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OFFSET_STORAGE_TOPIC; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OFFSET_STORAGE_TOPIC_PARTITIONS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.START_MARKER_KEY; @@ -176,16 +180,17 @@ private static void writeToS3(final String topicName, final String testData1, fi saveToS3(TEST_BUCKET_NAME, "", fileName, testFilePath.toFile()); } + @Deprecated private Map<String, String> getConfig(final Map<String, String> config) { config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); - config.put("aws.access.key.id", S3_ACCESS_KEY_ID); - config.put("aws.secret.access.key", S3_SECRET_ACCESS_KEY); - config.put("aws.s3.endpoint", s3Endpoint); + config.put(AWS_ACCESS_KEY_ID_CONFIG, S3_ACCESS_KEY_ID); + config.put(AWS_SECRET_ACCESS_KEY_CONFIG, S3_SECRET_ACCESS_KEY); + config.put(AWS_S3_ENDPOINT_CONFIG, s3Endpoint); config.put(AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET_NAME); - config.put("aws.s3.prefix", s3Prefix); + config.put(AWS_S3_PREFIX_CONFIG, s3Prefix); config.put(START_MARKER_KEY, COMMON_PREFIX); config.put(OFFSET_STORAGE_TOPIC_PARTITIONS, "1,2"); - config.put(OFFSET_STORAGE_TOPIC, "connect-offsets"); + config.put(OFFSET_STORAGE_TOPIC, "connect-storage-offsets"); return config; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index c620b9184..bb8fe68dc 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -108,7 +108,7 @@ final public class S3SourceConfig extends AbstractConfig { public static final String OFFSET_STORAGE_TOPIC_PARTITIONS = "offset.storage.topic.partitions"; public static final String OFFSET_STORAGE_TOPIC = "offset.storage.topic"; - public static final String OFFSET_STORAGE_TOPIC_DEFAULT = "connect-offsets"; + public static final String OFFSET_STORAGE_TOPIC_DEFAULT = "connect-storage-offsets"; public static final String START_MARKER_KEY = "aws.s3.start.marker"; public static final String FETCH_PAGE_SIZE = "aws.s3.fetch.page.size"; @@ -246,12 +246,12 @@ private static void addOffsetStorageConfig(final ConfigDef configDef) { ConfigDef.Importance.MEDIUM, "eg : 0,1", GROUP_OFFSET_TOPIC, 0, 
ConfigDef.Width.NONE, OFFSET_STORAGE_TOPIC_PARTITIONS); configDef.define(OFFSET_STORAGE_TOPIC, ConfigDef.Type.STRING, OFFSET_STORAGE_TOPIC_DEFAULT, - new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "eg : connect-offsets", GROUP_OFFSET_TOPIC, - 0, ConfigDef.Width.NONE, OFFSET_STORAGE_TOPIC); + new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "eg : connect-storage-offsets", + GROUP_OFFSET_TOPIC, 0, ConfigDef.Width.NONE, OFFSET_STORAGE_TOPIC); } private static void addDeprecatedConfiguration(final ConfigDef configDef) { - configDef.define(AWS_S3_PREFIX_CONFIG, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + configDef.define(AWS_S3_PREFIX_CONFIG, ConfigDef.Type.STRING, "prefix", new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "[Deprecated] Use `file.name.template` instead. Prefix for stored objects, e.g. cluster-1/", GROUP_AWS, 0, ConfigDef.Width.NONE, AWS_S3_PREFIX_CONFIG); From d77c3a486f565e82860710e6102f51412f387c96 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Tue, 24 Sep 2024 12:54:38 +0200 Subject: [PATCH 17/90] offset storage fix --- .../connect/s3/source/IntegrationTest.java | 14 ++++----- .../s3/source/S3SourceRecordIterator.java | 3 ++ .../kafka/connect/s3/source/S3SourceTask.java | 31 +++++++++++-------- .../s3/source/config/S3SourceConfig.java | 16 +++++----- ...nector-for-apache-kafka-version.properties | 16 ---------- 5 files changed, 35 insertions(+), 45 deletions(-) delete mode 100644 s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 7684b94a4..59ce35920 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -21,9 +21,9 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_ENDPOINT_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OFFSET_STORAGE_TOPIC; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OFFSET_STORAGE_TOPIC_PARTITIONS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.START_MARKER_KEY; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; import static org.assertj.core.api.Assertions.assertThat; import java.io.File; @@ -130,7 +130,7 @@ void tearDown() { @Test void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME)); + final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectRunner.createConnector(connectorConfig); @@ -157,7 +157,7 @@ void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx @Test void multiPartUploadTest(final TestInfo testInfo) throws ExecutionException, 
InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME)); + final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectRunner.createConnector(connectorConfig); final String partition = "00001"; @@ -181,7 +181,7 @@ private static void writeToS3(final String topicName, final String testData1, fi } @Deprecated - private Map<String, String> getConfig(final Map<String, String> config) { + private Map<String, String> getConfig(final Map<String, String> config, final String topics) { config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); config.put(AWS_ACCESS_KEY_ID_CONFIG, S3_ACCESS_KEY_ID); config.put(AWS_SECRET_ACCESS_KEY_CONFIG, S3_SECRET_ACCESS_KEY); @@ -189,8 +189,8 @@ private Map<String, String> getConfig(final Map<String, String> config) { config.put(AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET_NAME); config.put(AWS_S3_PREFIX_CONFIG, s3Prefix); config.put(START_MARKER_KEY, COMMON_PREFIX); - config.put(OFFSET_STORAGE_TOPIC_PARTITIONS, "1,2"); - config.put(OFFSET_STORAGE_TOPIC, "connect-storage-offsets"); + config.put(TARGET_TOPIC_PARTITIONS, "0,1"); + config.put(TARGET_TOPICS, topics); return config; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java index 1de32ac9a..431419d7d 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java @@ -17,6 +17,7 @@ package io.aiven.kafka.connect.s3.source; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.LOGGER; import java.io.IOException; import java.io.InputStream; @@ -152,11 +153,13 @@ private Optional<ConsumerRecord<byte[], byte[]>> getConsumerRecord(final Optiona long currentOffset; if (offsets.containsKey(s3Partition)) { + LOGGER.info("getConsumerRecord containsKey: " + offsets); final S3Offset s3Offset = offsets.get(s3Partition); currentOffset = s3Offset.getOffset() + 1; } else { currentOffset = currentOffsets.getOrDefault(s3Partition, finalStartOffset); } + LOGGER.info("currentOffset :" + currentOffset); final Optional<ConsumerRecord<byte[], byte[]>> record = Optional .of(new ConsumerRecord<>(finalTopic, finalPartition, currentOffset, key.orElse(null), value)); currentOffsets.put(s3Partition, currentOffset + 1); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index a1fa54fcd..8ac83766c 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -16,9 +16,11 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OFFSET_STORAGE_TOPIC; -import 
static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OFFSET_STORAGE_TOPIC_PARTITIONS; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; import static java.util.stream.Collectors.toList; import static java.util.stream.Collectors.toMap; @@ -103,16 +105,17 @@ public void start(final Map<String, String> props) { prepareReaderFromOffsetStorageReader(); } + @Deprecated private void prepareReaderFromOffsetStorageReader() { - final String s3Prefix = s3SourceConfig.getString("aws.s3.prefix"); - final String s3Bucket = s3SourceConfig.getString("aws.s3.bucket.name"); + final String s3Prefix = s3SourceConfig.getString(AWS_S3_PREFIX_CONFIG); + final String s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); - final Set<Integer> offsetStorageTopicPartitions = getOffsetStorageTopicPartitions(); - final Set<String> offsetStorageTopic = getOffsetStorageTopic(); + final Set<Integer> offsetStorageTopicPartitions = getTargetTopicPartitions(); + final Set<String> targetTopics = getTargetTopics(); // map to s3 partitions final List<S3Partition> s3Partitions = offsetStorageTopicPartitions.stream() - .flatMap(p -> offsetStorageTopic.stream().map(t -> S3Partition.from(s3Bucket, s3Prefix, t, p))) + .flatMap(p -> targetTopics.stream().map(t -> S3Partition.from(s3Bucket, s3Prefix, t, p))) .collect(toList()); // get partition offsets @@ -120,6 +123,9 @@ private void prepareReaderFromOffsetStorageReader() { final Map<Map<String, Object>, Map<String, Object>> offsetMap = context.offsetStorageReader() .offsets(partitions); + LOGGER.info("offsetMap : " + offsetMap); + LOGGER.info("offsetMap entry set : " + offsetMap.entrySet()); + if (offsets == null) { offsets = offsetMap.entrySet() .stream() @@ -127,11 +133,12 @@ private void prepareReaderFromOffsetStorageReader() { .collect( toMap(entry -> S3Partition.from(entry.getKey()), entry -> S3Offset.from(entry.getValue()))); } + LOGGER.info("Storage offsets : " + offsets); sourceRecordIterator = new S3SourceRecordIterator(s3SourceConfig, s3Client, s3Bucket, s3Prefix, offsets); } - private Set<Integer> getOffsetStorageTopicPartitions() { - final String partitionString = s3SourceConfig.getString(OFFSET_STORAGE_TOPIC_PARTITIONS); + private Set<Integer> getTargetTopicPartitions() { + final String partitionString = s3SourceConfig.getString(TARGET_TOPIC_PARTITIONS); if (Objects.nonNull(partitionString)) { return Arrays.stream(partitionString.split(",")).map(Integer::parseInt).collect(Collectors.toSet()); } else { @@ -139,8 +146,8 @@ private Set<Integer> getOffsetStorageTopicPartitions() { } } - private Set<String> getOffsetStorageTopic() { - final String topicString = s3SourceConfig.getString(OFFSET_STORAGE_TOPIC); + private Set<String> getTargetTopics() { + final String topicString = s3SourceConfig.getString(TARGET_TOPICS); if (Objects.nonNull(topicString)) { return Arrays.stream(topicString.split(",")).collect(Collectors.toSet()); } else { @@ -150,14 +157,12 @@ private Set<String> getOffsetStorageTopic() { @Override public List<SourceRecord> poll() throws InterruptedException { - // read up to the configured poll size final List<SourceRecord> results = new ArrayList<>(s3SourceConfig.getInt(MAX_POLL_RECORDS)); if (stopped.get()) { return results; } - // AWS errors will happen. Nothing to do about it but sleep and try again. 
while (!stopped.get()) { try { return getSourceRecords(results); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index bb8fe68dc..b6835435a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -105,10 +105,8 @@ final public class S3SourceConfig extends AbstractConfig { public static final String AWS_S3_SSE_ALGORITHM_CONFIG = "aws.s3.sse.algorithm"; - public static final String OFFSET_STORAGE_TOPIC_PARTITIONS = "offset.storage.topic.partitions"; - public static final String OFFSET_STORAGE_TOPIC = "offset.storage.topic"; - - public static final String OFFSET_STORAGE_TOPIC_DEFAULT = "connect-storage-offsets"; + public static final String TARGET_TOPIC_PARTITIONS = "topic.partitions"; + public static final String TARGET_TOPICS = "topics"; public static final String START_MARKER_KEY = "aws.s3.start.marker"; public static final String FETCH_PAGE_SIZE = "aws.s3.fetch.page.size"; @@ -242,12 +240,12 @@ private static void addS3RetryPolicies(final ConfigDef configDef) { } private static void addOffsetStorageConfig(final ConfigDef configDef) { - configDef.define(OFFSET_STORAGE_TOPIC_PARTITIONS, ConfigDef.Type.STRING, "0", new ConfigDef.NonEmptyString(), + configDef.define(TARGET_TOPIC_PARTITIONS, ConfigDef.Type.STRING, "0", new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "eg : 0,1", GROUP_OFFSET_TOPIC, 0, ConfigDef.Width.NONE, - OFFSET_STORAGE_TOPIC_PARTITIONS); - configDef.define(OFFSET_STORAGE_TOPIC, ConfigDef.Type.STRING, OFFSET_STORAGE_TOPIC_DEFAULT, - new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "eg : connect-storage-offsets", - GROUP_OFFSET_TOPIC, 0, ConfigDef.Width.NONE, OFFSET_STORAGE_TOPIC); + TARGET_TOPIC_PARTITIONS); + configDef.define(TARGET_TOPICS, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "eg : connect-storage-offsets", GROUP_OFFSET_TOPIC, 0, + ConfigDef.Width.NONE, TARGET_TOPICS); } private static void addDeprecatedConfiguration(final ConfigDef configDef) { diff --git a/s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties b/s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties deleted file mode 100644 index 9c2421c8a..000000000 --- a/s3-source-connector/src/main/resources/source-s3-connector-for-apache-kafka-version.properties +++ /dev/null @@ -1,16 +0,0 @@ -## -# Copyright 2024 Aiven Oy -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-## -version=${version ?: 'unknown'} From 3c827e78d1351ebdf8b5b21dc0ecc8d0c6e2333c Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Thu, 26 Sep 2024 16:11:31 +0200 Subject: [PATCH 18/90] Adding avro support, test --- gradle-config/aiven-pmd-test-ruleset.xml | 2 +- gradle-config/spotbugs-exclude.xml | 12 +- s3-source-connector/build.gradle.kts | 4 + .../connect/s3/source/IntegrationBase.java | 33 ++++++ .../connect/s3/source/IntegrationTest.java | 73 ++++++++++-- .../s3/source/SchemaRegistryContainer.java | 53 +++++++++ .../s3/source/OffsetStoragePartitionKey.java | 92 +++++++++++++++ .../source/OffsetStoragePartitionValue.java | 84 ++++++++++++++ .../kafka/connect/s3/source/S3Offset.java | 82 ------------- .../kafka/connect/s3/source/S3Partition.java | 101 ---------------- .../connect/s3/source/S3SourceRecord.java | 19 +-- .../kafka/connect/s3/source/S3SourceTask.java | 24 ++-- ...terator.java => SourceRecordIterator.java} | 109 +++++++++++++----- .../s3/source/config/S3SourceConfig.java | 21 +++- 14 files changed, 465 insertions(+), 244 deletions(-) create mode 100644 s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionKey.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionValue.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Partition.java rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{S3SourceRecordIterator.java => SourceRecordIterator.java} (59%) diff --git a/gradle-config/aiven-pmd-test-ruleset.xml b/gradle-config/aiven-pmd-test-ruleset.xml index 0cc9ca531..264226283 100644 --- a/gradle-config/aiven-pmd-test-ruleset.xml +++ b/gradle-config/aiven-pmd-test-ruleset.xml @@ -78,7 +78,7 @@ </rule> <rule ref="category/java/codestyle.xml/TooManyStaticImports"> <properties> - <property name="maximumStaticImports" value="10" /> + <property name="maximumStaticImports" value="15" /> </properties> </rule> diff --git a/gradle-config/spotbugs-exclude.xml b/gradle-config/spotbugs-exclude.xml index 8e6fe7126..ec10b9b20 100644 --- a/gradle-config/spotbugs-exclude.xml +++ b/gradle-config/spotbugs-exclude.xml @@ -20,11 +20,19 @@ <Bug pattern="CT_CONSTRUCTOR_THROW" /> </Match> <Match> - <Class name="io.aiven.kafka.connect.s3.source.S3SourceRecordIterator$1" /> + <Class name="io.aiven.kafka.connect.s3.source.SourceRecordIterator$1" /> <Bug pattern="CT_CONSTRUCTOR_THROW" /> </Match> <Match> - <Class name="io.aiven.kafka.connect.s3.source.S3SourceRecordIterator" /> + <Class name="io.aiven.kafka.connect.s3.source.OffsetStoragePartitionKey" /> + <Bug pattern="CT_CONSTRUCTOR_THROW" /> + </Match> + <Match> + <Class name="io.aiven.kafka.connect.s3.source.OffsetStoragePartitionValue" /> + <Bug pattern="CT_CONSTRUCTOR_THROW" /> + </Match> + <Match> + <Class name="io.aiven.kafka.connect.s3.source.SourceRecordIterator" /> <Bug pattern="EI_EXPOSE_REP2" /> </Match> diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts index 2505715ea..9e699e72b 100644 --- a/s3-source-connector/build.gradle.kts +++ b/s3-source-connector/build.gradle.kts @@ -69,6 +69,10 @@ dependencies { implementation(tools.spotbugs.annotations) implementation(logginglibs.slf4j) + 
implementation(apache.avro) + implementation(confluent.kafka.connect.avro.converter) { + exclude(group = "org.apache.kafka", module = "kafka-clients") + } testImplementation(compressionlibs.snappy) testImplementation(compressionlibs.zstd.jni) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index b01adbeed..80224f896 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -36,6 +36,7 @@ import org.apache.kafka.clients.consumer.ConsumerRecords; import org.apache.kafka.clients.consumer.KafkaConsumer; import org.apache.kafka.common.serialization.ByteArrayDeserializer; +import org.apache.kafka.common.serialization.StringDeserializer; import com.amazonaws.auth.AWSStaticCredentialsProvider; import com.amazonaws.auth.BasicAWSCredentials; @@ -43,6 +44,8 @@ import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3ClientBuilder; import com.github.dockerjava.api.model.Ulimit; +import io.confluent.kafka.serializers.KafkaAvroDeserializer; +import org.apache.avro.generic.GenericRecord; import org.awaitility.Awaitility; import org.junit.jupiter.api.TestInfo; import org.testcontainers.containers.Container; @@ -145,4 +148,34 @@ static List<String> consumeMessages(final String topic, final int expectedMessag return messages; } } + + static List<GenericRecord> consumeAvroMessages(final String topic, final int expectedMessageCount, + final KafkaContainer kafka, final String schemaRegistryUrl) { + final Properties props = new Properties(); + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, kafka.getBootstrapServers()); + props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group-avro"); + props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); // Assuming string + // key + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, KafkaAvroDeserializer.class.getName()); // Avro + // deserializer + // for values + props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + props.put("schema.registry.url", schemaRegistryUrl); // URL of the schema registry + props.put("specific.avro.reader", "false"); // Use GenericRecord instead of specific Avro classes + + try (KafkaConsumer<String, GenericRecord> consumer = new KafkaConsumer<>(props)) { + consumer.subscribe(Collections.singletonList(topic)); + final List<GenericRecord> recordsList = new ArrayList<>(); + + // Poll messages from the topic + while (recordsList.size() < expectedMessageCount) { + final ConsumerRecords<String, GenericRecord> records = consumer.poll(500L); + for (final ConsumerRecord<String, GenericRecord> record : records) { + recordsList.add(record.value()); // Add the GenericRecord to the list + } + } + + return recordsList; + } + } } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 59ce35920..2a695cca0 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -16,16 +16,19 @@ package io.aiven.kafka.connect.s3.source; 
+import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_ACCESS_KEY_ID_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_ENDPOINT_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.START_MARKER_KEY; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; import static org.assertj.core.api.Assertions.assertThat; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -47,6 +50,12 @@ import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.PutObjectRequest; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumWriter; import org.apache.commons.io.IOUtils; import org.junit.Ignore; import org.junit.jupiter.api.AfterEach; @@ -87,6 +96,9 @@ final class IntegrationTest implements IntegrationBase { @Container private static final KafkaContainer KAFKA_CONTAINER = IntegrationBase.createKafkaContainer(); + + @Container + private static final SchemaRegistryContainer SCHEMA_REGISTRY = new SchemaRegistryContainer(KAFKA_CONTAINER); private AdminClient adminClient; private ConnectRunner connectRunner; @@ -138,8 +150,8 @@ void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx final String testData2 = "Hello, Kafka Connect S3 Source! 
object 2"; // write 2 objects to s3 - writeToS3(topicName, testData1, 1); - writeToS3(topicName, testData2, 2); + writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), 1); + writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), 2); final List<String> objects = testBucketAccessor.listObjects(); assertThat(objects.size()).isEqualTo(2); @@ -155,7 +167,7 @@ void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx } @Test - void multiPartUploadTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { + void multiPartUploadTest(final TestInfo testInfo) throws ExecutionException, InterruptedException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); @@ -169,13 +181,61 @@ void multiPartUploadTest(final TestInfo testInfo) throws ExecutionException, Int assertThat(records.get(0)).contains("performanceeeqjz"); } - private static void writeToS3(final String topicName, final String testData1, final int offsetId) + @Test + void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { + final var topicName = IntegrationBase.topicName(testInfo); + final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + connectorConfig.put(OUTPUT_FORMAT, AVRO_OUTPUT_FORMAT); + connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); + + connectRunner.createConnector(connectorConfig); + + // Define Avro schema + final String schemaJson = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" + + " \"fields\": [\n" + " {\"name\": \"message\", \"type\": \"string\"},\n" + + " {\"name\": \"id\", \"type\": \"int\"}\n" + " ]\n" + "}"; + final Schema.Parser parser = new Schema.Parser(); + final Schema schema = parser.parse(schemaJson); + + // Create Avro records + final GenericRecord avroRecord = new GenericData.Record(schema); + avroRecord.put("message", "Hello, Kafka Connect S3 Source! object 1"); + avroRecord.put("id", 1); + + // Serialize Avro records to byte arrays + final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema); + try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) { + dataFileWriter.create(schema, outputStream); + dataFileWriter.append(avroRecord); + dataFileWriter.flush(); + } + + writeToS3(topicName, outputStream.toByteArray(), 1); + outputStream.close(); + + final List<String> objects = testBucketAccessor.listObjects(); + assertThat(objects.size()).isEqualTo(1); + + // Verify that the connector is correctly set up + assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); + + // Poll Avro messages from the Kafka topic and deserialize them + final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 2, KAFKA_CONTAINER, + SCHEMA_REGISTRY.getSchemaRegistryUrl()); // Ensure this method deserializes Avro + + // Verify that the correct data is read from the S3 bucket and pushed to Kafka + assertThat(records).extracting(record -> record.get("message").toString()) + .contains("Hello, Kafka Connect S3 Source! 
object 1"); + } + + private static void writeToS3(final String topicName, final byte[] testDataBytes, final int offsetId) throws IOException { final String partition = "00000"; final String offset = "00000000012" + offsetId; final String fileName = topicName + "-" + partition + "-" + offset + ".txt"; final Path testFilePath = Paths.get("/tmp/" + fileName); - Files.write(testFilePath, testData1.getBytes(StandardCharsets.UTF_8)); + Files.write(testFilePath, testDataBytes); saveToS3(TEST_BUCKET_NAME, "", fileName, testFilePath.toFile()); } @@ -188,7 +248,6 @@ private Map<String, String> getConfig(final Map<String, String> config, final St config.put(AWS_S3_ENDPOINT_CONFIG, s3Endpoint); config.put(AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET_NAME); config.put(AWS_S3_PREFIX_CONFIG, s3Prefix); - config.put(START_MARKER_KEY, COMMON_PREFIX); config.put(TARGET_TOPIC_PARTITIONS, "0,1"); config.put(TARGET_TOPICS, topics); return config; diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java new file mode 100644 index 000000000..a27864718 --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java @@ -0,0 +1,53 @@ +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source; + +import java.util.List; + +import com.github.dockerjava.api.model.Ulimit; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.KafkaContainer; +import org.testcontainers.utility.Base58; + +public final class SchemaRegistryContainer extends GenericContainer<SchemaRegistryContainer> { + public static final int SCHEMA_REGISTRY_PORT = 8081; + + public SchemaRegistryContainer(final KafkaContainer kafka) { + this("5.0.4", kafka); + } + + public SchemaRegistryContainer(final String confluentPlatformVersion, final KafkaContainer kafka) { + super("confluentinc/cp-schema-registry:" + confluentPlatformVersion); + + dependsOn(kafka); + withNetwork(kafka.getNetwork()); + withNetworkAliases("schema-registry-" + Base58.randomString(6)); + + withEnv("SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS", + String.format("PLAINTEXT://%s:%s", kafka.getNetworkAliases().get(0), 9092)); + + withExposedPorts(SCHEMA_REGISTRY_PORT); + withEnv("SCHEMA_REGISTRY_HOST_NAME", "localhost"); + + withCreateContainerCmdModifier( + cmd -> cmd.getHostConfig().withUlimits(List.of(new Ulimit("nofile", 30_000L, 30_000L)))); + } + + public String getSchemaRegistryUrl() { + return String.format("http://%s:%s", getHost(), getMappedPort(SCHEMA_REGISTRY_PORT)); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionKey.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionKey.java new file mode 100644 index 000000000..6ac7359d9 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionKey.java @@ -0,0 +1,92 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source; + +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; + +public class OffsetStoragePartitionKey { + + public static final String BUCKET_NAME = "bucketName"; + public static final String TOPIC = "topic"; + public static final String TOPIC_PARTITION = "topicPartition"; + private final String s3BucketName; + private final String topic; + private final int partition; + + public OffsetStoragePartitionKey(final String s3BucketName, final String topic, final int partition) { + if (s3BucketName == null || s3BucketName.isEmpty()) { + throw new IllegalArgumentException("S3 bucket name cannot be null or empty"); + } + if (topic == null || topic.isEmpty()) { + throw new IllegalArgumentException("Topic cannot be null or empty"); + } + if (partition < 0) { + throw new IllegalArgumentException("Partition must be a non-negative integer"); + } + + this.s3BucketName = s3BucketName; + this.topic = topic; + this.partition = partition; + } + + public static OffsetStoragePartitionKey fromPartitionMap(final String bucket, final String topic, + final int partition) { + return new OffsetStoragePartitionKey(bucket, topic, partition); + } + + public static OffsetStoragePartitionKey fromPartitionMap(final Map<String, Object> map) { + Objects.requireNonNull(map, "Input map cannot be null"); + final String bucket = (String) map.getOrDefault(BUCKET_NAME, ""); + final String topic = (String) map.getOrDefault(TOPIC, ""); + final int partition = ((Number) map.getOrDefault(TOPIC_PARTITION, -1)).intValue(); + return fromPartitionMap(bucket, topic, partition); + } + + public Map<String, Object> toPartitionMap() { + final Map<String, Object> map = new HashMap<>(); + map.put(BUCKET_NAME, s3BucketName); + map.put(TOPIC, topic); + map.put(TOPIC_PARTITION, partition); + return map; + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + final OffsetStoragePartitionKey other = (OffsetStoragePartitionKey) obj; + return partition == other.partition && Objects.equals(s3BucketName, other.s3BucketName) + && Objects.equals(topic, other.topic); + } + + @Override + public int hashCode() { + return Objects.hash(s3BucketName, topic, partition); + } + + @Override + public String toString() { + return String.format("OffsetStoragePartitionKey{bucketName='%s', topic='%s', partition=%d}", s3BucketName, + topic, partition); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionValue.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionValue.java new file mode 100644 index 000000000..5fa965ffa --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionValue.java @@ -0,0 +1,84 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
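To make the contract of `OffsetStoragePartitionKey` above concrete, a small round-trip sketch (assuming the class is in scope); bucket and topic names are invented for the example:

```java
import java.util.Map;

public class PartitionKeyRoundTripExample {
    public static void main(final String[] args) {
        final OffsetStoragePartitionKey key = OffsetStoragePartitionKey.fromPartitionMap("test-bucket",
                "basic-topic", 0);

        // The map form is what is handed to Kafka Connect as the source partition,
        // e.g. {bucketName=test-bucket, topic=basic-topic, topicPartition=0} (iteration order may vary).
        final Map<String, Object> partitionMap = key.toPartitionMap();

        // Reading the map back yields an equal key, which is what makes later offset lookups match.
        final OffsetStoragePartitionKey restored = OffsetStoragePartitionKey.fromPartitionMap(partitionMap);
        System.out.println(key.equals(restored)); // true
        System.out.println(partitionMap);
    }
}
```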
+ */ + +package io.aiven.kafka.connect.s3.source; + +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; + +public class OffsetStoragePartitionValue implements Comparable<OffsetStoragePartitionValue> { + + public static final String ORIGINAL_OFFSET = "storedOriginalOffset"; + private final long offset; + + public OffsetStoragePartitionValue(final long offset) { + if (offset < 0) { + throw new IllegalArgumentException("Offset cannot be negative"); + } + this.offset = offset; + } + + public static OffsetStoragePartitionValue fromOffsetMap(final long offset) { + return new OffsetStoragePartitionValue(offset); + } + + public static OffsetStoragePartitionValue fromOffsetMap(final Map<String, Object> map) { + Objects.requireNonNull(map, "Input map cannot be null"); + final Object offsetValue = map.get(ORIGINAL_OFFSET); + if (!(offsetValue instanceof Number)) { + throw new IllegalArgumentException("Original offset must be a valid number"); + } + return fromOffsetMap(((Number) offsetValue).longValue()); + } + + @Override + public String toString() { + return String.valueOf(offset); + } + + @Override + public int compareTo(final OffsetStoragePartitionValue other) { + return Long.compare(this.offset, other.offset); + } + + public Map<String, ?> asOffsetMap() { + final Map<String, Object> map = new HashMap<>(); + map.put(ORIGINAL_OFFSET, offset); + return map; + } + + public long getOffset() { + return offset; + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + final OffsetStoragePartitionValue other = (OffsetStoragePartitionValue) obj; + return offset == other.offset; + } + + @Override + public int hashCode() { + return Objects.hash(offset); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java deleted file mode 100644 index 3b44eb805..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Offset.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
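Similarly for `OffsetStoragePartitionValue`: the stored state is a single long under the `storedOriginalOffset` key, and the iterator resumes one past it. A minimal sketch (assuming the class is in scope); the offset value is an arbitrary example:

```java
import java.util.Map;

public class OffsetValueRoundTripExample {
    public static void main(final String[] args) {
        // What the connector might read back from Connect's offset storage for one partition.
        final Map<String, Object> stored = Map.of(OffsetStoragePartitionValue.ORIGINAL_OFFSET, 121L);

        final OffsetStoragePartitionValue value = OffsetStoragePartitionValue.fromOffsetMap(stored);
        System.out.println(value.getOffset());   // 121
        System.out.println(value.asOffsetMap()); // {storedOriginalOffset=121}

        // The record iterator continues from the next offset for that partition.
        final long nextOffset = value.getOffset() + 1; // 122
        System.out.println(nextOffset);
    }
}
```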
- */ - -package io.aiven.kafka.connect.s3.source; - -import java.util.HashMap; -import java.util.Map; -import java.util.Objects; - -public class S3Offset implements Comparable<S3Offset> { - - private final String s3key; - - private final long offset; - - public S3Offset(final String s3key, final long offset) { - this.s3key = s3key; - this.offset = offset; - } - - public static S3Offset from(final String s3key, final long offset) { - return new S3Offset(s3key, offset); - } - - public static S3Offset from(final Map<String, Object> map) { - return from((String) map.get("s3key"), (Long) map.get("originalOffset")); - } - - @Override - public String toString() { - return s3key + "@" + offset; - } - - @Override - public int compareTo(final S3Offset s3Offset) { - final int compareTo = s3key.compareTo(s3Offset.s3key); - return compareTo == 0 ? (int) (offset - s3Offset.offset) : compareTo; - } - - public Map<String, ?> asMap() { - final Map<String, Object> map = new HashMap<>(); - map.put("s3key", s3key); - map.put("originalOffset", offset); - return map; - } - - public long getOffset() { - return offset; - } - - // Overriding equals to ensure consistency with compareTo - @Override - public boolean equals(final Object obj) { - if (this == obj) { - return true; - } - if (obj == null || getClass() != obj.getClass()) { - return false; - } - final S3Offset other = (S3Offset) obj; - return offset == other.offset && Objects.equals(s3key, other.s3key); - } - - // Overriding hashCode to match equals - @Override - public int hashCode() { - return Objects.hash(s3key, offset); - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Partition.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Partition.java deleted file mode 100644 index 322344883..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3Partition.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.aiven.kafka.connect.s3.source; - -import java.util.HashMap; -import java.util.Map; -import java.util.Objects; - -public class S3Partition { - - private final String bucket; - private final String keyPrefix; - private final String topic; - private final int partition; - - public S3Partition(final String bucket, final String keyPrefix, final String topic, final int partition) { - this.bucket = bucket; - this.keyPrefix = normalizePrefix(keyPrefix); - this.topic = topic; - this.partition = partition; - } - - public static S3Partition from(final String bucket, final String keyPrefix, final String topic, - final int partition) { - return new S3Partition(bucket, keyPrefix, topic, partition); - } - - public static S3Partition from(final Map<String, Object> map) { - final String bucket = (String) map.get("bucket"); - final String keyPrefix = (String) map.get("keyPrefix"); - final String topic = (String) map.get("topic"); - final int partition = ((Number) map.get("kafkaPartition")).intValue(); - return from(bucket, keyPrefix, topic, partition); - } - - public static String normalizePrefix(final String keyPrefix) { - return keyPrefix == null ? "" : keyPrefix.endsWith("/") ? keyPrefix : keyPrefix + "/"; - } - - public Map<String, Object> asMap() { - final Map<String, Object> map = new HashMap<>(); - map.put("bucket", bucket); - map.put("keyPrefix", keyPrefix); - map.put("topic", topic); - map.put("kafkaPartition", partition); - return map; - } - - public String getBucket() { - return bucket; - } - - public String getKeyPrefix() { - return keyPrefix; - } - - public String getTopic() { - return topic; - } - - public int getPartition() { - return partition; - } - - @Override - public boolean equals(final Object s3Partition) { - if (this == s3Partition) { - return true; - } - if (s3Partition == null || getClass() != s3Partition.getClass()) { - return false; - } - final S3Partition thatS3Partition = (S3Partition) s3Partition; - return partition == thatS3Partition.partition && Objects.equals(bucket, thatS3Partition.bucket) - && Objects.equals(keyPrefix, thatS3Partition.keyPrefix) && Objects.equals(topic, thatS3Partition.topic); - } - - @Override - public int hashCode() { - return Objects.hash(bucket, keyPrefix, topic, partition); - } - - @Override - public String toString() { - return bucket + "/" + keyPrefix + "/" + topic + "-" + partition; - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java index cff16264e..156859bd2 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java @@ -19,29 +19,30 @@ import java.util.Arrays; public class S3SourceRecord { - private final S3Partition s3Partition; - private final S3Offset s3Offset; + private final OffsetStoragePartitionKey offsetStoragePartitionKey; + private final OffsetStoragePartitionValue offsetStoragePartitionValue; private final String toTopic; private final int topicPartition; private final byte[] recordKey; private final byte[] recordValue; - public S3SourceRecord(final S3Partition s3Partition, final S3Offset s3Offset, final String toTopic, + public S3SourceRecord(final OffsetStoragePartitionKey offsetStoragePartitionKey, + final OffsetStoragePartitionValue offsetStoragePartitionValue, final String toTopic, final int topicPartition, final byte[] recordKey, final 
byte[] recordValue) { - this.s3Partition = s3Partition; - this.s3Offset = s3Offset; + this.offsetStoragePartitionKey = offsetStoragePartitionKey; + this.offsetStoragePartitionValue = offsetStoragePartitionValue; this.toTopic = toTopic; this.topicPartition = topicPartition; this.recordKey = Arrays.copyOf(recordKey, recordKey.length); this.recordValue = Arrays.copyOf(recordValue, recordValue.length); } - public S3Partition file() { - return s3Partition; + public OffsetStoragePartitionKey getOffsetStoragePartitionKey() { + return offsetStoragePartitionKey; } - public S3Offset offset() { - return s3Offset; + public OffsetStoragePartitionValue getOffsetStoragePartitionValue() { + return offsetStoragePartitionValue; } public String getToTopic() { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 8ac83766c..6a9e1b11c 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -17,7 +17,6 @@ package io.aiven.kafka.connect.s3.source; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; @@ -59,7 +58,7 @@ public class S3SourceTask extends SourceTask { private S3SourceConfig s3SourceConfig; - private Map<S3Partition, S3Offset> offsets; + private Map<OffsetStoragePartitionKey, OffsetStoragePartitionValue> offsets; Iterator<S3SourceRecord> sourceRecordIterator; @@ -107,19 +106,21 @@ public void start(final Map<String, String> props) { @Deprecated private void prepareReaderFromOffsetStorageReader() { - final String s3Prefix = s3SourceConfig.getString(AWS_S3_PREFIX_CONFIG); final String s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); final Set<Integer> offsetStorageTopicPartitions = getTargetTopicPartitions(); final Set<String> targetTopics = getTargetTopics(); // map to s3 partitions - final List<S3Partition> s3Partitions = offsetStorageTopicPartitions.stream() - .flatMap(p -> targetTopics.stream().map(t -> S3Partition.from(s3Bucket, s3Prefix, t, p))) + final List<OffsetStoragePartitionKey> offsetStoragePartitionKeys = offsetStorageTopicPartitions.stream() + .flatMap( + p -> targetTopics.stream().map(t -> OffsetStoragePartitionKey.fromPartitionMap(s3Bucket, t, p))) .collect(toList()); // get partition offsets - final List<Map<String, Object>> partitions = s3Partitions.stream().map(S3Partition::asMap).collect(toList()); + final List<Map<String, Object>> partitions = offsetStoragePartitionKeys.stream() + .map(OffsetStoragePartitionKey::toPartitionMap) + .collect(toList()); final Map<Map<String, Object>, Map<String, Object>> offsetMap = context.offsetStorageReader() .offsets(partitions); @@ -130,11 +131,11 @@ private void prepareReaderFromOffsetStorageReader() { offsets = offsetMap.entrySet() .stream() .filter(e -> e.getValue() != null) - .collect( - toMap(entry -> S3Partition.from(entry.getKey()), entry -> S3Offset.from(entry.getValue()))); + .collect(toMap(entry -> OffsetStoragePartitionKey.fromPartitionMap(entry.getKey()), + 
entry -> OffsetStoragePartitionValue.fromOffsetMap(entry.getValue()))); } LOGGER.info("Storage offsets : " + offsets); - sourceRecordIterator = new S3SourceRecordIterator(s3SourceConfig, s3Client, s3Bucket, s3Prefix, offsets); + sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, s3Client, s3Bucket, offsets); } private Set<Integer> getTargetTopicPartitions() { @@ -195,11 +196,12 @@ private List<SourceRecord> getSourceRecords(final List<SourceRecord> results) th for (int i = 0; sourceRecordIterator.hasNext() && i < s3SourceConfig.getInt(MAX_POLL_RECORDS) && !stopped.get(); i++) { final S3SourceRecord record = sourceRecordIterator.next(); - LOGGER.info(record.offset() + record.getToTopic() + record.partition()); + LOGGER.info(record.getOffsetStoragePartitionValue() + record.getToTopic() + record.partition()); final String topic = record.getToTopic(); final Optional<SchemaAndValue> key = keyConverter.map(c -> c.toConnectData(topic, record.key())); final SchemaAndValue value = valueConverter.toConnectData(topic, record.value()); - results.add(new SourceRecord(record.file().asMap(), record.offset().asMap(), topic, record.partition(), + results.add(new SourceRecord(record.getOffsetStoragePartitionKey().toPartitionMap(), + record.getOffsetStoragePartitionValue().asOffsetMap(), topic, record.partition(), key.map(SchemaAndValue::schema).orElse(null), key.map(SchemaAndValue::value).orElse(null), value.schema(), value.value())); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java similarity index 59% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java index 431419d7d..47cd2a743 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java @@ -16,9 +16,12 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.LOGGER; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; @@ -44,8 +47,17 @@ import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectSummary; import com.amazonaws.util.IOUtils; +import io.confluent.kafka.serializers.KafkaAvroSerializer; +import org.apache.avro.file.DataFileReader; +import org.apache.avro.file.SeekableByteArrayInput; +import org.apache.avro.file.SeekableInput; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumReader; +import org.apache.avro.io.DecoderFactory; -public final class S3SourceRecordIterator implements Iterator<S3SourceRecord> { +@SuppressWarnings("PMD.ExcessiveImports") +public final class SourceRecordIterator implements Iterator<S3SourceRecord> { public static final Pattern DEFAULT_PATTERN = Pattern 
.compile("(?<topic>[^/]+?)-" + "(?<partition>\\d{5})-" + "(?<offset>\\d{12})" + "\\.(?<extension>[^.]+)$"); @@ -53,20 +65,18 @@ public final class S3SourceRecordIterator implements Iterator<S3SourceRecord> { private Iterator<S3ObjectSummary> nextFileIterator; private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> recordIterator = Collections.emptyIterator(); - private final Map<S3Partition, S3Offset> offsets; + private final Map<OffsetStoragePartitionKey, OffsetStoragePartitionValue> offsets; private final S3SourceConfig s3SourceConfig; private final String bucketName; - private final String s3Prefix; private final AmazonS3 s3Client; - public S3SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, - final String s3Prefix, final Map<S3Partition, S3Offset> offsets) { + public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, + final Map<OffsetStoragePartitionKey, OffsetStoragePartitionValue> offsets) { this.s3SourceConfig = s3SourceConfig; this.offsets = Optional.ofNullable(offsets).orElseGet(HashMap::new); this.s3Client = s3Client; this.bucketName = bucketName; - this.s3Prefix = s3Prefix; try { final List<S3ObjectSummary> chunks = fetchObjectSummaries(s3Client); nextFileIterator = chunks.iterator(); @@ -77,8 +87,6 @@ public S3SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS private List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOException { final ObjectListing objectListing = s3Client.listObjects(new ListObjectsRequest().withBucketName(bucketName) - // .withPrefix(s3Prefix) - // .withMarker(s3SourceConfig.getString(START_MARKER_KEY)) .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * 2)); return new ArrayList<>(objectListing.getObjectSummaries()); @@ -115,14 +123,21 @@ private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> createIteratorForCurr final String finalTopic = topic; final int finalPartition = partition; final long finalStartOffset = startOffset; - return getIterator(content, finalTopic, finalPartition, finalStartOffset); + if (s3SourceConfig.getString(OUTPUT_FORMAT).equals(AVRO_OUTPUT_FORMAT)) { + final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); + DecoderFactory.get().binaryDecoder(content, null); + return getIterator(content, finalTopic, finalPartition, finalStartOffset, datumReader, true); + } else { + return getIterator(content, finalTopic, finalPartition, finalStartOffset, null, false); + } } } private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> getIterator(final InputStream content, - final String finalTopic, final int finalPartition, final long finalStartOffset) { + final String finalTopic, final int finalPartition, final long finalStartOffset, + final DatumReader<GenericRecord> datumReader, final boolean isAvro) { return new Iterator<>() { - private Map<S3Partition, Long> currentOffsets = new HashMap<>(); + private Map<OffsetStoragePartitionKey, Long> currentOffsets = new HashMap<>(); private Optional<ConsumerRecord<byte[], byte[]>> nextRecord = readNext(); private Optional<ConsumerRecord<byte[], byte[]>> readNext() { @@ -131,7 +146,8 @@ private Optional<ConsumerRecord<byte[], byte[]>> readNext() { if (currentKey != null) { key = Optional.of(currentKey.getBytes(StandardCharsets.UTF_8)); } - final byte[] value = IOUtils.toByteArray(content); + byte[] value; + value = getBytes(isAvro, content, datumReader, finalTopic); if (value == null) { if (key.isPresent()) { @@ 
-149,26 +165,25 @@ private Optional<ConsumerRecord<byte[], byte[]>> readNext() { private Optional<ConsumerRecord<byte[], byte[]>> getConsumerRecord(final Optional<byte[]> key, final byte[] value) { - final S3Partition s3Partition = S3Partition.from(bucketName, s3Prefix, finalTopic, finalPartition); + final OffsetStoragePartitionKey offsetStoragePartitionKey = OffsetStoragePartitionKey + .fromPartitionMap(bucketName, finalTopic, finalPartition); long currentOffset; - if (offsets.containsKey(s3Partition)) { - LOGGER.info("getConsumerRecord containsKey: " + offsets); - final S3Offset s3Offset = offsets.get(s3Partition); - currentOffset = s3Offset.getOffset() + 1; + if (offsets.containsKey(offsetStoragePartitionKey)) { + final OffsetStoragePartitionValue offsetStoragePartitionValue = offsets + .get(offsetStoragePartitionKey); + currentOffset = offsetStoragePartitionValue.getOffset() + 1; } else { - currentOffset = currentOffsets.getOrDefault(s3Partition, finalStartOffset); + currentOffset = currentOffsets.getOrDefault(offsetStoragePartitionKey, finalStartOffset); } - LOGGER.info("currentOffset :" + currentOffset); final Optional<ConsumerRecord<byte[], byte[]>> record = Optional .of(new ConsumerRecord<>(finalTopic, finalPartition, currentOffset, key.orElse(null), value)); - currentOffsets.put(s3Partition, currentOffset + 1); + currentOffsets.put(offsetStoragePartitionKey, currentOffset + 1); return record; } @Override public boolean hasNext() { - // Check if there's another record return nextRecord.isPresent(); } @@ -184,7 +199,48 @@ public Optional<ConsumerRecord<byte[], byte[]>> next() { }; } - private InputStream getContent(final S3Object object) throws IOException { + private byte[] getBytes(final boolean isAvro, final InputStream content, + final DatumReader<GenericRecord> datumReader, final String topicName) throws IOException { + byte[] value; + if (isAvro) { + List<GenericRecord> items; + try (SeekableInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { + try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { + items = new ArrayList<>(); + reader.forEach(items::add); + } + } + value = serializeAvroRecordToBytes(items, topicName); + } else { + value = IOUtils.toByteArray(content); + } + return value; + } + + private byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String finalTopic) + throws IOException { + // Create a map to configure the Avro serializer + final Map<String, String> config = new HashMap<>(); + config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); // Replace with your Schema + // Registry URL + + try (KafkaAvroSerializer avroSerializer = new KafkaAvroSerializer()) { + avroSerializer.configure(config, false); // `false` since this is for value serialization + // Use a ByteArrayOutputStream to combine the serialized records + final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + + // Loop through each Avro record and serialize it + for (final GenericRecord avroRecord : avroRecords) { + final byte[] serializedRecord = avroSerializer.serialize(finalTopic, avroRecord); + outputStream.write(serializedRecord); + } + + // Convert the combined output stream to a byte array and return it + return outputStream.toByteArray(); + } + } + + private InputStream getContent(final S3Object object) { return object.getObjectContent(); } @@ -202,9 +258,10 @@ public S3SourceRecord next() { throw new NoSuchElementException(); } final ConsumerRecord<byte[], byte[]> record 
= recordIterator.next().get(); - return new S3SourceRecord(S3Partition.from(bucketName, s3Prefix, record.topic(), record.partition()), - S3Offset.from(currentKey, record.offset()), record.topic(), record.partition(), record.key(), - record.value()); + return new S3SourceRecord( + OffsetStoragePartitionKey.fromPartitionMap(bucketName, record.topic(), record.partition()), + OffsetStoragePartitionValue.fromOffsetMap(record.offset()), record.topic(), record.partition(), + record.key(), record.value()); } @Override diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index b6835435a..9412f8539 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -94,6 +94,8 @@ final public class S3SourceConfig extends AbstractConfig { @Deprecated public static final String AWS_S3_PREFIX = "aws_s3_prefix"; + public static final String SCHEMA_REGISTRY_URL = "schema.registry.url"; + public static final String AWS_ACCESS_KEY_ID_CONFIG = "aws.access.key.id"; public static final String AWS_SECRET_ACCESS_KEY_CONFIG = "aws.secret.access.key"; @@ -117,6 +119,9 @@ final public class S3SourceConfig extends AbstractConfig { public static final String VALUE_CONVERTER = "value.converter"; public static final int S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT = 3; + public static final String OUTPUT_FORMAT = "output.format"; + + public static final String AVRO_OUTPUT_FORMAT = "avro"; public S3SourceConfig(final Map<String, String> properties) { super(configDef(), preprocessProperties(properties)); @@ -157,6 +162,7 @@ private static Map<String, String> handleDeprecatedYyyyUppercase(final Map<Strin public static ConfigDef configDef() { final var configDef = new S3SourceConfigDef(); + addSchemaRegistryGroup(configDef); addOffsetStorageConfig(configDef); addAwsStsConfigGroup(configDef); addAwsConfigGroup(configDef); @@ -166,6 +172,16 @@ public static ConfigDef configDef() { return configDef; } + private static void addSchemaRegistryGroup(final ConfigDef configDef) { + int srCounter = 0; + configDef.define(SCHEMA_REGISTRY_URL, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", GROUP_AWS, srCounter++, ConfigDef.Width.NONE, + SCHEMA_REGISTRY_URL); + configDef.define(OUTPUT_FORMAT, ConfigDef.Type.STRING, "bytearray", new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "Output format avro/bytearray", GROUP_AWS, srCounter++, // NOPMD + ConfigDef.Width.NONE, OUTPUT_FORMAT); + } + private static void addOtherConfig(final S3SourceConfigDef configDef) { int awsOtherGroupCounter = 0; configDef.define(FETCH_PAGE_SIZE, ConfigDef.Type.INT, 10, ConfigDef.Range.atLeast(1), @@ -298,11 +314,6 @@ private static void addAwsConfigGroup(final ConfigDef configDef) { ConfigDef.Importance.MEDIUM, "AWS S3 Region, e.g. us-east-1", GROUP_AWS, awsGroupCounter++, // NOPMD // UnusedAssignment ConfigDef.Width.NONE, AWS_S3_REGION); - - configDef.define(START_MARKER_KEY, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "AWS S3 Start marker, e.g. 
prefix", GROUP_AWS, awsGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, START_MARKER_KEY); } protected static class AwsRegionValidator implements ConfigDef.Validator { From 8c2b87a55d12313478f860c94db813f9c7f1a68f Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Fri, 27 Sep 2024 17:17:28 +0200 Subject: [PATCH 19/90] Update avro format --- .../connect/s3/source/IntegrationTest.java | 73 ++++++++++++++----- .../kafka/connect/s3/source/S3SourceTask.java | 12 +++ .../s3/source/SourceRecordIterator.java | 27 ++++--- .../s3/source/config/S3SourceConfig.java | 10 +++ 4 files changed, 96 insertions(+), 26 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 2a695cca0..de77faa5e 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -23,9 +23,11 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.PARQUET_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; import static org.assertj.core.api.Assertions.assertThat; import java.io.ByteArrayOutputStream; @@ -72,6 +74,7 @@ @Ignore @Testcontainers +@SuppressWarnings("PMD.ExcessiveImports") final class IntegrationTest implements IntegrationBase { private static final Logger LOGGER = LoggerFactory.getLogger(IntegrationTest.class); @@ -187,6 +190,9 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectorConfig.put(OUTPUT_FORMAT, AVRO_OUTPUT_FORMAT); connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); + connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); + connectorConfig.put("value.converter.schema.registry.url", SCHEMA_REGISTRY.getSchemaRegistryUrl()); + connectorConfig.put(VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); connectRunner.createConnector(connectorConfig); @@ -197,25 +203,14 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc final Schema.Parser parser = new Schema.Parser(); final Schema schema = parser.parse(schemaJson); - // Create Avro records - final GenericRecord avroRecord = new GenericData.Record(schema); - avroRecord.put("message", "Hello, Kafka Connect S3 Source! 
object 1"); - avroRecord.put("id", 1); - - // Serialize Avro records to byte arrays - final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); - final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema); - try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) { - dataFileWriter.create(schema, outputStream); - dataFileWriter.append(avroRecord); - dataFileWriter.flush(); - } + final ByteArrayOutputStream outputStream1 = getAvroRecord(schema, 1); + final ByteArrayOutputStream outputStream2 = getAvroRecord(schema, 2); - writeToS3(topicName, outputStream.toByteArray(), 1); - outputStream.close(); + writeToS3(topicName, outputStream1.toByteArray(), 1); + writeToS3(topicName, outputStream2.toByteArray(), 2); final List<String> objects = testBucketAccessor.listObjects(); - assertThat(objects.size()).isEqualTo(1); + assertThat(objects.size()).isEqualTo(2); // Verify that the connector is correctly set up assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); @@ -226,7 +221,51 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc // Verify that the correct data is read from the S3 bucket and pushed to Kafka assertThat(records).extracting(record -> record.get("message").toString()) - .contains("Hello, Kafka Connect S3 Source! object 1"); + .contains("Hello, Kafka Connect S3 Source! object 1") + .contains("Hello, Kafka Connect S3 Source! object 2"); + assertThat(records).extracting(record -> record.get("id").toString()).contains("1").contains("2"); + } + + @Test + @Ignore + void parquetTest(final TestInfo testInfo) throws ExecutionException, InterruptedException { + final var topicName = IntegrationBase.topicName(testInfo); + final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + connectorConfig.put(OUTPUT_FORMAT, PARQUET_OUTPUT_FORMAT); + connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); + + final String partition = "00000"; + final String offset = "000000000123"; + final String fileName = topicName + "-" + partition + "-" + offset + ".txt"; + + connectRunner.createConnector(connectorConfig); + try (InputStream resourceStream = Thread.currentThread() + .getContextClassLoader() + .getResourceAsStream("sample1.parquet")) { + s3Client.putObject(TEST_BUCKET_NAME, fileName, resourceStream, null); + } catch (final Exception e) { // NOPMD broad exception catched + LOGGER.error("Error in reading file" + e.getMessage()); + } + // TODO + assertThat(1).isEqualTo(1); + } + + private static ByteArrayOutputStream getAvroRecord(final Schema schema, final int messageId) throws IOException { + // Create Avro records + final GenericRecord avroRecord = new GenericData.Record(schema); + avroRecord.put("message", "Hello, Kafka Connect S3 Source! 
object " + messageId); + avroRecord.put("id", messageId); + + // Serialize Avro records to byte arrays + final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema); + try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) { + dataFileWriter.create(schema, outputStream); + dataFileWriter.append(avroRecord); + dataFileWriter.flush(); + } + outputStream.close(); + return outputStream; } private static void writeToS3(final String topicName, final byte[] testDataBytes, final int offsetId) diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 6a9e1b11c..cca714283 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -18,6 +18,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; import static java.util.stream.Collectors.toList; @@ -25,6 +26,7 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -34,6 +36,8 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.source.SourceRecord; @@ -52,6 +56,7 @@ * S3SourceTask is a Kafka Connect SourceTask implementation that reads from source-s3 buckets and generates Kafka * Connect records. 
*/ +@SuppressWarnings("PMD.ExcessiveImports") public class S3SourceTask extends SourceTask { private static final Logger LOGGER = LoggerFactory.getLogger(AivenKafkaConnectS3SourceConnector.class); @@ -193,13 +198,20 @@ private List<SourceRecord> getSourceRecords(final List<SourceRecord> results) th return results; } + final Map<String, String> config = new HashMap<>(); for (int i = 0; sourceRecordIterator.hasNext() && i < s3SourceConfig.getInt(MAX_POLL_RECORDS) && !stopped.get(); i++) { final S3SourceRecord record = sourceRecordIterator.next(); LOGGER.info(record.getOffsetStoragePartitionValue() + record.getToTopic() + record.partition()); final String topic = record.getToTopic(); final Optional<SchemaAndValue> key = keyConverter.map(c -> c.toConnectData(topic, record.key())); + + config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + config.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); + + valueConverter.configure(config, false); final SchemaAndValue value = valueConverter.toConnectData(topic, record.value()); + results.add(new SourceRecord(record.getOffsetStoragePartitionKey().toPartitionMap(), record.getOffsetStoragePartitionValue().asOffsetMap(), topic, record.partition(), key.map(SchemaAndValue::schema).orElse(null), key.map(SchemaAndValue::value).orElse(null), diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java index 47cd2a743..beda4f65e 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java @@ -19,11 +19,14 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.PARQUET_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.net.ConnectException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; @@ -123,12 +126,15 @@ private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> createIteratorForCurr final String finalTopic = topic; final int finalPartition = partition; final long finalStartOffset = startOffset; - if (s3SourceConfig.getString(OUTPUT_FORMAT).equals(AVRO_OUTPUT_FORMAT)) { - final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); - DecoderFactory.get().binaryDecoder(content, null); - return getIterator(content, finalTopic, finalPartition, finalStartOffset, datumReader, true); - } else { - return getIterator(content, finalTopic, finalPartition, finalStartOffset, null, false); + switch (s3SourceConfig.getString(OUTPUT_FORMAT)) { + case AVRO_OUTPUT_FORMAT : + final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); + DecoderFactory.get().binaryDecoder(content, null); + return getIterator(content, finalTopic, finalPartition, finalStartOffset, datumReader, true); + case PARQUET_OUTPUT_FORMAT : + return 
getIterator(content, finalTopic, finalPartition, finalStartOffset, null, false); + default : + return getIterator(content, finalTopic, finalPartition, finalStartOffset, null, false); } } } @@ -217,14 +223,15 @@ private byte[] getBytes(final boolean isAvro, final InputStream content, return value; } + @Deprecated private byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String finalTopic) throws IOException { // Create a map to configure the Avro serializer final Map<String, String> config = new HashMap<>(); - config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); // Replace with your Schema - // Registry URL + config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); - try (KafkaAvroSerializer avroSerializer = new KafkaAvroSerializer()) { + try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) + .newInstance()) { avroSerializer.configure(config, false); // `false` since this is for value serialization // Use a ByteArrayOutputStream to combine the serialized records final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); @@ -237,6 +244,8 @@ private byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, // Convert the combined output stream to a byte array and return it return outputStream.toByteArray(); + } catch (InstantiationException | IllegalAccessException e) { + throw new ConnectException("Could not create instance of serializer."); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 9412f8539..544f4850c 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -96,6 +96,9 @@ final public class S3SourceConfig extends AbstractConfig { public static final String SCHEMA_REGISTRY_URL = "schema.registry.url"; + public static final String VALUE_SERIALIZER = "value.serializer"; // ex : + // io.confluent.kafka.serializers.KafkaAvroSerializer + public static final String AWS_ACCESS_KEY_ID_CONFIG = "aws.access.key.id"; public static final String AWS_SECRET_ACCESS_KEY_CONFIG = "aws.secret.access.key"; @@ -123,6 +126,8 @@ final public class S3SourceConfig extends AbstractConfig { public static final String AVRO_OUTPUT_FORMAT = "avro"; + public static final String PARQUET_OUTPUT_FORMAT = "parquet"; + public S3SourceConfig(final Map<String, String> properties) { super(configDef(), preprocessProperties(properties)); validate(); // NOPMD ConstructorCallsOverridableMethod getStsRole is called @@ -180,6 +185,11 @@ private static void addSchemaRegistryGroup(final ConfigDef configDef) { configDef.define(OUTPUT_FORMAT, ConfigDef.Type.STRING, "bytearray", new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "Output format avro/bytearray", GROUP_AWS, srCounter++, // NOPMD ConfigDef.Width.NONE, OUTPUT_FORMAT); + + configDef.define(VALUE_SERIALIZER, ConfigDef.Type.CLASS, "io.confluent.kafka.serializers.KafkaAvroSerializer", + ConfigDef.Importance.MEDIUM, "Value serializer", GROUP_OTHER, srCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, VALUE_SERIALIZER); } private static void addOtherConfig(final S3SourceConfigDef configDef) { From f8e2ac330c62d8c82ff48f942fbcc3fc69dda5bc Mon Sep 17 00:00:00 2001 From: Muralidhar 
Basani <muralidhar.basani@aiven.io> Date: Sat, 28 Sep 2024 12:39:47 +0200 Subject: [PATCH 20/90] Adding json support --- gradle-config/aiven-pmd-test-ruleset.xml | 2 +- gradle-config/spotbugs-exclude.xml | 12 +- .../connect/s3/source/IntegrationBase.java | 30 +++++ .../connect/s3/source/IntegrationTest.java | 21 ++++ .../s3/source/OffsetStoragePartitionKey.java | 92 ---------------- .../source/OffsetStoragePartitionValue.java | 84 -------------- .../connect/s3/source/S3SourceRecord.java | 22 ++-- .../kafka/connect/s3/source/S3SourceTask.java | 89 +++++++++++---- .../s3/source/SourceRecordIterator.java | 104 ++++++++++++++---- .../s3/source/config/S3SourceConfig.java | 2 + 10 files changed, 221 insertions(+), 237 deletions(-) delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionKey.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionValue.java diff --git a/gradle-config/aiven-pmd-test-ruleset.xml b/gradle-config/aiven-pmd-test-ruleset.xml index 264226283..65267db4b 100644 --- a/gradle-config/aiven-pmd-test-ruleset.xml +++ b/gradle-config/aiven-pmd-test-ruleset.xml @@ -78,7 +78,7 @@ </rule> <rule ref="category/java/codestyle.xml/TooManyStaticImports"> <properties> - <property name="maximumStaticImports" value="15" /> + <property name="maximumStaticImports" value="20" /> </properties> </rule> diff --git a/gradle-config/spotbugs-exclude.xml b/gradle-config/spotbugs-exclude.xml index ec10b9b20..64a55693e 100644 --- a/gradle-config/spotbugs-exclude.xml +++ b/gradle-config/spotbugs-exclude.xml @@ -24,16 +24,16 @@ <Bug pattern="CT_CONSTRUCTOR_THROW" /> </Match> <Match> - <Class name="io.aiven.kafka.connect.s3.source.OffsetStoragePartitionKey" /> - <Bug pattern="CT_CONSTRUCTOR_THROW" /> + <Class name="io.aiven.kafka.connect.s3.source.SourceRecordIterator" /> + <Bug pattern="EI_EXPOSE_REP2" /> </Match> <Match> - <Class name="io.aiven.kafka.connect.s3.source.OffsetStoragePartitionValue" /> - <Bug pattern="CT_CONSTRUCTOR_THROW" /> + <Class name="io.aiven.kafka.connect.s3.source.S3SourceRecord" /> + <Bug pattern="EI_EXPOSE_REP2" /> </Match> <Match> - <Class name="io.aiven.kafka.connect.s3.source.SourceRecordIterator" /> - <Bug pattern="EI_EXPOSE_REP2" /> + <Class name="io.aiven.kafka.connect.s3.source.S3SourceRecord" /> + <Bug pattern="EI_EXPOSE_REP" /> </Match> diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index 80224f896..0edb23772 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -37,12 +37,14 @@ import org.apache.kafka.clients.consumer.KafkaConsumer; import org.apache.kafka.common.serialization.ByteArrayDeserializer; import org.apache.kafka.common.serialization.StringDeserializer; +import org.apache.kafka.connect.json.JsonDeserializer; import com.amazonaws.auth.AWSStaticCredentialsProvider; import com.amazonaws.auth.BasicAWSCredentials; import com.amazonaws.client.builder.AwsClientBuilder; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3ClientBuilder; +import com.fasterxml.jackson.databind.JsonNode; import com.github.dockerjava.api.model.Ulimit; import io.confluent.kafka.serializers.KafkaAvroDeserializer; import 
org.apache.avro.generic.GenericRecord; @@ -178,4 +180,32 @@ static List<GenericRecord> consumeAvroMessages(final String topic, final int exp return recordsList; } } + + static List<JsonNode> consumeJsonMessages(final String topic, final int expectedMessageCount, + final KafkaContainer kafka) { + final Properties props = new Properties(); + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, kafka.getBootstrapServers()); + props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group-avro"); + props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); // Assuming string + // key + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, JsonDeserializer.class.getName()); // Json + // deserializer + // for values + props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + + try (KafkaConsumer<String, JsonNode> consumer = new KafkaConsumer<>(props)) { + consumer.subscribe(Collections.singletonList(topic)); + final List<JsonNode> recordsList = new ArrayList<>(); + + // Poll messages from the topic + while (recordsList.size() < expectedMessageCount) { + final ConsumerRecords<String, JsonNode> records = consumer.poll(500L); + for (final ConsumerRecord<String, JsonNode> record : records) { + recordsList.add(record.value()); // Add the GenericRecord to the list + } + } + + return recordsList; + } + } } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index de77faa5e..6419579d0 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -22,6 +22,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_ENDPOINT_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.PARQUET_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; @@ -52,6 +53,7 @@ import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.PutObjectRequest; +import com.fasterxml.jackson.databind.JsonNode; import org.apache.avro.Schema; import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericData; @@ -250,6 +252,25 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted assertThat(1).isEqualTo(1); } + @Test + void jsonTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { + final var topicName = IntegrationBase.topicName(testInfo); + final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + connectorConfig.put(OUTPUT_FORMAT, JSON_OUTPUT_FORMAT); + connectorConfig.put("value.converter", "org.apache.kafka.connect.json.JsonConverter"); + + connectRunner.createConnector(connectorConfig); + final String testMessage = "This is a test"; + final String jsonContent = "{\"message\": \"" + testMessage + "\", \"id\":\"1\"}"; + writeToS3(topicName, 
jsonContent.getBytes(StandardCharsets.UTF_8), 7); + + // Poll Json messages from the Kafka topic and deserialize them + final List<JsonNode> records = IntegrationBase.consumeJsonMessages(topicName, 1, KAFKA_CONTAINER); + + assertThat(records).extracting(record -> record.get("payload").get("message").asText()).contains(testMessage); + assertThat(records).extracting(record -> record.get("payload").get("id").asText()).contains("1"); + } + private static ByteArrayOutputStream getAvroRecord(final Schema schema, final int messageId) throws IOException { // Create Avro records final GenericRecord avroRecord = new GenericData.Record(schema); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionKey.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionKey.java deleted file mode 100644 index 6ac7359d9..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionKey.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.s3.source; - -import java.util.HashMap; -import java.util.Map; -import java.util.Objects; - -public class OffsetStoragePartitionKey { - - public static final String BUCKET_NAME = "bucketName"; - public static final String TOPIC = "topic"; - public static final String TOPIC_PARTITION = "topicPartition"; - private final String s3BucketName; - private final String topic; - private final int partition; - - public OffsetStoragePartitionKey(final String s3BucketName, final String topic, final int partition) { - if (s3BucketName == null || s3BucketName.isEmpty()) { - throw new IllegalArgumentException("S3 bucket name cannot be null or empty"); - } - if (topic == null || topic.isEmpty()) { - throw new IllegalArgumentException("Topic cannot be null or empty"); - } - if (partition < 0) { - throw new IllegalArgumentException("Partition must be a non-negative integer"); - } - - this.s3BucketName = s3BucketName; - this.topic = topic; - this.partition = partition; - } - - public static OffsetStoragePartitionKey fromPartitionMap(final String bucket, final String topic, - final int partition) { - return new OffsetStoragePartitionKey(bucket, topic, partition); - } - - public static OffsetStoragePartitionKey fromPartitionMap(final Map<String, Object> map) { - Objects.requireNonNull(map, "Input map cannot be null"); - final String bucket = (String) map.getOrDefault(BUCKET_NAME, ""); - final String topic = (String) map.getOrDefault(TOPIC, ""); - final int partition = ((Number) map.getOrDefault(TOPIC_PARTITION, -1)).intValue(); - return fromPartitionMap(bucket, topic, partition); - } - - public Map<String, Object> toPartitionMap() { - final Map<String, Object> map = new HashMap<>(); - map.put(BUCKET_NAME, s3BucketName); - map.put(TOPIC, topic); - map.put(TOPIC_PARTITION, partition); - return map; - } - - @Override - public boolean equals(final Object obj) { - if 
(this == obj) { - return true; - } - if (obj == null || getClass() != obj.getClass()) { - return false; - } - final OffsetStoragePartitionKey other = (OffsetStoragePartitionKey) obj; - return partition == other.partition && Objects.equals(s3BucketName, other.s3BucketName) - && Objects.equals(topic, other.topic); - } - - @Override - public int hashCode() { - return Objects.hash(s3BucketName, topic, partition); - } - - @Override - public String toString() { - return String.format("OffsetStoragePartitionKey{bucketName='%s', topic='%s', partition=%d}", s3BucketName, - topic, partition); - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionValue.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionValue.java deleted file mode 100644 index 5fa965ffa..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetStoragePartitionValue.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.s3.source; - -import java.util.HashMap; -import java.util.Map; -import java.util.Objects; - -public class OffsetStoragePartitionValue implements Comparable<OffsetStoragePartitionValue> { - - public static final String ORIGINAL_OFFSET = "storedOriginalOffset"; - private final long offset; - - public OffsetStoragePartitionValue(final long offset) { - if (offset < 0) { - throw new IllegalArgumentException("Offset cannot be negative"); - } - this.offset = offset; - } - - public static OffsetStoragePartitionValue fromOffsetMap(final long offset) { - return new OffsetStoragePartitionValue(offset); - } - - public static OffsetStoragePartitionValue fromOffsetMap(final Map<String, Object> map) { - Objects.requireNonNull(map, "Input map cannot be null"); - final Object offsetValue = map.get(ORIGINAL_OFFSET); - if (!(offsetValue instanceof Number)) { - throw new IllegalArgumentException("Original offset must be a valid number"); - } - return fromOffsetMap(((Number) offsetValue).longValue()); - } - - @Override - public String toString() { - return String.valueOf(offset); - } - - @Override - public int compareTo(final OffsetStoragePartitionValue other) { - return Long.compare(this.offset, other.offset); - } - - public Map<String, ?> asOffsetMap() { - final Map<String, Object> map = new HashMap<>(); - map.put(ORIGINAL_OFFSET, offset); - return map; - } - - public long getOffset() { - return offset; - } - - @Override - public boolean equals(final Object obj) { - if (this == obj) { - return true; - } - if (obj == null || getClass() != obj.getClass()) { - return false; - } - final OffsetStoragePartitionValue other = (OffsetStoragePartitionValue) obj; - return offset == other.offset; - } - - @Override - public int hashCode() { - return Objects.hash(offset); - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java 
b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java index 156859bd2..1b9478bb0 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java @@ -17,32 +17,32 @@ package io.aiven.kafka.connect.s3.source; import java.util.Arrays; +import java.util.Map; public class S3SourceRecord { - private final OffsetStoragePartitionKey offsetStoragePartitionKey; - private final OffsetStoragePartitionValue offsetStoragePartitionValue; + private final Map<String, Object> partitionMap; + private final Map<String, Object> offsetMap; private final String toTopic; private final int topicPartition; private final byte[] recordKey; private final byte[] recordValue; - public S3SourceRecord(final OffsetStoragePartitionKey offsetStoragePartitionKey, - final OffsetStoragePartitionValue offsetStoragePartitionValue, final String toTopic, - final int topicPartition, final byte[] recordKey, final byte[] recordValue) { - this.offsetStoragePartitionKey = offsetStoragePartitionKey; - this.offsetStoragePartitionValue = offsetStoragePartitionValue; + public S3SourceRecord(final Map<String, Object> partitionMap, final Map<String, Object> offsetMap, + final String toTopic, final int topicPartition, final byte[] recordKey, final byte[] recordValue) { + this.partitionMap = partitionMap; + this.offsetMap = offsetMap; this.toTopic = toTopic; this.topicPartition = topicPartition; this.recordKey = Arrays.copyOf(recordKey, recordKey.length); this.recordValue = Arrays.copyOf(recordValue, recordValue.length); } - public OffsetStoragePartitionKey getOffsetStoragePartitionKey() { - return offsetStoragePartitionKey; + public Map<String, Object> getPartitionMap() { + return partitionMap; } - public OffsetStoragePartitionValue getOffsetStoragePartitionValue() { - return offsetStoragePartitionValue; + public Map<String, Object> getOffsetMap() { + return offsetMap; } public String getToTopic() { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index cca714283..844ff9b23 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -16,8 +16,11 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; @@ -63,7 +66,8 @@ public class S3SourceTask extends SourceTask { private S3SourceConfig s3SourceConfig; - private Map<OffsetStoragePartitionKey, OffsetStoragePartitionValue> offsets; + // private Map<OffsetStoragePartitionKey, OffsetStoragePartitionValue> offsets; + private Map<Map<String, Object>, Map<String, Object>> offsets; 
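The partition and offset maps introduced here follow the plain-map contract Kafka Connect uses for source partitions and offsets. A minimal sketch of the expected shape and how it feeds a `SourceRecord` (bucket, topic and offset values are illustrative, not taken from this patch):

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.source.SourceRecord;

public class PartitionOffsetMapSketch {
    public static void main(final String[] args) {
        // Source partition: identifies where the data came from, using the same literal
        // keys the connector uses ("bucket", "topic", "partition").
        final Map<String, Object> partitionMap = new HashMap<>();
        partitionMap.put("bucket", "my-bucket"); // illustrative bucket name
        partitionMap.put("topic", "testtopic");  // illustrative topic
        partitionMap.put("partition", 0);

        // Source offset: the position inside that partition, keyed by "offset".
        final Map<String, Object> offsetMap = new HashMap<>();
        offsetMap.put("offset", 42L); // illustrative offset value

        // Connect persists offsetMap keyed by partitionMap and hands both back after a
        // restart via context.offsetStorageReader().offsets(...).
        final SourceRecord sourceRecord = new SourceRecord(partitionMap, offsetMap, "testtopic", 0,
                Schema.OPTIONAL_BYTES_SCHEMA, null, Schema.BYTES_SCHEMA, new byte[] { 1, 2, 3 });
        System.out.println(sourceRecord);
    }
}
```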
Iterator<S3SourceRecord> sourceRecordIterator; @@ -117,27 +121,65 @@ private void prepareReaderFromOffsetStorageReader() { final Set<String> targetTopics = getTargetTopics(); // map to s3 partitions - final List<OffsetStoragePartitionKey> offsetStoragePartitionKeys = offsetStorageTopicPartitions.stream() - .flatMap( - p -> targetTopics.stream().map(t -> OffsetStoragePartitionKey.fromPartitionMap(s3Bucket, t, p))) + // final List<OffsetStoragePartitionKey> offsetStoragePartitionKeys = offsetStorageTopicPartitions.stream() + // .flatMap( + // p -> targetTopics.stream().map(t -> OffsetStoragePartitionKey.fromPartitionMap(s3Bucket, t, p))) + // .collect(toList()); + + final List<Map<String, Object>> partitionKeys = offsetStorageTopicPartitions.stream() + .flatMap(p -> targetTopics.stream().map(t -> { + final Map<String, Object> partitionMap = new HashMap<>(); + partitionMap.put("bucket", s3Bucket); + partitionMap.put("topic", t); + partitionMap.put("partition", p); + return partitionMap; + })) .collect(toList()); + // Map<String, Object> partitionMapK = new HashMap<>(); + // partitionMapK.put("bucket", s3Bucket); + // partitionMapK.put("topic", "basicTest"); + // partitionMapK.put("partition", 0); + // + // Map<String, Object> partitionMapV = new HashMap<>(); + // partitionMapV.put("offset", 123); + // + // Map<Map<String, Object>, Map<String, Object>> offsetMapO = new HashMap<>(); + // offsetMapO.put(partitionMapK, partitionMapV); + // get partition offsets - final List<Map<String, Object>> partitions = offsetStoragePartitionKeys.stream() - .map(OffsetStoragePartitionKey::toPartitionMap) - .collect(toList()); + // final List<Map<String, Object>> partitions = offsetStoragePartitionKeys.stream() + // .map(OffsetStoragePartitionKey::toPartitionMap) + // .collect(toList()); + + // final Map<Map<String, Object>, Map<String, Object>> offsetMap = context.offsetStorageReader() + // .offsets(partitions); final Map<Map<String, Object>, Map<String, Object>> offsetMap = context.offsetStorageReader() - .offsets(partitions); + .offsets(partitionKeys); + + // offsetMap = offsetMapO; LOGGER.info("offsetMap : " + offsetMap); LOGGER.info("offsetMap entry set : " + offsetMap.entrySet()); if (offsets == null) { - offsets = offsetMap.entrySet() - .stream() - .filter(e -> e.getValue() != null) - .collect(toMap(entry -> OffsetStoragePartitionKey.fromPartitionMap(entry.getKey()), - entry -> OffsetStoragePartitionValue.fromOffsetMap(entry.getValue()))); + // offsets = offsetMap.entrySet() + // .stream() + // .filter(e -> e.getValue() != null) + // .collect(toMap(entry -> OffsetStoragePartitionKey.fromPartitionMap(entry.getKey()), + // entry -> OffsetStoragePartitionValue.fromOffsetMap(entry.getValue()))); + offsets = offsetMap.entrySet().stream().filter(e -> e.getValue() != null).collect(toMap(entry -> { + // Directly use the partition map (entry.getKey()) + final Map<String, Object> partitionMap = new HashMap<>(); + partitionMap.putAll(entry.getKey()); // Assuming entry.getKey() is already a map + return partitionMap; + }, entry -> { + // Directly use the offset map (entry.getValue()) + final Map<String, Object> offsetValueMap = new HashMap<>(); + offsetValueMap.putAll(entry.getValue()); // Assuming entry.getValue() is already a map + return offsetValueMap; + })); + } LOGGER.info("Storage offsets : " + offsets); sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, s3Client, s3Bucket, offsets); @@ -202,20 +244,25 @@ private List<SourceRecord> getSourceRecords(final List<SourceRecord> results) th for (int i 
= 0; sourceRecordIterator.hasNext() && i < s3SourceConfig.getInt(MAX_POLL_RECORDS) && !stopped.get(); i++) { final S3SourceRecord record = sourceRecordIterator.next(); - LOGGER.info(record.getOffsetStoragePartitionValue() + record.getToTopic() + record.partition()); + LOGGER.info(record.getOffsetMap() + record.getToTopic() + record.partition()); final String topic = record.getToTopic(); final Optional<SchemaAndValue> key = keyConverter.map(c -> c.toConnectData(topic, record.key())); - config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); - config.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); - + if (s3SourceConfig.getString(OUTPUT_FORMAT).equals(AVRO_OUTPUT_FORMAT)) { + config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + config.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); + } else if (s3SourceConfig.getString(OUTPUT_FORMAT).equals(JSON_OUTPUT_FORMAT)) { + config.put("schemas.enable", "false"); + } valueConverter.configure(config, false); + final SchemaAndValue value = valueConverter.toConnectData(topic, record.value()); - results.add(new SourceRecord(record.getOffsetStoragePartitionKey().toPartitionMap(), - record.getOffsetStoragePartitionValue().asOffsetMap(), topic, record.partition(), - key.map(SchemaAndValue::schema).orElse(null), key.map(SchemaAndValue::value).orElse(null), - value.schema(), value.value())); + // Create SourceRecord using partition and offset maps from the S3SourceRecord + results.add(new SourceRecord(record.getPartitionMap(), // Use partition map + record.getOffsetMap(), // Use offset map + topic, record.partition(), key.map(SchemaAndValue::schema).orElse(null), + key.map(SchemaAndValue::value).orElse(null), value.schema(), value.value())); } LOGGER.debug("{} records.", results.size()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java index beda4f65e..5b0cb8ce3 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java @@ -18,6 +18,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.PARQUET_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; @@ -50,6 +51,8 @@ import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectSummary; import com.amazonaws.util.IOUtils; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import io.confluent.kafka.serializers.KafkaAvroSerializer; import org.apache.avro.file.DataFileReader; import org.apache.avro.file.SeekableByteArrayInput; @@ -68,14 +71,14 @@ public final class SourceRecordIterator implements Iterator<S3SourceRecord> { private Iterator<S3ObjectSummary> nextFileIterator; private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> recordIterator = Collections.emptyIterator(); - private 
final Map<OffsetStoragePartitionKey, OffsetStoragePartitionValue> offsets; + private final Map<Map<String, Object>, Map<String, Object>> offsets; private final S3SourceConfig s3SourceConfig; private final String bucketName; private final AmazonS3 s3Client; public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, - final Map<OffsetStoragePartitionKey, OffsetStoragePartitionValue> offsets) { + final Map<Map<String, Object>, Map<String, Object>> offsets) { this.s3SourceConfig = s3SourceConfig; this.offsets = Optional.ofNullable(offsets).orElseGet(HashMap::new); this.s3Client = s3Client; @@ -130,20 +133,24 @@ private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> createIteratorForCurr case AVRO_OUTPUT_FORMAT : final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); DecoderFactory.get().binaryDecoder(content, null); - return getIterator(content, finalTopic, finalPartition, finalStartOffset, datumReader, true); + return getIterator(content, finalTopic, finalPartition, finalStartOffset, datumReader, + AVRO_OUTPUT_FORMAT); case PARQUET_OUTPUT_FORMAT : - return getIterator(content, finalTopic, finalPartition, finalStartOffset, null, false); + return getIterator(content, finalTopic, finalPartition, finalStartOffset, null, + PARQUET_OUTPUT_FORMAT); + case JSON_OUTPUT_FORMAT : + return getIterator(content, finalTopic, finalPartition, finalStartOffset, null, JSON_OUTPUT_FORMAT); default : - return getIterator(content, finalTopic, finalPartition, finalStartOffset, null, false); + return getIterator(content, finalTopic, finalPartition, finalStartOffset, null, ""); } } } private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> getIterator(final InputStream content, final String finalTopic, final int finalPartition, final long finalStartOffset, - final DatumReader<GenericRecord> datumReader, final boolean isAvro) { + final DatumReader<GenericRecord> datumReader, final String fileFormat) { return new Iterator<>() { - private Map<OffsetStoragePartitionKey, Long> currentOffsets = new HashMap<>(); + private Map<Map<String, Object>, Long> currentOffsets = new HashMap<>(); private Optional<ConsumerRecord<byte[], byte[]>> nextRecord = readNext(); private Optional<ConsumerRecord<byte[], byte[]>> readNext() { @@ -153,7 +160,7 @@ private Optional<ConsumerRecord<byte[], byte[]>> readNext() { key = Optional.of(currentKey.getBytes(StandardCharsets.UTF_8)); } byte[] value; - value = getBytes(isAvro, content, datumReader, finalTopic); + value = getBytes(fileFormat, content, datumReader, finalTopic); if (value == null) { if (key.isPresent()) { @@ -169,22 +176,53 @@ private Optional<ConsumerRecord<byte[], byte[]>> readNext() { } } + // private Optional<ConsumerRecord<byte[], byte[]>> getConsumerRecord(final Optional<byte[]> key, + // final byte[] value) { + // final OffsetStoragePartitionKey offsetStoragePartitionKey = OffsetStoragePartitionKey + // .fromPartitionMap(bucketName, finalTopic, finalPartition); + // + // long currentOffset; + // if (offsets.containsKey(offsetStoragePartitionKey)) { + // final OffsetStoragePartitionValue offsetStoragePartitionValue = offsets + // .get(offsetStoragePartitionKey); + // currentOffset = offsetStoragePartitionValue.getOffset() + 1; + // } else { + // currentOffset = currentOffsets.getOrDefault(offsetStoragePartitionKey, finalStartOffset); + // } + // final Optional<ConsumerRecord<byte[], byte[]>> record = Optional + // .of(new ConsumerRecord<>(finalTopic, finalPartition, currentOffset, 
key.orElse(null), value)); + // currentOffsets.put(offsetStoragePartitionKey, currentOffset + 1); + // return record; + // } + private Optional<ConsumerRecord<byte[], byte[]>> getConsumerRecord(final Optional<byte[]> key, final byte[] value) { - final OffsetStoragePartitionKey offsetStoragePartitionKey = OffsetStoragePartitionKey - .fromPartitionMap(bucketName, finalTopic, finalPartition); + // Create a map to represent the partition information + final Map<String, Object> partitionMap = new HashMap<>(); + partitionMap.put("bucket", bucketName); + partitionMap.put("topic", finalTopic); + partitionMap.put("partition", finalPartition); long currentOffset; - if (offsets.containsKey(offsetStoragePartitionKey)) { - final OffsetStoragePartitionValue offsetStoragePartitionValue = offsets - .get(offsetStoragePartitionKey); - currentOffset = offsetStoragePartitionValue.getOffset() + 1; + + // Check if the partition is present in the offsets map + if (offsets.containsKey(partitionMap)) { + // Retrieve the offset map and extract the offset value + final Map<String, Object> offsetMap = offsets.get(partitionMap); + currentOffset = (long) offsetMap.get("offset") + 1; // Assuming "offset" is the key for the offset + // value } else { - currentOffset = currentOffsets.getOrDefault(offsetStoragePartitionKey, finalStartOffset); + // If not present in offsets, check currentOffsets or use the finalStartOffset + currentOffset = currentOffsets.getOrDefault(partitionMap, finalStartOffset); } + + // Create the ConsumerRecord final Optional<ConsumerRecord<byte[], byte[]>> record = Optional .of(new ConsumerRecord<>(finalTopic, finalPartition, currentOffset, key.orElse(null), value)); - currentOffsets.put(offsetStoragePartitionKey, currentOffset + 1); + + // Update currentOffsets with the next offset + currentOffsets.put(partitionMap, currentOffset + 1); + return record; } @@ -205,10 +243,10 @@ public Optional<ConsumerRecord<byte[], byte[]>> next() { }; } - private byte[] getBytes(final boolean isAvro, final InputStream content, + private byte[] getBytes(final String fileFormat, final InputStream content, final DatumReader<GenericRecord> datumReader, final String topicName) throws IOException { byte[] value; - if (isAvro) { + if (fileFormat.equals(AVRO_OUTPUT_FORMAT)) { List<GenericRecord> items; try (SeekableInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { @@ -217,12 +255,20 @@ private byte[] getBytes(final boolean isAvro, final InputStream content, } } value = serializeAvroRecordToBytes(items, topicName); + } else if (fileFormat.equals(JSON_OUTPUT_FORMAT)) { + value = serializeJsonData(content); } else { value = IOUtils.toByteArray(content); } return value; } + private byte[] serializeJsonData(final InputStream inputStream) throws IOException { + final ObjectMapper objectMapper = new ObjectMapper(); + final JsonNode jsonNode = objectMapper.readTree(inputStream); + return objectMapper.writeValueAsBytes(jsonNode); + } + @Deprecated private byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String finalTopic) throws IOException { @@ -267,10 +313,24 @@ public S3SourceRecord next() { throw new NoSuchElementException(); } final ConsumerRecord<byte[], byte[]> record = recordIterator.next().get(); - return new S3SourceRecord( - OffsetStoragePartitionKey.fromPartitionMap(bucketName, record.topic(), record.partition()), - OffsetStoragePartitionValue.fromOffsetMap(record.offset()), 
record.topic(), record.partition(), - record.key(), record.value()); + + // Create the partition map + final Map<String, Object> partitionMap = new HashMap<>(); + partitionMap.put("bucket", bucketName); + partitionMap.put("topic", record.topic()); + partitionMap.put("partition", record.partition()); + + // Create the offset map + final Map<String, Object> offsetMap = new HashMap<>(); + offsetMap.put("offset", record.offset()); + + return new S3SourceRecord(partitionMap, // Use the partition map + offsetMap, // Use the offset map + record.topic(), // Topic + record.partition(), // Partition + record.key(), // Record key + record.value() // Record value + ); } @Override diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 544f4850c..4cd4384e1 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -128,6 +128,8 @@ final public class S3SourceConfig extends AbstractConfig { public static final String PARQUET_OUTPUT_FORMAT = "parquet"; + public static final String JSON_OUTPUT_FORMAT = "json"; + public S3SourceConfig(final Map<String, String> properties) { super(configDef(), preprocessProperties(properties)); validate(); // NOPMD ConstructorCallsOverridableMethod getStsRole is called From 0102aa576d9ea8a8c36f0b69ce92b87a3429743f Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 30 Sep 2024 16:08:04 +0200 Subject: [PATCH 21/90] Refactor source task --- gradle-config/spotbugs-exclude.xml | 4 + .../connect/s3/source/OffsetManager.java | 156 +++++++++++++ .../connect/s3/source/RecordProcessor.java | 84 +++++++ .../kafka/connect/s3/source/S3SourceTask.java | 215 ++++-------------- .../s3/source/SourceRecordIterator.java | 202 ++++++++-------- 5 files changed, 386 insertions(+), 275 deletions(-) create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetManager.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/RecordProcessor.java diff --git a/gradle-config/spotbugs-exclude.xml b/gradle-config/spotbugs-exclude.xml index 64a55693e..69e2343b2 100644 --- a/gradle-config/spotbugs-exclude.xml +++ b/gradle-config/spotbugs-exclude.xml @@ -35,6 +35,10 @@ <Class name="io.aiven.kafka.connect.s3.source.S3SourceRecord" /> <Bug pattern="EI_EXPOSE_REP" /> </Match> + <Match> + <Class name="io.aiven.kafka.connect.s3.source.OffsetManager" /> + <Bug pattern="EI_EXPOSE_REP" /> + </Match> diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetManager.java new file mode 100644 index 000000000..3790d0fde --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetManager.java @@ -0,0 +1,156 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.aiven.kafka.connect.s3.source;
+
+import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET;
+import static io.aiven.kafka.connect.s3.source.S3SourceTask.PARTITION;
+import static io.aiven.kafka.connect.s3.source.S3SourceTask.TOPIC;
+import static java.util.stream.Collectors.toMap;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.apache.kafka.connect.source.SourceTaskContext;
+
+import io.aiven.kafka.connect.s3.source.config.S3SourceConfig;
+
+public class OffsetManager {
+    private final Map<Map<String, Object>, Map<String, Object>> offsets;
+
+    /**
+     * Constructor for OffsetManager. Initializes with the task context and S3 source configuration, and retrieves
+     * offsets.
+     *
+     * @param context
+     *            SourceTaskContext that provides access to the offset storage
+     * @param s3SourceConfig
+     *            S3SourceConfig that contains the source configuration details
+     */
+    public OffsetManager(final SourceTaskContext context, final S3SourceConfig s3SourceConfig) {
+        final String s3Bucket = s3SourceConfig.getString(S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG);
+        final Set<Integer> partitions = parsePartitions(s3SourceConfig);
+        final Set<String> topics = parseTopics(s3SourceConfig);
+
+        // Build the partition keys and fetch offsets from offset storage
+        final List<Map<String, Object>> partitionKeys = buildPartitionKeys(s3Bucket, partitions, topics);
+        final Map<Map<String, Object>, Map<String, Object>> offsetMap = context.offsetStorageReader()
+                .offsets(partitionKeys);
+        // Map<String, Object> partitionMapK = new HashMap<>();
+        // partitionMapK.put("bucket", s3Bucket);
+        // partitionMapK.put("topic", "basicTest");
+        // partitionMapK.put("partition", 0);
+        //
+        // Map<String, Object> partitionMapV = new HashMap<>();
+        // partitionMapV.put("offset", 123l);
+        //
+        // offsetMap = context.offsetStorageReader()
+        // .offsets(partitionKeys);
+        //
+        // offsetMap.put(partitionMapK, partitionMapV);
+
+        this.offsets = offsetMap.entrySet()
+                .stream()
+                .filter(e -> e.getValue() != null)
+                .collect(toMap(entry -> new HashMap<>(entry.getKey()), entry -> new HashMap<>(entry.getValue())));
+    }
+
+    /**
+     * Fetches all offsets for the current partitions and topics from the context.
+     *
+     * @return Map of partition keys and their corresponding offsets
+     */
+    public Map<Map<String, Object>, Map<String, Object>> getOffsets() {
+        return offsets;
+    }
+
+    /**
+     * Get the current offset for a specific partition.
+     *
+     * @param partitionMap
+     *            The partition map containing bucket, topic, partition, etc.
+     * @return The offset for the given partition, or null if no offset exists.
+     */
+    public Map<String, Object> getOffsetForPartition(final Map<String, Object> partitionMap) {
+        return offsets.get(partitionMap);
+    }
+
+    /**
+     * Updates the offset for a specific partition.
+     *
+     * @param partitionMap
+     *            The partition map.
+     * @param newOffset
+     *            The new offset to be updated.
+ */ + public void updateOffset(final Map<String, Object> partitionMap, final Map<String, Object> newOffset) { + offsets.put(partitionMap, newOffset); + // You can persist offsets here if needed + } + + /** + * Helper method to parse partitions from the configuration. + * + * @param s3SourceConfig + * The S3 source configuration. + * @return Set of partitions. + */ + private static Set<Integer> parsePartitions(final S3SourceConfig s3SourceConfig) { + final String partitionString = s3SourceConfig.getString(S3SourceConfig.TARGET_TOPIC_PARTITIONS); + return Arrays.stream(partitionString.split(",")).map(Integer::parseInt).collect(Collectors.toSet()); + } + + /** + * Helper method to parse topics from the configuration. + * + * @param s3SourceConfig + * The S3 source configuration. + * @return Set of topics. + */ + private static Set<String> parseTopics(final S3SourceConfig s3SourceConfig) { + final String topicString = s3SourceConfig.getString(S3SourceConfig.TARGET_TOPICS); + return Arrays.stream(topicString.split(",")).collect(Collectors.toSet()); + } + + /** + * Builds partition keys to be used for offset retrieval. + * + * @param bucket + * The S3 bucket name. + * @param partitions + * The set of partitions. + * @param topics + * The set of topics. + * @return List of partition keys (maps) used for fetching offsets. + */ + private static List<Map<String, Object>> buildPartitionKeys(final String bucket, final Set<Integer> partitions, + final Set<String> topics) { + final List<Map<String, Object>> partitionKeys = new ArrayList<>(); + partitions.forEach(partition -> topics.forEach(topic -> { + final Map<String, Object> partitionMap = new HashMap<>(); + partitionMap.put(BUCKET, bucket); + partitionMap.put(TOPIC, topic); + partitionMap.put(PARTITION, partition); + partitionKeys.add(partitionMap); + })); + return partitionKeys; + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/RecordProcessor.java new file mode 100644 index 000000000..d1e1ab2e7 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/RecordProcessor.java @@ -0,0 +1,84 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.source.SourceRecord; +import org.apache.kafka.connect.storage.Converter; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +public final class RecordProcessor { + + private RecordProcessor() { + + } + public static List<SourceRecord> processRecords(final Iterator<S3SourceRecord> sourceRecordIterator, + final List<SourceRecord> results, final S3SourceConfig s3SourceConfig, + final Optional<Converter> keyConverter, final Converter valueConverter, + final AtomicBoolean connectorStopped) { + + final Map<String, String> conversionConfig = new HashMap<>(); + final int maxPollRecords = s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS); + + for (int i = 0; sourceRecordIterator.hasNext() && i < maxPollRecords && !connectorStopped.get(); i++) { + final S3SourceRecord record = sourceRecordIterator.next(); + final SourceRecord sourceRecord = createSourceRecord(record, s3SourceConfig, keyConverter, valueConverter, + conversionConfig); + results.add(sourceRecord); + } + + return results; + } + + private static SourceRecord createSourceRecord(final S3SourceRecord record, final S3SourceConfig s3SourceConfig, + final Optional<Converter> keyConverter, final Converter valueConverter, + final Map<String, String> conversionConfig) { + + final String topic = record.getToTopic(); + final Optional<SchemaAndValue> keyData = keyConverter.map(c -> c.toConnectData(topic, record.key())); + + configureValueConverter(s3SourceConfig.getString(S3SourceConfig.OUTPUT_FORMAT), conversionConfig, + s3SourceConfig); + valueConverter.configure(conversionConfig, false); + final SchemaAndValue value = valueConverter.toConnectData(topic, record.value()); + + return new SourceRecord(record.getPartitionMap(), record.getOffsetMap(), topic, record.partition(), + keyData.map(SchemaAndValue::schema).orElse(null), keyData.map(SchemaAndValue::value).orElse(null), + value.schema(), value.value()); + } + + private static void configureValueConverter(final String outputFormat, final Map<String, String> config, + final S3SourceConfig s3SourceConfig) { + if (AVRO_OUTPUT_FORMAT.equals(outputFormat)) { + config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + } else if (JSON_OUTPUT_FORMAT.equals(outputFormat)) { + config.put("schemas.enable", "false"); + } + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 844ff9b23..063ace0c6 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -16,32 +16,16 @@ package io.aiven.kafka.connect.s3.source; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; -import static 
io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; -import static java.util.stream.Collectors.toList; -import static java.util.stream.Collectors.toMap; import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.Optional; -import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; -import java.util.stream.Collectors; -import org.apache.kafka.clients.consumer.ConsumerConfig; -import org.apache.kafka.common.serialization.ByteArrayDeserializer; -import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.source.SourceTask; @@ -59,31 +43,28 @@ * S3SourceTask is a Kafka Connect SourceTask implementation that reads from source-s3 buckets and generates Kafka * Connect records. */ -@SuppressWarnings("PMD.ExcessiveImports") public class S3SourceTask extends SourceTask { - private static final Logger LOGGER = LoggerFactory.getLogger(AivenKafkaConnectS3SourceConnector.class); + private static final Logger LOGGER = LoggerFactory.getLogger(S3SourceTask.class); - private S3SourceConfig s3SourceConfig; + public static final String BUCKET = "bucket"; + public static final String TOPIC = "topic"; + public static final String PARTITION = "partition"; - // private Map<OffsetStoragePartitionKey, OffsetStoragePartitionValue> offsets; - private Map<Map<String, Object>, Map<String, Object>> offsets; + private static final long S_3_POLL_INTERVAL_MS = 10_000L; + private static final long ERROR_BACKOFF = 1000L; - Iterator<S3SourceRecord> sourceRecordIterator; + private S3SourceConfig s3SourceConfig; + private AmazonS3 s3Client; + private Iterator<S3SourceRecord> sourceRecordIterator; private Optional<Converter> keyConverter; private Converter valueConverter; - private final AtomicBoolean stopped = new AtomicBoolean(); - - private final static long S_3_POLL_INTERVAL_MS = 10_000L; - - private final static long ERROR_BACKOFF = 1000L; - - final S3ClientFactory s3ClientFactory = new S3ClientFactory(); - private AmazonS3 s3Client; + private final AtomicBoolean connectorStopped = new AtomicBoolean(); + private final S3ClientFactory s3ClientFactory = new S3ClientFactory(); - @SuppressWarnings("PMD.UnnecessaryConstructor") // required by Connect + @SuppressWarnings("PMD.UnnecessaryConstructor") public S3SourceTask() { super(); } @@ -93,184 +74,84 @@ public String version() { return Version.VERSION; } - @Deprecated @Override public void start(final Map<String, String> props) { LOGGER.info("S3 Source task started."); - Objects.requireNonNull(props, "props hasn't been set"); s3SourceConfig = new S3SourceConfig(props); + initializeConverters(); + initializeS3Client(); + prepareReaderFromOffsetStorageReader(); + } + @Deprecated + private void initializeConverters() { try { keyConverter = Optional.of((Converter) s3SourceConfig.getClass("key.converter").newInstance()); valueConverter = 
(Converter) s3SourceConfig.getClass("value.converter").newInstance(); } catch (InstantiationException | IllegalAccessException e) { throw new ConnectException("Connect converters could not be instantiated.", e); } + } + private void initializeS3Client() { this.s3Client = s3ClientFactory.createAmazonS3Client(s3SourceConfig); - - LOGGER.info("S3 client initialized "); - prepareReaderFromOffsetStorageReader(); + LOGGER.debug("S3 client initialized"); } - @Deprecated private void prepareReaderFromOffsetStorageReader() { - final String s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); - - final Set<Integer> offsetStorageTopicPartitions = getTargetTopicPartitions(); - final Set<String> targetTopics = getTargetTopics(); - - // map to s3 partitions - // final List<OffsetStoragePartitionKey> offsetStoragePartitionKeys = offsetStorageTopicPartitions.stream() - // .flatMap( - // p -> targetTopics.stream().map(t -> OffsetStoragePartitionKey.fromPartitionMap(s3Bucket, t, p))) - // .collect(toList()); - - final List<Map<String, Object>> partitionKeys = offsetStorageTopicPartitions.stream() - .flatMap(p -> targetTopics.stream().map(t -> { - final Map<String, Object> partitionMap = new HashMap<>(); - partitionMap.put("bucket", s3Bucket); - partitionMap.put("topic", t); - partitionMap.put("partition", p); - return partitionMap; - })) - .collect(toList()); - - // Map<String, Object> partitionMapK = new HashMap<>(); - // partitionMapK.put("bucket", s3Bucket); - // partitionMapK.put("topic", "basicTest"); - // partitionMapK.put("partition", 0); - // - // Map<String, Object> partitionMapV = new HashMap<>(); - // partitionMapV.put("offset", 123); - // - // Map<Map<String, Object>, Map<String, Object>> offsetMapO = new HashMap<>(); - // offsetMapO.put(partitionMapK, partitionMapV); + final OffsetManager offsetManager = new OffsetManager(context, s3SourceConfig); - // get partition offsets - // final List<Map<String, Object>> partitions = offsetStoragePartitionKeys.stream() - // .map(OffsetStoragePartitionKey::toPartitionMap) - // .collect(toList()); - - // final Map<Map<String, Object>, Map<String, Object>> offsetMap = context.offsetStorageReader() - // .offsets(partitions); - final Map<Map<String, Object>, Map<String, Object>> offsetMap = context.offsetStorageReader() - .offsets(partitionKeys); - - // offsetMap = offsetMapO; - - LOGGER.info("offsetMap : " + offsetMap); - LOGGER.info("offsetMap entry set : " + offsetMap.entrySet()); - - if (offsets == null) { - // offsets = offsetMap.entrySet() - // .stream() - // .filter(e -> e.getValue() != null) - // .collect(toMap(entry -> OffsetStoragePartitionKey.fromPartitionMap(entry.getKey()), - // entry -> OffsetStoragePartitionValue.fromOffsetMap(entry.getValue()))); - offsets = offsetMap.entrySet().stream().filter(e -> e.getValue() != null).collect(toMap(entry -> { - // Directly use the partition map (entry.getKey()) - final Map<String, Object> partitionMap = new HashMap<>(); - partitionMap.putAll(entry.getKey()); // Assuming entry.getKey() is already a map - return partitionMap; - }, entry -> { - // Directly use the offset map (entry.getValue()) - final Map<String, Object> offsetValueMap = new HashMap<>(); - offsetValueMap.putAll(entry.getValue()); // Assuming entry.getValue() is already a map - return offsetValueMap; - })); - - } - LOGGER.info("Storage offsets : " + offsets); - sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, s3Client, s3Bucket, offsets); - } - - private Set<Integer> getTargetTopicPartitions() { - final String 
partitionString = s3SourceConfig.getString(TARGET_TOPIC_PARTITIONS); - if (Objects.nonNull(partitionString)) { - return Arrays.stream(partitionString.split(",")).map(Integer::parseInt).collect(Collectors.toSet()); - } else { - throw new IllegalStateException("Offset storage topics partition list is not configured."); - } - } - - private Set<String> getTargetTopics() { - final String topicString = s3SourceConfig.getString(TARGET_TOPICS); - if (Objects.nonNull(topicString)) { - return Arrays.stream(topicString.split(",")).collect(Collectors.toSet()); - } else { - throw new IllegalStateException("Offset storage topics list is not configured."); - } + final String s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); + sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, s3Client, s3Bucket, offsetManager); } @Override public List<SourceRecord> poll() throws InterruptedException { final List<SourceRecord> results = new ArrayList<>(s3SourceConfig.getInt(MAX_POLL_RECORDS)); - if (stopped.get()) { + if (connectorStopped.get()) { return results; } - while (!stopped.get()) { + while (!connectorStopped.get()) { try { - return getSourceRecords(results); + return extractSourceRecords(results); } catch (AmazonS3Exception e) { - if (e.isRetryable()) { - LOGGER.warn("Retryable error while polling. Will sleep and try again.", e); - Thread.sleep(ERROR_BACKOFF); - prepareReaderFromOffsetStorageReader(); - } else { - // die - throw e; - } + handleS3Exception(e); } } return results; } - private List<SourceRecord> getSourceRecords(final List<SourceRecord> results) throws InterruptedException { - while (!sourceRecordIterator.hasNext() && !stopped.get()) { + private List<SourceRecord> extractSourceRecords(final List<SourceRecord> results) throws InterruptedException { + waitForObjects(); + if (connectorStopped.get()) { + return results; + } + return RecordProcessor.processRecords(sourceRecordIterator, results, s3SourceConfig, keyConverter, + valueConverter, connectorStopped); + } + + private void waitForObjects() throws InterruptedException { + while (!sourceRecordIterator.hasNext() && !connectorStopped.get()) { LOGGER.debug("Blocking until new S3 files are available."); - // sleep and block here until new files are available Thread.sleep(S_3_POLL_INTERVAL_MS); prepareReaderFromOffsetStorageReader(); } + } - if (stopped.get()) { - return results; - } - - final Map<String, String> config = new HashMap<>(); - for (int i = 0; sourceRecordIterator.hasNext() && i < s3SourceConfig.getInt(MAX_POLL_RECORDS) - && !stopped.get(); i++) { - final S3SourceRecord record = sourceRecordIterator.next(); - LOGGER.info(record.getOffsetMap() + record.getToTopic() + record.partition()); - final String topic = record.getToTopic(); - final Optional<SchemaAndValue> key = keyConverter.map(c -> c.toConnectData(topic, record.key())); - - if (s3SourceConfig.getString(OUTPUT_FORMAT).equals(AVRO_OUTPUT_FORMAT)) { - config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); - config.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); - } else if (s3SourceConfig.getString(OUTPUT_FORMAT).equals(JSON_OUTPUT_FORMAT)) { - config.put("schemas.enable", "false"); - } - valueConverter.configure(config, false); - - final SchemaAndValue value = valueConverter.toConnectData(topic, record.value()); - - // Create SourceRecord using partition and offset maps from the S3SourceRecord - results.add(new SourceRecord(record.getPartitionMap(), // Use partition map - 
record.getOffsetMap(), // Use offset map - topic, record.partition(), key.map(SchemaAndValue::schema).orElse(null), - key.map(SchemaAndValue::value).orElse(null), value.schema(), value.value())); + private void handleS3Exception(final AmazonS3Exception amazonS3Exception) throws InterruptedException { + if (amazonS3Exception.isRetryable()) { + LOGGER.warn("Retryable error while polling. Will sleep and try again.", amazonS3Exception); + Thread.sleep(ERROR_BACKOFF); + prepareReaderFromOffsetStorageReader(); + } else { + throw amazonS3Exception; } - - LOGGER.debug("{} records.", results.size()); - return results; } @Override public void stop() { - this.stopped.set(true); + this.connectorStopped.set(true); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java index 5b0cb8ce3..3e2c6c8f3 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java @@ -16,6 +16,9 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.PARTITION; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.TOPIC; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; @@ -27,7 +30,6 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; -import java.net.ConnectException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; @@ -56,31 +58,42 @@ import io.confluent.kafka.serializers.KafkaAvroSerializer; import org.apache.avro.file.DataFileReader; import org.apache.avro.file.SeekableByteArrayInput; -import org.apache.avro.file.SeekableInput; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumReader; import org.apache.avro.io.DecoderFactory; +/** + * Iterator that processes S3 files and creates Kafka source records. Supports different output formats (Avro, JSON, + * Parquet). 
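The iterator derives topic, partition and starting offset from the S3 object key via `DEFAULT_PATTERN` (declared just below). A minimal sketch of how a key in the `<topic>-<partition>-<offset>.<extension>` layout parses; the object key itself is illustrative:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ObjectKeyParseSketch {
    // Same expression as SourceRecordIterator.DEFAULT_PATTERN.
    private static final Pattern DEFAULT_PATTERN = Pattern
            .compile("(?<topic>[^/]+?)-(?<partition>\\d{5})-(?<offset>\\d{12})\\.(?<extension>[^.]+)$");

    public static void main(final String[] args) {
        final String objectKey = "testtopic-00001-000000000231.avro"; // illustrative key
        final Matcher matcher = DEFAULT_PATTERN.matcher(objectKey);
        if (matcher.find()) {
            final String topic = matcher.group("topic");                        // "testtopic"
            final int partition = Integer.parseInt(matcher.group("partition")); // 1
            final long startOffset = Long.parseLong(matcher.group("offset"));   // 231
            System.out.printf("topic=%s partition=%d startOffset=%d%n", topic, partition, startOffset);
        }
    }
}
```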
+ */ @SuppressWarnings("PMD.ExcessiveImports") public final class SourceRecordIterator implements Iterator<S3SourceRecord> { public static final Pattern DEFAULT_PATTERN = Pattern .compile("(?<topic>[^/]+?)-" + "(?<partition>\\d{5})-" + "(?<offset>\\d{12})" + "\\.(?<extension>[^.]+)$"); + public static final String PATTERN_TOPIC_KEY = "topic"; + public static final String PATTERN_PARTITION_KEY = "partition"; + public static final String OFFSET_KEY = "offset"; private String currentKey; + + final ObjectMapper objectMapper = new ObjectMapper(); private Iterator<S3ObjectSummary> nextFileIterator; private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> recordIterator = Collections.emptyIterator(); - private final Map<Map<String, Object>, Map<String, Object>> offsets; + // private final Map<Map<String, Object>, Map<String, Object>> offsets; + + private final OffsetManager offsetManager; private final S3SourceConfig s3SourceConfig; private final String bucketName; private final AmazonS3 s3Client; public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, - final Map<Map<String, Object>, Map<String, Object>> offsets) { + final OffsetManager offsetManager) { this.s3SourceConfig = s3SourceConfig; - this.offsets = Optional.ofNullable(offsets).orElseGet(HashMap::new); + // this.offsets = Optional.ofNullable(offsets).orElseGet(HashMap::new); + this.offsetManager = offsetManager; this.s3Client = s3Client; this.bucketName = bucketName; try { @@ -98,7 +111,7 @@ private List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) thro return new ArrayList<>(objectListing.getObjectSummaries()); } - private void nextObject() { + private void nextS3Object() { if (!nextFileIterator.hasNext()) { recordIterator = Collections.emptyIterator(); return; @@ -119,35 +132,37 @@ private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> createIteratorForCurr final Matcher matcher = DEFAULT_PATTERN.matcher(currentKey); String topic = null; int partition = 0; - long startOffset = 0l; + long startOffset = 0L; if (matcher.find()) { - topic = matcher.group("topic"); - partition = Integer.parseInt(matcher.group("partition")); - startOffset = Long.parseLong(matcher.group("offset")); + topic = matcher.group(PATTERN_TOPIC_KEY); + partition = Integer.parseInt(matcher.group(PATTERN_PARTITION_KEY)); + startOffset = Long.parseLong(matcher.group(OFFSET_KEY)); } final String finalTopic = topic; final int finalPartition = partition; final long finalStartOffset = startOffset; + switch (s3SourceConfig.getString(OUTPUT_FORMAT)) { case AVRO_OUTPUT_FORMAT : final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); DecoderFactory.get().binaryDecoder(content, null); - return getIterator(content, finalTopic, finalPartition, finalStartOffset, datumReader, + return getObjectIterator(content, finalTopic, finalPartition, finalStartOffset, datumReader, AVRO_OUTPUT_FORMAT); case PARQUET_OUTPUT_FORMAT : - return getIterator(content, finalTopic, finalPartition, finalStartOffset, null, + return getObjectIterator(content, finalTopic, finalPartition, finalStartOffset, null, PARQUET_OUTPUT_FORMAT); case JSON_OUTPUT_FORMAT : - return getIterator(content, finalTopic, finalPartition, finalStartOffset, null, JSON_OUTPUT_FORMAT); + return getObjectIterator(content, finalTopic, finalPartition, finalStartOffset, null, + JSON_OUTPUT_FORMAT); default : - return getIterator(content, finalTopic, finalPartition, finalStartOffset, null, ""); + return getObjectIterator(content, 
finalTopic, finalPartition, finalStartOffset, null, ""); } } } - private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> getIterator(final InputStream content, - final String finalTopic, final int finalPartition, final long finalStartOffset, + private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> getObjectIterator(final InputStream content, + final String topic, final int topicPartition, final long startOffset, final DatumReader<GenericRecord> datumReader, final String fileFormat) { return new Iterator<>() { private Map<Map<String, Object>, Long> currentOffsets = new HashMap<>(); @@ -155,12 +170,9 @@ private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> getIterator(final Inp private Optional<ConsumerRecord<byte[], byte[]>> readNext() { try { - Optional<byte[]> key = Optional.empty(); - if (currentKey != null) { - key = Optional.of(currentKey.getBytes(StandardCharsets.UTF_8)); - } - byte[] value; - value = getBytes(fileFormat, content, datumReader, finalTopic); + final Optional<byte[]> key = Optional.ofNullable(currentKey) + .map(k -> k.getBytes(StandardCharsets.UTF_8)); + final byte[] value = getValueBytes(fileFormat, content, datumReader, topic); if (value == null) { if (key.isPresent()) { @@ -176,52 +188,28 @@ private Optional<ConsumerRecord<byte[], byte[]>> readNext() { } } - // private Optional<ConsumerRecord<byte[], byte[]>> getConsumerRecord(final Optional<byte[]> key, - // final byte[] value) { - // final OffsetStoragePartitionKey offsetStoragePartitionKey = OffsetStoragePartitionKey - // .fromPartitionMap(bucketName, finalTopic, finalPartition); - // - // long currentOffset; - // if (offsets.containsKey(offsetStoragePartitionKey)) { - // final OffsetStoragePartitionValue offsetStoragePartitionValue = offsets - // .get(offsetStoragePartitionKey); - // currentOffset = offsetStoragePartitionValue.getOffset() + 1; - // } else { - // currentOffset = currentOffsets.getOrDefault(offsetStoragePartitionKey, finalStartOffset); - // } - // final Optional<ConsumerRecord<byte[], byte[]>> record = Optional - // .of(new ConsumerRecord<>(finalTopic, finalPartition, currentOffset, key.orElse(null), value)); - // currentOffsets.put(offsetStoragePartitionKey, currentOffset + 1); - // return record; - // } - private Optional<ConsumerRecord<byte[], byte[]>> getConsumerRecord(final Optional<byte[]> key, final byte[] value) { - // Create a map to represent the partition information final Map<String, Object> partitionMap = new HashMap<>(); - partitionMap.put("bucket", bucketName); - partitionMap.put("topic", finalTopic); - partitionMap.put("partition", finalPartition); + partitionMap.put(BUCKET, bucketName); + partitionMap.put(TOPIC, topic); + partitionMap.put(PARTITION, topicPartition); long currentOffset; - // Check if the partition is present in the offsets map - if (offsets.containsKey(partitionMap)) { - // Retrieve the offset map and extract the offset value - final Map<String, Object> offsetMap = offsets.get(partitionMap); - currentOffset = (long) offsetMap.get("offset") + 1; // Assuming "offset" is the key for the offset - // value + if (offsetManager.getOffsets().containsKey(partitionMap)) { + final Map<String, Object> offsetMap = offsetManager.getOffsetForPartition(partitionMap); + currentOffset = (long) offsetMap.get(OFFSET_KEY) + 1; } else { - // If not present in offsets, check currentOffsets or use the finalStartOffset - currentOffset = currentOffsets.getOrDefault(partitionMap, finalStartOffset); + currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); } - // 
Create the ConsumerRecord final Optional<ConsumerRecord<byte[], byte[]>> record = Optional - .of(new ConsumerRecord<>(finalTopic, finalPartition, currentOffset, key.orElse(null), value)); + .of(new ConsumerRecord<>(topic, topicPartition, currentOffset, key.orElse(null), value)); - // Update currentOffsets with the next offset - currentOffsets.put(partitionMap, currentOffset + 1); + final Map<String, Object> newOffset = new HashMap<>(); + newOffset.put(OFFSET_KEY, currentOffset + 1); + offsetManager.updateOffset(partitionMap, newOffset); return record; } @@ -243,55 +231,48 @@ public Optional<ConsumerRecord<byte[], byte[]>> next() { }; } - private byte[] getBytes(final String fileFormat, final InputStream content, + private byte[] getValueBytes(final String fileFormat, final InputStream content, final DatumReader<GenericRecord> datumReader, final String topicName) throws IOException { - byte[] value; - if (fileFormat.equals(AVRO_OUTPUT_FORMAT)) { - List<GenericRecord> items; - try (SeekableInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { - try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { - items = new ArrayList<>(); - reader.forEach(items::add); - } - } - value = serializeAvroRecordToBytes(items, topicName); - } else if (fileFormat.equals(JSON_OUTPUT_FORMAT)) { - value = serializeJsonData(content); + if (AVRO_OUTPUT_FORMAT.equals(fileFormat)) { + return serializeAvroRecordToBytes(readAvroRecords(content, datumReader), topicName); + } else if (JSON_OUTPUT_FORMAT.equals(fileFormat)) { + return serializeJsonData(content); } else { - value = IOUtils.toByteArray(content); + return IOUtils.toByteArray(content); } - return value; + } + + private List<GenericRecord> readAvroRecords(final InputStream content, final DatumReader<GenericRecord> datumReader) + throws IOException { + final List<GenericRecord> records = new ArrayList<>(); + try (SeekableByteArrayInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { + try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { + reader.forEach(records::add); + } + } + return records; } private byte[] serializeJsonData(final InputStream inputStream) throws IOException { - final ObjectMapper objectMapper = new ObjectMapper(); final JsonNode jsonNode = objectMapper.readTree(inputStream); return objectMapper.writeValueAsBytes(jsonNode); } @Deprecated - private byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String finalTopic) + private byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String topic) throws IOException { - // Create a map to configure the Avro serializer - final Map<String, String> config = new HashMap<>(); - config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, + s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) - .newInstance()) { - avroSerializer.configure(config, false); // `false` since this is for value serialization - // Use a ByteArrayOutputStream to combine the serialized records - final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); - - // Loop through each Avro record and serialize it + .newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { + avroSerializer.configure(config, false); for (final GenericRecord avroRecord : 
avroRecords) { - final byte[] serializedRecord = avroSerializer.serialize(finalTopic, avroRecord); - outputStream.write(serializedRecord); + out.write(avroSerializer.serialize(topic, avroRecord)); } - - // Convert the combined output stream to a byte array and return it - return outputStream.toByteArray(); + return out.toByteArray(); } catch (InstantiationException | IllegalAccessException e) { - throw new ConnectException("Could not create instance of serializer."); + throw new IllegalStateException("Failed to initialize serializer", e); } } @@ -299,38 +280,43 @@ private InputStream getContent(final S3Object object) { return object.getObjectContent(); } + // @Override + // public boolean hasNext() { + // while (!recordIterator.hasNext() && nextFileIterator.hasNext()) { + // nextS3Object(); + // } + // return recordIterator.hasNext(); + // } + @Override public boolean hasNext() { - while (!recordIterator.hasNext() && nextFileIterator.hasNext()) { - nextObject(); - } - return recordIterator.hasNext(); + return recordIterator.hasNext() || nextFileIterator.hasNext(); } @Override public S3SourceRecord next() { - if (!hasNext()) { + if (!recordIterator.hasNext()) { + nextS3Object(); + } + + final Optional<ConsumerRecord<byte[], byte[]>> consumerRecord = recordIterator.next(); + if (consumerRecord.isEmpty()) { throw new NoSuchElementException(); } - final ConsumerRecord<byte[], byte[]> record = recordIterator.next().get(); - // Create the partition map + final ConsumerRecord<byte[], byte[]> currentRecord = consumerRecord.get(); + final Map<String, Object> partitionMap = new HashMap<>(); - partitionMap.put("bucket", bucketName); - partitionMap.put("topic", record.topic()); - partitionMap.put("partition", record.partition()); + partitionMap.put(BUCKET, bucketName); + partitionMap.put(TOPIC, currentRecord.topic()); + partitionMap.put(PARTITION, currentRecord.partition()); // Create the offset map final Map<String, Object> offsetMap = new HashMap<>(); - offsetMap.put("offset", record.offset()); - - return new S3SourceRecord(partitionMap, // Use the partition map - offsetMap, // Use the offset map - record.topic(), // Topic - record.partition(), // Partition - record.key(), // Record key - record.value() // Record value - ); + offsetMap.put(OFFSET_KEY, currentRecord.offset()); + + return new S3SourceRecord(partitionMap, offsetMap, currentRecord.topic(), currentRecord.partition(), + currentRecord.key(), currentRecord.value()); } @Override From 0687becc689e00e016cccbde898b31d2a7b34444 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Tue, 1 Oct 2024 09:15:33 +0200 Subject: [PATCH 22/90] Refactor classes --- gradle-config/spotbugs-exclude.xml | 10 ++-- .../AivenKafkaConnectS3SourceConnector.java | 1 + .../kafka/connect/s3/source/S3SourceTask.java | 5 ++ .../connect/s3/source/utils/FileReader.java | 55 +++++++++++++++++++ .../s3/source/{ => utils}/OffsetManager.java | 22 +++----- .../source/{ => utils}/RecordProcessor.java | 2 +- .../s3/source/{ => utils}/S3SourceRecord.java | 2 +- .../{ => utils}/SourceRecordIterator.java | 41 +++----------- .../s3/source/{ => utils}/Version.java | 6 +- 9 files changed, 87 insertions(+), 57 deletions(-) create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{ => utils}/OffsetManager.java (89%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{ => utils}/RecordProcessor.java (98%) rename 
s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{ => utils}/S3SourceRecord.java (97%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{ => utils}/SourceRecordIterator.java (88%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{ => utils}/Version.java (90%) diff --git a/gradle-config/spotbugs-exclude.xml b/gradle-config/spotbugs-exclude.xml index 69e2343b2..362069917 100644 --- a/gradle-config/spotbugs-exclude.xml +++ b/gradle-config/spotbugs-exclude.xml @@ -20,23 +20,23 @@ <Bug pattern="CT_CONSTRUCTOR_THROW" /> </Match> <Match> - <Class name="io.aiven.kafka.connect.s3.source.SourceRecordIterator$1" /> + <Class name="io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator$1" /> <Bug pattern="CT_CONSTRUCTOR_THROW" /> </Match> <Match> - <Class name="io.aiven.kafka.connect.s3.source.SourceRecordIterator" /> + <Class name="io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator" /> <Bug pattern="EI_EXPOSE_REP2" /> </Match> <Match> - <Class name="io.aiven.kafka.connect.s3.source.S3SourceRecord" /> + <Class name="io.aiven.kafka.connect.s3.source.utils.S3SourceRecord" /> <Bug pattern="EI_EXPOSE_REP2" /> </Match> <Match> - <Class name="io.aiven.kafka.connect.s3.source.S3SourceRecord" /> + <Class name="io.aiven.kafka.connect.s3.source.utils.S3SourceRecord" /> <Bug pattern="EI_EXPOSE_REP" /> </Match> <Match> - <Class name="io.aiven.kafka.connect.s3.source.OffsetManager" /> + <Class name="io.aiven.kafka.connect.s3.source.utils.OffsetManager" /> <Bug pattern="EI_EXPOSE_REP" /> </Match> diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java index 308ea39da..65b25235e 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java @@ -26,6 +26,7 @@ import org.apache.kafka.connect.source.SourceConnector; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.Version; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 063ace0c6..f1e512f49 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -33,6 +33,11 @@ import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; +import io.aiven.kafka.connect.s3.source.utils.RecordProcessor; +import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; +import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; +import io.aiven.kafka.connect.s3.source.utils.Version; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.AmazonS3Exception; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java new file mode 100644 index 000000000..2dfcad3ea --- /dev/null +++ 
b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -0,0 +1,55 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.utils; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.ListObjectsRequest; +import com.amazonaws.services.s3.model.ObjectListing; +import com.amazonaws.services.s3.model.S3Object; +import com.amazonaws.services.s3.model.S3ObjectSummary; + +public class FileReader { + + private final S3SourceConfig s3SourceConfig; + private final String bucketName; + + public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName) { + this.s3SourceConfig = s3SourceConfig; + this.bucketName = bucketName; + } + + List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOException { + final ObjectListing objectListing = s3Client.listObjects(new ListObjectsRequest().withBucketName(bucketName) + .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * 2)); + + return new ArrayList<>(objectListing.getObjectSummaries()); + } + + InputStream getContent(final S3Object object) { + return object.getObjectContent(); + } + +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java similarity index 89% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetManager.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index 3790d0fde..c84dd5254 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -14,11 +14,12 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source; +package io.aiven.kafka.connect.s3.source.utils; import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET; import static io.aiven.kafka.connect.s3.source.S3SourceTask.PARTITION; import static io.aiven.kafka.connect.s3.source.S3SourceTask.TOPIC; +import static io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator.OFFSET_KEY; import static java.util.stream.Collectors.toMap; import java.util.ArrayList; @@ -82,15 +83,8 @@ public Map<Map<String, Object>, Map<String, Object>> getOffsets() { return offsets; } - /** - * Get the current offset for a specific partition. - * - * @param partitionMap - * The partition map containing bucket, topic, partition, etc. - * @return The offset for the given partition, or null if no offset exists. 
- */ - public Map<String, Object> getOffsetForPartition(final Map<String, Object> partitionMap) { - return offsets.get(partitionMap); + public long getIncrementedOffsetForPartition(final Map<String, Object> partitionMap) { + return (long) (offsets.get(partitionMap)).get(OFFSET_KEY) + 1L; } /** @@ -98,12 +92,12 @@ public Map<String, Object> getOffsetForPartition(final Map<String, Object> parti * * @param partitionMap * The partition map. - * @param newOffset - * The new offset to be updated. */ - public void updateOffset(final Map<String, Object> partitionMap, final Map<String, Object> newOffset) { + public void updateOffset(final Map<String, Object> partitionMap, final long currentOffset) { + final Map<String, Object> newOffset = new HashMap<>(); + // increment offset id by 1 + newOffset.put(OFFSET_KEY, currentOffset + 1); offsets.put(partitionMap, newOffset); - // You can persist offsets here if needed } /** diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java similarity index 98% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/RecordProcessor.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index d1e1ab2e7..59429db8a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source; +package io.aiven.kafka.connect.s3.source.utils; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java similarity index 97% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java index 1b9478bb0..9ed47ac9e 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source; +package io.aiven.kafka.connect.s3.source.utils; import java.util.Arrays; import java.util.Map; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java similarity index 88% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 3e2c6c8f3..538437e34 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -14,13 +14,12 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.s3.source; +package io.aiven.kafka.connect.s3.source.utils; import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET; import static io.aiven.kafka.connect.s3.source.S3SourceTask.PARTITION; import static io.aiven.kafka.connect.s3.source.S3SourceTask.TOPIC; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.PARQUET_OUTPUT_FORMAT; @@ -48,8 +47,6 @@ import com.amazonaws.AmazonClientException; import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.model.ListObjectsRequest; -import com.amazonaws.services.s3.model.ObjectListing; import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectSummary; import com.amazonaws.util.IOUtils; @@ -81,36 +78,29 @@ public final class SourceRecordIterator implements Iterator<S3SourceRecord> { private Iterator<S3ObjectSummary> nextFileIterator; private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> recordIterator = Collections.emptyIterator(); - // private final Map<Map<String, Object>, Map<String, Object>> offsets; - private final OffsetManager offsetManager; private final S3SourceConfig s3SourceConfig; private final String bucketName; private final AmazonS3 s3Client; + private final FileReader fileReader; + public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, final OffsetManager offsetManager) { this.s3SourceConfig = s3SourceConfig; - // this.offsets = Optional.ofNullable(offsets).orElseGet(HashMap::new); this.offsetManager = offsetManager; this.s3Client = s3Client; this.bucketName = bucketName; + this.fileReader = new FileReader(s3SourceConfig, bucketName); try { - final List<S3ObjectSummary> chunks = fetchObjectSummaries(s3Client); + final List<S3ObjectSummary> chunks = fileReader.fetchObjectSummaries(s3Client); nextFileIterator = chunks.iterator(); } catch (IOException e) { throw new AmazonClientException("Failed to initialize S3 file reader", e); } } - private List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOException { - final ObjectListing objectListing = s3Client.listObjects(new ListObjectsRequest().withBucketName(bucketName) - .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * 2)); - - return new ArrayList<>(objectListing.getObjectSummaries()); - } - private void nextS3Object() { if (!nextFileIterator.hasNext()) { recordIterator = Collections.emptyIterator(); @@ -128,7 +118,7 @@ private void nextS3Object() { private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentFile() throws IOException { final S3Object s3Object = s3Client.getObject(bucketName, currentKey); - try (InputStream content = getContent(s3Object)) { + try (InputStream content = fileReader.getContent(s3Object)) { final Matcher matcher = DEFAULT_PATTERN.matcher(currentKey); String topic = null; int partition = 0; @@ -198,8 +188,7 @@ private Optional<ConsumerRecord<byte[], byte[]>> getConsumerRecord(final Optiona long currentOffset; if (offsetManager.getOffsets().containsKey(partitionMap)) { - final Map<String, Object> offsetMap = offsetManager.getOffsetForPartition(partitionMap); - currentOffset = (long) 
offsetMap.get(OFFSET_KEY) + 1; + currentOffset = offsetManager.getIncrementedOffsetForPartition(partitionMap); } else { currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); } @@ -207,9 +196,7 @@ private Optional<ConsumerRecord<byte[], byte[]>> getConsumerRecord(final Optiona final Optional<ConsumerRecord<byte[], byte[]>> record = Optional .of(new ConsumerRecord<>(topic, topicPartition, currentOffset, key.orElse(null), value)); - final Map<String, Object> newOffset = new HashMap<>(); - newOffset.put(OFFSET_KEY, currentOffset + 1); - offsetManager.updateOffset(partitionMap, newOffset); + offsetManager.updateOffset(partitionMap, currentOffset); return record; } @@ -276,18 +263,6 @@ private byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, } } - private InputStream getContent(final S3Object object) { - return object.getObjectContent(); - } - - // @Override - // public boolean hasNext() { - // while (!recordIterator.hasNext() && nextFileIterator.hasNext()) { - // nextS3Object(); - // } - // return recordIterator.hasNext(); - // } - @Override public boolean hasNext() { return recordIterator.hasNext() || nextFileIterator.hasNext(); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/Version.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/Version.java similarity index 90% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/Version.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/Version.java index 2ee4feb44..1d4dcb33d 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/Version.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/Version.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.s3.source; +package io.aiven.kafka.connect.s3.source.utils; import java.io.InputStream; import java.util.Properties; @@ -22,12 +22,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -final class Version { +public final class Version { private static final Logger LOGGER = LoggerFactory.getLogger(Version.class); private static final String PROPERTIES_FILENAME = "s3-source-connector-for-apache-kafka-version.properties"; - static final String VERSION; // NOPMD AvoidFieldNameMatchingTypeName + public static final String VERSION; // NOPMD AvoidFieldNameMatchingTypeName static { final Properties props = new Properties(); From 914c2cd78d1447f9c0cf5d75e564c13650bc85e0 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Thu, 3 Oct 2024 15:56:12 +0200 Subject: [PATCH 23/90] Adding parquet --- gradle-config/spotbugs-exclude.xml | 4 +- s3-source-connector/build.gradle.kts | 46 +++++++ .../connect/s3/source/IntegrationTest.java | 74 +++++++++-- .../kafka/connect/s3/source/S3SourceTask.java | 4 +- ...ceRecord.java => AivenS3SourceRecord.java} | 4 +- .../s3/source/utils/OffsetManager.java | 12 -- .../connect/s3/source/utils/ParquetUtils.java | 102 ++++++++++++++++ .../s3/source/utils/RecordProcessor.java | 51 +++++--- .../s3/source/utils/SourceRecordIterator.java | 115 +++++++++++------- 9 files changed, 320 insertions(+), 92 deletions(-) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/{S3SourceRecord.java => AivenS3SourceRecord.java} (92%) create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java diff --git a/gradle-config/spotbugs-exclude.xml b/gradle-config/spotbugs-exclude.xml index 362069917..0ab87e082 100644 --- a/gradle-config/spotbugs-exclude.xml +++ b/gradle-config/spotbugs-exclude.xml @@ -28,11 +28,11 @@ <Bug pattern="EI_EXPOSE_REP2" /> </Match> <Match> - <Class name="io.aiven.kafka.connect.s3.source.utils.S3SourceRecord" /> + <Class name="io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord" /> <Bug pattern="EI_EXPOSE_REP2" /> </Match> <Match> - <Class name="io.aiven.kafka.connect.s3.source.utils.S3SourceRecord" /> + <Class name="io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord" /> <Bug pattern="EI_EXPOSE_REP" /> </Match> <Match> diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts index 9e699e72b..73c952f42 100644 --- a/s3-source-connector/build.gradle.kts +++ b/s3-source-connector/build.gradle.kts @@ -20,6 +20,7 @@ plugins { id("aiven-apache-kafka-connectors-all.java-conventions") } val amazonS3Version by extra("1.12.729") val amazonSTSVersion by extra("1.12.729") +val parquetVersion by extra("1.11.2") val integrationTest: SourceSet = sourceSets.create("integrationTest") { @@ -73,6 +74,12 @@ dependencies { implementation(confluent.kafka.connect.avro.converter) { exclude(group = "org.apache.kafka", module = "kafka-clients") } + implementation(apache.parquet.tools) + implementation(apache.parquet.avro) { + exclude(group = "org.xerial.snappy", module = "snappy-java") + exclude(group = "org.slf4j", module = "slf4j-api") + exclude(group = "org.apache.avro", module = "avro") + } testImplementation(compressionlibs.snappy) testImplementation(compressionlibs.zstd.jni) @@ -88,6 +95,45 @@ dependencies { testRuntimeOnly(testinglibs.junit.jupiter.engine) testImplementation(testinglibs.mockito.junit.jupiter) + // implementation(apache.hadoop.common) + + implementation(apache.hadoop.common) { + 
exclude(group = "org.apache.hadoop", module = "hadoop-yarn-client") + exclude(group = "org.apache.hadoop.thirdparty", module = "hadoop-shaded-protobuf_3_7") + exclude(group = "com.google.guava", module = "guava") + exclude(group = "commons-cli", module = "commons-cli") + exclude(group = "org.apache.commons", module = "commons-math3") + exclude(group = "org.apache.httpcomponents", module = "httpclient") + exclude(group = "commons-codec", module = "commons-codec") + exclude(group = "commons-io", module = "commons-io") + exclude(group = "commons-net", module = "commons-net") + exclude(group = "org.eclipse.jetty") + exclude(group = "org.eclipse.jetty.websocket") + exclude(group = "javax.servlet") + exclude(group = "javax.servlet.jsp") + exclude(group = "javax.activation") + exclude(group = "com.sun.jersey") + exclude(group = "log4j") + exclude(group = "org.apache.commons", module = "commons-text") + exclude(group = "org.slf4j", module = "slf4j-api") + // exclude(group = "org.apache.hadoop", module = "hadoop-auth") + exclude(group = "org.apache.hadoop", module = "hadoop-yarn-api") + exclude(group = "com.google.re2j") + exclude(group = "com.google.protobuf") + exclude(group = "com.google.code.gson") + exclude(group = "com.jcraft") + exclude(group = "org.apache.curator") + exclude(group = "org.apache.zookeeper") + exclude(group = "org.apache.htrace") + exclude(group = "com.google.code.findbugs") + exclude(group = "org.apache.kerby") + exclude(group = "com.fasterxml.jackson.core") + exclude(group = "com.fasterxml.woodstox", module = "woodstox-core:5.0.3") + exclude(group = "org.apache.avro", module = "avro") + exclude(group = "org.apache.hadoop", module = "hadoop-yarn-common") + exclude(group = "com.google.inject.extensions", module = "guice-servlet") + exclude(group = "io.netty", module = "netty") + } testRuntimeOnly(logginglibs.logback.classic) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 6419579d0..924ffdfd0 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -35,6 +35,7 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.net.ConnectException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; @@ -61,6 +62,9 @@ import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumWriter; import org.apache.commons.io.IOUtils; +import org.apache.parquet.avro.AvroParquetWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.junit.Ignore; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -229,27 +233,34 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc } @Test - @Ignore - void parquetTest(final TestInfo testInfo) throws ExecutionException, InterruptedException { + void parquetTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectorConfig.put(OUTPUT_FORMAT, PARQUET_OUTPUT_FORMAT); connectorConfig.put(SCHEMA_REGISTRY_URL, 
SCHEMA_REGISTRY.getSchemaRegistryUrl()); + connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); + connectorConfig.put("value.converter.schema.registry.url", SCHEMA_REGISTRY.getSchemaRegistryUrl()); final String partition = "00000"; final String offset = "000000000123"; final String fileName = topicName + "-" + partition + "-" + offset + ".txt"; connectRunner.createConnector(connectorConfig); - try (InputStream resourceStream = Thread.currentThread() - .getContextClassLoader() - .getResourceAsStream("sample1.parquet")) { - s3Client.putObject(TEST_BUCKET_NAME, fileName, resourceStream, null); - } catch (final Exception e) { // NOPMD broad exception catched + final String tmpFilePath = "/tmp/users.parquet"; + final String name1 = "Alice"; + final String name2 = "Bob"; + writeParquetFile(tmpFilePath, name1, name2); + final Path path = Paths.get(tmpFilePath); + try { + s3Client.putObject(TEST_BUCKET_NAME, fileName, Files.newInputStream(path), null); + } catch (final Exception e) { // NOPMD broad exception caught LOGGER.error("Error in reading file" + e.getMessage()); } - // TODO - assertThat(1).isEqualTo(1); + Files.delete(path); + final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 2, KAFKA_CONTAINER, + SCHEMA_REGISTRY.getSchemaRegistryUrl()); + assertThat(2).isEqualTo(records.size()); + assertThat(records).extracting(record -> record.get("name").toString()).contains(name1).contains(name2); } @Test @@ -298,6 +309,7 @@ private static void writeToS3(final String topicName, final byte[] testDataBytes Files.write(testFilePath, testDataBytes); saveToS3(TEST_BUCKET_NAME, "", fileName, testFilePath.toFile()); + Files.delete(testFilePath); } @Deprecated @@ -342,4 +354,48 @@ public void multipartUpload(final String bucketName, final String key) { } } + public static void writeParquetFile(final String tempFilePath, final String name1, final String name2) + throws IOException { + // Define the Avro schema + final String schemaString = "{" + "\"type\":\"record\"," + "\"name\":\"User\"," + "\"fields\":[" + + "{\"name\":\"name\",\"type\":\"string\"}," + "{\"name\":\"age\",\"type\":\"int\"}," + + "{\"name\":\"email\",\"type\":\"string\"}" + "]" + "}"; + final Schema schema = new Schema.Parser().parse(schemaString); + + // Write the Parquet file + try { + writeParquetFile(tempFilePath, schema, name1, name2); + } catch (IOException e) { + throw new ConnectException("Error writing parquet file"); + } + } + + private static void writeParquetFile(final String outputPath, final Schema schema, final String name1, + final String name2) throws IOException { + + // Create sample records + final GenericData.Record user1 = new GenericData.Record(schema); + user1.put("name", name1); + user1.put("age", 30); + user1.put("email", "alice@example.com"); + + final GenericData.Record user2 = new GenericData.Record(schema); + user2.put("name", name2); + user2.put("age", 25); + user2.put("email", "bob@example.com"); + + // Create a Parquet writer + final org.apache.hadoop.fs.Path path = new org.apache.hadoop.fs.Path(outputPath); // NOPMD + try (ParquetWriter<GenericData.Record> writer = AvroParquetWriter.<GenericData.Record>builder(path) + .withSchema(schema) + .withCompressionCodec(CompressionCodecName.SNAPPY) // You can choose GZIP, LZO, etc. 
+ .withRowGroupSize(100 * 1024) // Customize row group size + .withPageSize(1024 * 1024) // Customize page size + .build()) { + // Write records to the Parquet file + writer.write(user1); + writer.write(user2); + } + } + } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index f1e512f49..d3351de1e 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -33,9 +33,9 @@ import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import io.aiven.kafka.connect.s3.source.utils.RecordProcessor; -import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; import io.aiven.kafka.connect.s3.source.utils.Version; @@ -62,7 +62,7 @@ public class S3SourceTask extends SourceTask { private S3SourceConfig s3SourceConfig; private AmazonS3 s3Client; - private Iterator<S3SourceRecord> sourceRecordIterator; + private Iterator<List<AivenS3SourceRecord>> sourceRecordIterator; private Optional<Converter> keyConverter; private Converter valueConverter; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java similarity index 92% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java index 9ed47ac9e..e3ea1bc77 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java @@ -19,7 +19,7 @@ import java.util.Arrays; import java.util.Map; -public class S3SourceRecord { +public class AivenS3SourceRecord { private final Map<String, Object> partitionMap; private final Map<String, Object> offsetMap; private final String toTopic; @@ -27,7 +27,7 @@ public class S3SourceRecord { private final byte[] recordKey; private final byte[] recordValue; - public S3SourceRecord(final Map<String, Object> partitionMap, final Map<String, Object> offsetMap, + public AivenS3SourceRecord(final Map<String, Object> partitionMap, final Map<String, Object> offsetMap, final String toTopic, final int topicPartition, final byte[] recordKey, final byte[] recordValue) { this.partitionMap = partitionMap; this.offsetMap = offsetMap; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index c84dd5254..3343702c2 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -55,18 +55,6 @@ public OffsetManager(final SourceTaskContext context, final S3SourceConfig s3Sou final List<Map<String, Object>> partitionKeys = buildPartitionKeys(s3Bucket, partitions, topics); final Map<Map<String, Object>, 
Map<String, Object>> offsetMap = context.offsetStorageReader() .offsets(partitionKeys); - // Map<String, Object> partitionMapK = new HashMap<>(); - // partitionMapK.put("bucket", s3Bucket); - // partitionMapK.put("topic", "basicTest"); - // partitionMapK.put("partition", 0); - // - // Map<String, Object> partitionMapV = new HashMap<>(); - // partitionMapV.put("offset", 123l); - // - // offsetMap = context.offsetStorageReader() - // .offsets(partitionKeys); - // - // offsetMap.put(partitionMapK, partitionMapV); this.offsets = offsetMap.entrySet() .stream() diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java new file mode 100644 index 000000000..a3dc9807c --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java @@ -0,0 +1,102 @@ +/* + * Copyright 2021 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.utils; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.channels.Channels; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; + +import org.apache.kafka.connect.errors.ConnectException; + +import org.apache.avro.generic.GenericRecord; +import org.apache.parquet.avro.AvroParquetReader; +import org.apache.parquet.io.DelegatingSeekableInputStream; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.SeekableInputStream; + +final class ParquetUtils { + + public static final String TMP_DIR = "/tmp"; + public static final int BUFFER_SIZE = 8192; + + private ParquetUtils() { + /* hide constructor */ } + + static List<GenericRecord> getRecords(final InputStream inputStream, final String topic) throws IOException { + final Path tmpDir = Paths.get(TMP_DIR); + + final String timestamp = String.valueOf(Instant.now().toEpochMilli()); + final Path parquetFile = tmpDir.resolve(topic + "_" + timestamp + ".parquet"); + + // Write the byte array to a file + try (OutputStream outputStream = Files.newOutputStream(parquetFile)) { + final byte[] buffer = new byte[BUFFER_SIZE]; + + int bytesRead = inputStream.read(buffer); + while (bytesRead != -1) { + outputStream.write(buffer, 0, bytesRead); // Write buffer to file + bytesRead = inputStream.read(buffer); + } + } catch (IOException e) { + throw new ConnectException("Error writing tmp parquet file", e); + } + + final var records = new ArrayList<GenericRecord>(); + final var seekableByteChannel = Files.newByteChannel(parquetFile); + try (var parquetReader = AvroParquetReader.<GenericRecord>builder(new InputFile() { + @Override + public long getLength() throws IOException { + return seekableByteChannel.size(); + } + + @Override + public SeekableInputStream newStream() { + return new 
DelegatingSeekableInputStream(Channels.newInputStream(seekableByteChannel)) { + @Override + public long getPos() throws IOException { + return seekableByteChannel.position(); + } + + @Override + public void seek(final long value) throws IOException { + seekableByteChannel.position(value); + } + }; + } + + }).withCompatibility(false).build()) { + var record = parquetReader.read(); + while (record != null) { + records.add(record); + record = parquetReader.read(); + } + } + if (Files.exists(parquetFile)) { + Files.delete(parquetFile); + } + + return records; + } + +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 59429db8a..871c1cd69 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -18,8 +18,10 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.PARQUET_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; +import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -35,10 +37,12 @@ public final class RecordProcessor { + public static final String SCHEMAS_ENABLE = "schemas.enable"; + private RecordProcessor() { } - public static List<SourceRecord> processRecords(final Iterator<S3SourceRecord> sourceRecordIterator, + public static List<SourceRecord> processRecords(final Iterator<List<AivenS3SourceRecord>> sourceRecordIterator, final List<SourceRecord> results, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, final AtomicBoolean connectorStopped) { @@ -47,38 +51,47 @@ public static List<SourceRecord> processRecords(final Iterator<S3SourceRecord> s final int maxPollRecords = s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS); for (int i = 0; sourceRecordIterator.hasNext() && i < maxPollRecords && !connectorStopped.get(); i++) { - final S3SourceRecord record = sourceRecordIterator.next(); - final SourceRecord sourceRecord = createSourceRecord(record, s3SourceConfig, keyConverter, valueConverter, - conversionConfig); - results.add(sourceRecord); + final List<AivenS3SourceRecord> recordList = sourceRecordIterator.next(); + final List<SourceRecord> sourceRecords = createSourceRecords(recordList, s3SourceConfig, keyConverter, + valueConverter, conversionConfig); + results.addAll(sourceRecords); } return results; } - private static SourceRecord createSourceRecord(final S3SourceRecord record, final S3SourceConfig s3SourceConfig, - final Optional<Converter> keyConverter, final Converter valueConverter, + @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") + private static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRecord> aivenS3SourceRecordList, + final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, final Map<String, String> conversionConfig) { - final String topic = record.getToTopic(); - final Optional<SchemaAndValue> keyData = keyConverter.map(c -> c.toConnectData(topic, record.key())); - - 
configureValueConverter(s3SourceConfig.getString(S3SourceConfig.OUTPUT_FORMAT), conversionConfig, - s3SourceConfig); - valueConverter.configure(conversionConfig, false); - final SchemaAndValue value = valueConverter.toConnectData(topic, record.value()); + final List<SourceRecord> sourceRecordList = new ArrayList<>(); + for (final AivenS3SourceRecord aivenS3SourceRecord : aivenS3SourceRecordList) { + final String topic = aivenS3SourceRecord.getToTopic(); + final Optional<SchemaAndValue> keyData = keyConverter + .map(c -> c.toConnectData(topic, aivenS3SourceRecord.key())); + + configureValueConverter(s3SourceConfig.getString(S3SourceConfig.OUTPUT_FORMAT), conversionConfig, + s3SourceConfig); + valueConverter.configure(conversionConfig, false); + final SchemaAndValue value = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); + + final SourceRecord sourceRecord = new SourceRecord(aivenS3SourceRecord.getPartitionMap(), + aivenS3SourceRecord.getOffsetMap(), topic, aivenS3SourceRecord.partition(), + keyData.map(SchemaAndValue::schema).orElse(null), keyData.map(SchemaAndValue::value).orElse(null), + value.schema(), value.value()); + sourceRecordList.add(sourceRecord); + } - return new SourceRecord(record.getPartitionMap(), record.getOffsetMap(), topic, record.partition(), - keyData.map(SchemaAndValue::schema).orElse(null), keyData.map(SchemaAndValue::value).orElse(null), - value.schema(), value.value()); + return sourceRecordList; } private static void configureValueConverter(final String outputFormat, final Map<String, String> config, final S3SourceConfig s3SourceConfig) { - if (AVRO_OUTPUT_FORMAT.equals(outputFormat)) { + if (AVRO_OUTPUT_FORMAT.equals(outputFormat) || PARQUET_OUTPUT_FORMAT.equals(outputFormat)) { config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); } else if (JSON_OUTPUT_FORMAT.equals(outputFormat)) { - config.put("schemas.enable", "false"); + config.put(SCHEMAS_ENABLE, "false"); } } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 538437e34..d69b2601c 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -65,7 +65,7 @@ * Parquet). 
*/ @SuppressWarnings("PMD.ExcessiveImports") -public final class SourceRecordIterator implements Iterator<S3SourceRecord> { +public final class SourceRecordIterator implements Iterator<List<AivenS3SourceRecord>> { public static final Pattern DEFAULT_PATTERN = Pattern .compile("(?<topic>[^/]+?)-" + "(?<partition>\\d{5})-" + "(?<offset>\\d{12})" + "\\.(?<extension>[^.]+)$"); @@ -76,7 +76,7 @@ public final class SourceRecordIterator implements Iterator<S3SourceRecord> { final ObjectMapper objectMapper = new ObjectMapper(); private Iterator<S3ObjectSummary> nextFileIterator; - private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> recordIterator = Collections.emptyIterator(); + private Iterator<List<ConsumerRecord<byte[], byte[]>>> recordIterator = Collections.emptyIterator(); private final OffsetManager offsetManager; @@ -116,7 +116,7 @@ private void nextS3Object() { } } - private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentFile() throws IOException { + private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentFile() throws IOException { final S3Object s3Object = s3Client.getObject(bucketName, currentKey); try (InputStream content = fileReader.getContent(s3Object)) { final Matcher matcher = DEFAULT_PATTERN.matcher(currentKey); @@ -151,35 +151,60 @@ private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> createIteratorForCurr } } - private Iterator<Optional<ConsumerRecord<byte[], byte[]>>> getObjectIterator(final InputStream content, + @SuppressWarnings("PMD.CognitiveComplexity") + private Iterator<List<ConsumerRecord<byte[], byte[]>>> getObjectIterator(final InputStream inputStream, final String topic, final int topicPartition, final long startOffset, final DatumReader<GenericRecord> datumReader, final String fileFormat) { return new Iterator<>() { private Map<Map<String, Object>, Long> currentOffsets = new HashMap<>(); - private Optional<ConsumerRecord<byte[], byte[]>> nextRecord = readNext(); + private List<ConsumerRecord<byte[], byte[]>> nextRecord = readNext(); - private Optional<ConsumerRecord<byte[], byte[]>> readNext() { + private List<ConsumerRecord<byte[], byte[]>> readNext() { try { final Optional<byte[]> key = Optional.ofNullable(currentKey) .map(k -> k.getBytes(StandardCharsets.UTF_8)); - final byte[] value = getValueBytes(fileFormat, content, datumReader, topic); + final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = new ArrayList<>(); + handleValueData(key, consumerRecordList); - if (value == null) { - if (key.isPresent()) { - throw new IllegalStateException("missing value for key!" 
+ key); - } - return Optional.empty(); - } + return consumerRecordList; - return getConsumerRecord(key, value); } catch (IOException e) { throw new org.apache.kafka.connect.errors.ConnectException( "Connect converters could not be instantiated.", e); } } - private Optional<ConsumerRecord<byte[], byte[]>> getConsumerRecord(final Optional<byte[]> key, - final byte[] value) { + private void handleValueData(final Optional<byte[]> key, + final List<ConsumerRecord<byte[], byte[]>> consumerRecordList) throws IOException { + switch (fileFormat) { + case PARQUET_OUTPUT_FORMAT : { + final List<GenericRecord> records = ParquetUtils.getRecords(inputStream, topic); + for (final GenericRecord record : records) { + final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), + topic); + consumerRecordList.add(getConsumerRecord(key, valueBytes)); + } + break; + } + case AVRO_OUTPUT_FORMAT : { + final List<GenericRecord> records = readAvroRecords(inputStream, datumReader); + for (final GenericRecord record : records) { + final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), + topic); + consumerRecordList.add(getConsumerRecord(key, valueBytes)); + } + break; + } + case JSON_OUTPUT_FORMAT : + consumerRecordList.add(getConsumerRecord(key, serializeJsonData(inputStream))); + break; + default : + consumerRecordList.add(getConsumerRecord(key, IOUtils.toByteArray(inputStream))); + break; + } + } + + private ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> key, final byte[] value) { final Map<String, Object> partitionMap = new HashMap<>(); partitionMap.put(BUCKET, bucketName); partitionMap.put(TOPIC, topic); @@ -193,8 +218,8 @@ private Optional<ConsumerRecord<byte[], byte[]>> getConsumerRecord(final Optiona currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); } - final Optional<ConsumerRecord<byte[], byte[]>> record = Optional - .of(new ConsumerRecord<>(topic, topicPartition, currentOffset, key.orElse(null), value)); + final ConsumerRecord<byte[], byte[]> record = new ConsumerRecord<>(topic, topicPartition, currentOffset, + key.orElse(null), value); offsetManager.updateOffset(partitionMap, currentOffset); @@ -203,32 +228,21 @@ private Optional<ConsumerRecord<byte[], byte[]>> getConsumerRecord(final Optiona @Override public boolean hasNext() { - return nextRecord.isPresent(); + return !nextRecord.isEmpty(); } @Override - public Optional<ConsumerRecord<byte[], byte[]>> next() { + public List<ConsumerRecord<byte[], byte[]>> next() { if (nextRecord.isEmpty()) { throw new NoSuchElementException(); } - final Optional<ConsumerRecord<byte[], byte[]>> currentRecord = nextRecord; - nextRecord = Optional.empty(); + final List<ConsumerRecord<byte[], byte[]>> currentRecord = nextRecord; + nextRecord = Collections.emptyList(); return currentRecord; } }; } - private byte[] getValueBytes(final String fileFormat, final InputStream content, - final DatumReader<GenericRecord> datumReader, final String topicName) throws IOException { - if (AVRO_OUTPUT_FORMAT.equals(fileFormat)) { - return serializeAvroRecordToBytes(readAvroRecords(content, datumReader), topicName); - } else if (JSON_OUTPUT_FORMAT.equals(fileFormat)) { - return serializeJsonData(content); - } else { - return IOUtils.toByteArray(content); - } - } - private List<GenericRecord> readAvroRecords(final InputStream content, final DatumReader<GenericRecord> datumReader) throws IOException { final List<GenericRecord> records = new ArrayList<>(); @@ -269,29 +283,38 @@ public 
boolean hasNext() { } @Override - public S3SourceRecord next() { + @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") + public List<AivenS3SourceRecord> next() { if (!recordIterator.hasNext()) { nextS3Object(); } - final Optional<ConsumerRecord<byte[], byte[]>> consumerRecord = recordIterator.next(); - if (consumerRecord.isEmpty()) { + final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = recordIterator.next(); + if (consumerRecordList.isEmpty()) { throw new NoSuchElementException(); } + final List<AivenS3SourceRecord> aivenS3SourceRecordList = new ArrayList<>(); + + AivenS3SourceRecord aivenS3SourceRecord; + Map<String, Object> offsetMap; + Map<String, Object> partitionMap; + for (final ConsumerRecord<byte[], byte[]> currentRecord : consumerRecordList) { + partitionMap = new HashMap<>(); + partitionMap.put(BUCKET, bucketName); + partitionMap.put(TOPIC, currentRecord.topic()); + partitionMap.put(PARTITION, currentRecord.partition()); - final ConsumerRecord<byte[], byte[]> currentRecord = consumerRecord.get(); + // Create the offset map + offsetMap = new HashMap<>(); + offsetMap.put(OFFSET_KEY, currentRecord.offset()); - final Map<String, Object> partitionMap = new HashMap<>(); - partitionMap.put(BUCKET, bucketName); - partitionMap.put(TOPIC, currentRecord.topic()); - partitionMap.put(PARTITION, currentRecord.partition()); + aivenS3SourceRecord = new AivenS3SourceRecord(partitionMap, offsetMap, currentRecord.topic(), + currentRecord.partition(), currentRecord.key(), currentRecord.value()); - // Create the offset map - final Map<String, Object> offsetMap = new HashMap<>(); - offsetMap.put(OFFSET_KEY, currentRecord.offset()); + aivenS3SourceRecordList.add(aivenS3SourceRecord); + } - return new S3SourceRecord(partitionMap, offsetMap, currentRecord.topic(), currentRecord.partition(), - currentRecord.key(), currentRecord.value()); + return aivenS3SourceRecordList; } @Override From 828d0ea9e3d043b998410a556e0a4f8bd13190b4 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Sat, 5 Oct 2024 23:08:59 +0200 Subject: [PATCH 24/90] Refactor with new output writer classes --- .../connect/s3/source/IntegrationTest.java | 12 +- .../kafka/connect/s3/source/S3SourceTask.java | 16 +- .../s3/source/config/S3SourceConfig.java | 4 +- .../connect/s3/source/output/AvroWriter.java | 101 ++++++++ .../s3/source/output/ByteArrayWriter.java | 53 ++++ .../connect/s3/source/output/JsonWriter.java | 62 +++++ .../s3/source/output/OutputWriter.java | 66 +++++ .../s3/source/output/OutputWriterFactory.java | 43 ++++ .../s3/source/output/ParquetWriter.java | 81 ++++++ .../connect/s3/source/utils/ParquetUtils.java | 4 +- .../s3/source/utils/RecordProcessor.java | 31 +-- .../s3/source/utils/SourceRecordIterator.java | 241 +++++++++--------- 12 files changed, 561 insertions(+), 153 deletions(-) create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java diff --git 
a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 924ffdfd0..c3a16c069 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -22,6 +22,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_ENDPOINT_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.BYTE_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.PARQUET_OUTPUT_FORMAT; @@ -154,6 +155,7 @@ void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectRunner.createConnector(connectorConfig); + connectorConfig.put(OUTPUT_FORMAT, BYTE_OUTPUT_FORMAT); final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; final String testData2 = "Hello, Kafka Connect S3 Source! object 2"; @@ -247,8 +249,8 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted connectRunner.createConnector(connectorConfig); final String tmpFilePath = "/tmp/users.parquet"; - final String name1 = "Alice"; - final String name2 = "Bob"; + final String name1 = "testuser1"; + final String name2 = "testuser2"; writeParquetFile(tmpFilePath, name1, name2); final Path path = Paths.get(tmpFilePath); try { @@ -345,7 +347,7 @@ public void multipartUpload(final String bucketName, final String key) { s3Client); InputStream resourceStream = Thread.currentThread() .getContextClassLoader() - .getResourceAsStream(S3_FILE_NAME);) { + .getResourceAsStream(S3_FILE_NAME)) { assert resourceStream != null; final byte[] fileBytes = IOUtils.toByteArray(resourceStream); s3OutputStream.write(fileBytes); @@ -377,12 +379,12 @@ private static void writeParquetFile(final String outputPath, final Schema schem final GenericData.Record user1 = new GenericData.Record(schema); user1.put("name", name1); user1.put("age", 30); - user1.put("email", "alice@example.com"); + user1.put("email", name1 + "@test"); final GenericData.Record user2 = new GenericData.Record(schema); user2.put("name", name2); user2.put("age", 25); - user2.put("email", "bob@example.com"); + user2.put("email", name2 + "@test"); // Create a Parquet writer final org.apache.hadoop.fs.Path path = new org.apache.hadoop.fs.Path(outputPath); // NOPMD diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index d3351de1e..a6e5ab5ac 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -18,6 +18,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; import static 
io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; import java.util.ArrayList; import java.util.Iterator; @@ -33,6 +34,8 @@ import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.output.OutputWriter; +import io.aiven.kafka.connect.s3.source.output.OutputWriterFactory; import io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import io.aiven.kafka.connect.s3.source.utils.RecordProcessor; @@ -66,6 +69,10 @@ public class S3SourceTask extends SourceTask { private Optional<Converter> keyConverter; private Converter valueConverter; + private OutputWriter outputWriter; + + private String s3Bucket; + private final AtomicBoolean connectorStopped = new AtomicBoolean(); private final S3ClientFactory s3ClientFactory = new S3ClientFactory(); @@ -85,6 +92,8 @@ public void start(final Map<String, String> props) { s3SourceConfig = new S3SourceConfig(props); initializeConverters(); initializeS3Client(); + this.s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); + this.outputWriter = OutputWriterFactory.getWriter(s3SourceConfig.getString(OUTPUT_FORMAT), this.s3Bucket); prepareReaderFromOffsetStorageReader(); } @@ -105,9 +114,8 @@ private void initializeS3Client() { private void prepareReaderFromOffsetStorageReader() { final OffsetManager offsetManager = new OffsetManager(context, s3SourceConfig); - - final String s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); - sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, s3Client, s3Bucket, offsetManager); + sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, s3Client, this.s3Bucket, offsetManager, + this.outputWriter); } @Override @@ -134,7 +142,7 @@ private List<SourceRecord> extractSourceRecords(final List<SourceRecord> results return results; } return RecordProcessor.processRecords(sourceRecordIterator, results, s3SourceConfig, keyConverter, - valueConverter, connectorStopped); + valueConverter, connectorStopped, this.outputWriter); } private void waitForObjects() throws InterruptedException { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 4cd4384e1..4fe938bce 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -130,6 +130,8 @@ final public class S3SourceConfig extends AbstractConfig { public static final String JSON_OUTPUT_FORMAT = "json"; + public static final String BYTE_OUTPUT_FORMAT = "bytes"; + public S3SourceConfig(final Map<String, String> properties) { super(configDef(), preprocessProperties(properties)); validate(); // NOPMD ConstructorCallsOverridableMethod getStsRole is called @@ -184,7 +186,7 @@ private static void addSchemaRegistryGroup(final ConfigDef configDef) { configDef.define(SCHEMA_REGISTRY_URL, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", GROUP_AWS, srCounter++, ConfigDef.Width.NONE, SCHEMA_REGISTRY_URL); - configDef.define(OUTPUT_FORMAT, ConfigDef.Type.STRING, "bytearray", new ConfigDef.NonEmptyString(), + 
configDef.define(OUTPUT_FORMAT, ConfigDef.Type.STRING, BYTE_OUTPUT_FORMAT, new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "Output format avro/bytearray", GROUP_AWS, srCounter++, // NOPMD ConfigDef.Width.NONE, OUTPUT_FORMAT); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java new file mode 100644 index 000000000..36fb4ae2f --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java @@ -0,0 +1,101 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.output; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import org.apache.kafka.clients.consumer.ConsumerRecord; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; + +import com.amazonaws.util.IOUtils; +import io.confluent.kafka.serializers.KafkaAvroSerializer; +import org.apache.avro.file.DataFileReader; +import org.apache.avro.file.SeekableByteArrayInput; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumReader; +import org.apache.avro.io.DecoderFactory; + +public class AvroWriter implements OutputWriter { + + private final String bucketName; + + public AvroWriter(final String bucketName) { + this.bucketName = bucketName; + } + + @Override + public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { + config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + } + + @Override + public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, + final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, + final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, + final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) throws IOException { + final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); + DecoderFactory.get().binaryDecoder(inputStream, null); + final List<GenericRecord> records = readAvroRecords(inputStream, datumReader); + for (final GenericRecord record : records) { + final byte[] valueBytes = serializeRecordToBytes(Collections.singletonList(record), topic, s3SourceConfig); + consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, this.bucketName, topic, + topicPartition, offsetManager, currentOffsets, 
startOffset)); + } + } + + private List<GenericRecord> readAvroRecords(final InputStream content, final DatumReader<GenericRecord> datumReader) + throws IOException { + final List<GenericRecord> records = new ArrayList<>(); + try (SeekableByteArrayInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { + try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { + reader.forEach(records::add); + } + } + return records; + } + + @Deprecated + private byte[] serializeRecordToBytes(final List<GenericRecord> avroRecords, final String topic, + final S3SourceConfig s3SourceConfig) throws IOException { + final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, + s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + + try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) + .newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { + avroSerializer.configure(config, false); + for (final GenericRecord avroRecord : avroRecords) { + out.write(avroSerializer.serialize(topic, avroRecord)); + } + return out.toByteArray(); + } catch (InstantiationException | IllegalAccessException e) { + throw new IllegalStateException("Failed to initialize serializer", e); + } + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java new file mode 100644 index 000000000..8b9a33d08 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java @@ -0,0 +1,53 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
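
For reference, the Avro read path used by AvroWriter above can be exercised on its own with just the Apache Avro library; the sketch below mirrors readAvroRecords(...) and assumes the S3 object is a standard Avro container file (the DecoderFactory.get().binaryDecoder(...) call above discards its result, so only the DataFileReader path matters here).

```java
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.SeekableByteArrayInput;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

// Minimal standalone sketch of the Avro read path: the container file carries its own schema,
// so a GenericDatumReader without an explicit schema is enough to materialise every record.
public final class AvroReadSketch {

    private AvroReadSketch() {
    }

    public static List<GenericRecord> readAll(final byte[] avroContainerFileBytes) throws IOException {
        final List<GenericRecord> records = new ArrayList<>();
        try (SeekableByteArrayInput input = new SeekableByteArrayInput(avroContainerFileBytes);
                DataFileReader<GenericRecord> reader = new DataFileReader<>(input, new GenericDatumReader<>())) {
            reader.forEach(records::add);
        }
        return records;
    }
}
```
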
+ */ + +package io.aiven.kafka.connect.s3.source.output; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import org.apache.kafka.clients.consumer.ConsumerRecord; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; + +import com.amazonaws.util.IOUtils; + +public class ByteArrayWriter implements OutputWriter { + + private final String bucketName; + + public ByteArrayWriter(final String bucketName) { + this.bucketName = bucketName; + } + + @Override + public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { + + } + + @Override + public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, + final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, + final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, + final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) throws IOException { + consumerRecordList.add(getConsumerRecord(optionalKeyBytes, IOUtils.toByteArray(inputStream), this.bucketName, + topic, topicPartition, offsetManager, currentOffsets, startOffset)); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java new file mode 100644 index 000000000..0655f1b51 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java @@ -0,0 +1,62 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
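
The byte-array path above is the simplest case: one S3 object becomes one opaque record value. Since the connector targets Java 11, the same copy could also be done with InputStream#readAllBytes() instead of the AWS SDK IOUtils helper; a minimal standalone equivalent:

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

// Minimal sketch: drain one S3 object into a single opaque value, as the "bytes" format does.
public final class BytesReadSketch {

    public static byte[] drain(final InputStream s3ObjectContent) throws IOException {
        return s3ObjectContent.readAllBytes(); // Java 9+, equivalent to IOUtils.toByteArray here
    }

    public static void main(final String[] args) throws IOException {
        final byte[] value = drain(
                new ByteArrayInputStream("Hello, Kafka Connect S3 Source!".getBytes(StandardCharsets.UTF_8)));
        System.out.println(value.length);
    }
}
```
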
+ */ + +package io.aiven.kafka.connect.s3.source.output; + +import static io.aiven.kafka.connect.s3.source.utils.RecordProcessor.SCHEMAS_ENABLE; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import org.apache.kafka.clients.consumer.ConsumerRecord; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; + +public class JsonWriter implements OutputWriter { + + private final String bucketName; + final ObjectMapper objectMapper = new ObjectMapper(); + + public JsonWriter(final String bucketName) { + this.bucketName = bucketName; + } + + @Override + public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { + config.put(SCHEMAS_ENABLE, "false"); + } + + @Override + public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, + final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, + final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, + final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) throws IOException { + consumerRecordList.add(getConsumerRecord(optionalKeyBytes, serializeJsonData(inputStream), this.bucketName, + topic, topicPartition, offsetManager, currentOffsets, startOffset)); + } + + private byte[] serializeJsonData(final InputStream inputStream) throws IOException { + final JsonNode jsonNode = objectMapper.readTree(inputStream); + return objectMapper.writeValueAsBytes(jsonNode); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java new file mode 100644 index 000000000..6e9b87631 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java @@ -0,0 +1,66 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
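
The JSON path above parses the object with Jackson and re-serialises the tree, so a malformed object fails inside the task rather than being handed to the converter; setting SCHEMAS_ENABLE to "false" (a constant defined in RecordProcessor, presumably mapping to the JsonConverter's schemas.enable property) keeps the value schemaless. A standalone sketch of the same round trip:

```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

// Minimal sketch: parse the S3 object as JSON and re-serialise the tree, mirroring JsonWriter.
public final class JsonReadSketch {

    private static final ObjectMapper MAPPER = new ObjectMapper();

    public static byte[] normalize(final byte[] s3ObjectContent) throws IOException {
        final JsonNode tree = MAPPER.readTree(s3ObjectContent);
        return MAPPER.writeValueAsBytes(tree);
    }

    public static void main(final String[] args) throws IOException {
        final byte[] out = normalize("{\"message\":\"This is a test\",\"id\":\"1\"}"
                .getBytes(StandardCharsets.UTF_8));
        System.out.println(new String(out, StandardCharsets.UTF_8));
    }
}
```
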
+ */ + +package io.aiven.kafka.connect.s3.source.output; + +import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.PARTITION; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.TOPIC; + +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import org.apache.kafka.clients.consumer.ConsumerRecord; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; + +public interface OutputWriter { + + void configureValueConverter(Map<String, String> config, S3SourceConfig s3SourceConfig); + void handleValueData(Optional<byte[]> optionalKeyBytes, InputStream inputStream, String topic, + List<ConsumerRecord<byte[], byte[]>> consumerRecordList, S3SourceConfig s3SourceConfig, int topicPartition, + long startOffset, OffsetManager offsetManager, Map<Map<String, Object>, Long> currentOffsets) + throws IOException; + + default ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> key, final byte[] value, + final String bucketName, final String topic, final int topicPartition, final OffsetManager offsetManager, + final Map<Map<String, Object>, Long> currentOffsets, final long startOffset) { + final Map<String, Object> partitionMap = new HashMap<>(); + partitionMap.put(BUCKET, bucketName); + partitionMap.put(TOPIC, topic); + partitionMap.put(PARTITION, topicPartition); + + long currentOffset; + + if (offsetManager.getOffsets().containsKey(partitionMap)) { + currentOffset = offsetManager.getIncrementedOffsetForPartition(partitionMap); + } else { + currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); + } + + final ConsumerRecord<byte[], byte[]> record = new ConsumerRecord<>(topic, topicPartition, currentOffset, + key.orElse(null), value); + + offsetManager.updateOffset(partitionMap, currentOffset); + + return record; + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java new file mode 100644 index 000000000..e0b0d2e2d --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java @@ -0,0 +1,43 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
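
The default getConsumerRecord(...) above is where offsets are assigned: a source partition is keyed by (bucket, topic, partition), and the next offset comes either from the offsets already held by OffsetManager or from the start offset parsed out of the object key. A simplified standalone illustration of that choice follows; the exact semantics of getIncrementedOffsetForPartition are not visible in this patch, so the +1 below is an assumption.

```java
import java.util.HashMap;
import java.util.Map;

// Simplified sketch of the offset choice made in OutputWriter#getConsumerRecord.
// Assumption: "incremented offset" means committed offset + 1; the real OffsetManager may differ.
public final class OffsetChoiceSketch {

    public static long nextOffset(final Map<Map<String, Object>, Long> committedOffsets,
            final Map<Map<String, Object>, Long> inFlightOffsets, final Map<String, Object> partitionKey,
            final long startOffsetFromObjectKey) {
        final Long committed = committedOffsets.get(partitionKey);
        if (committed != null) {
            return committed + 1;
        }
        return inFlightOffsets.getOrDefault(partitionKey, startOffsetFromObjectKey);
    }

    public static void main(final String[] args) {
        final Map<String, Object> partitionKey = new HashMap<>();
        partitionKey.put("bucket", "test-bucket");
        partitionKey.put("topic", "topic-a");
        partitionKey.put("partition", 0);
        System.out.println(nextOffset(new HashMap<>(), new HashMap<>(), partitionKey, 121L)); // 121
    }
}
```
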
+ */ + +package io.aiven.kafka.connect.s3.source.output; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.BYTE_OUTPUT_FORMAT; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.PARQUET_OUTPUT_FORMAT; + +public final class OutputWriterFactory { + + private OutputWriterFactory() { + throw new UnsupportedOperationException("Class cannot be instantiated"); + } + public static OutputWriter getWriter(final String outputFormat, final String bucket) { + switch (outputFormat) { + case AVRO_OUTPUT_FORMAT : + return new AvroWriter(bucket); + case PARQUET_OUTPUT_FORMAT : + return new ParquetWriter(bucket); + case JSON_OUTPUT_FORMAT : + return new JsonWriter(bucket); + case BYTE_OUTPUT_FORMAT : + return new ByteArrayWriter(bucket); + default : + throw new IllegalArgumentException("Unknown output format: " + outputFormat); + } + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java new file mode 100644 index 000000000..407c2ab32 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java @@ -0,0 +1,81 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
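
OutputWriterFactory is the piece that replaces the per-record format switch that previously lived in SourceRecordIterator: the format is resolved to a writer once, in S3SourceTask#start, and the same instance is reused for every object. A simplified, self-contained stand-in for that dispatch (only the "json" and "bytes" format strings are visible in this excerpt; the Avro and Parquet values are assumed):

```java
import java.util.Locale;

// Simplified stand-in for the format-to-writer dispatch: resolve the strategy once at start-up,
// then apply it per S3 object. The real writers also expand Avro/Parquet files into many records.
public final class WriterDispatchSketch {

    @FunctionalInterface
    interface ValueExtractor {
        byte[] extract(byte[] s3ObjectContent) throws Exception;
    }

    static ValueExtractor forFormat(final String outputFormat) {
        switch (outputFormat.toLowerCase(Locale.ROOT)) {
            case "bytes" :
                return content -> content; // pass-through, one record per object
            case "json" :
            case "avro" :
            case "parquet" :
                throw new UnsupportedOperationException("handled by the writers added in this patch");
            default :
                throw new IllegalArgumentException("Unknown output format: " + outputFormat);
        }
    }

    public static void main(final String[] args) throws Exception {
        System.out.println(forFormat("bytes").extract(new byte[] { 1, 2, 3 }).length); // 3
    }
}
```
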
+ */ + +package io.aiven.kafka.connect.s3.source.output; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import org.apache.kafka.clients.consumer.ConsumerRecord; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; +import io.aiven.kafka.connect.s3.source.utils.ParquetUtils; + +import io.confluent.kafka.serializers.KafkaAvroSerializer; +import org.apache.avro.generic.GenericRecord; + +public class ParquetWriter implements OutputWriter { + + private final String bucketName; + + public ParquetWriter(final String bucketName) { + this.bucketName = bucketName; + } + @Override + public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { + config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + } + + @Override + public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, + final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, + final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, + final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) throws IOException { + final List<GenericRecord> records = ParquetUtils.getRecords(inputStream, topic); + for (final GenericRecord record : records) { + final byte[] valueBytes = serializeRecordToBytes(Collections.singletonList(record), topic, s3SourceConfig); + consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, this.bucketName, topic, + topicPartition, offsetManager, currentOffsets, startOffset)); + } + } + + @Deprecated + private byte[] serializeRecordToBytes(final List<GenericRecord> avroRecords, final String topic, + final S3SourceConfig s3SourceConfig) throws IOException { + final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, + s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + + try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) + .newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { + avroSerializer.configure(config, false); + for (final GenericRecord avroRecord : avroRecords) { + out.write(avroSerializer.serialize(topic, avroRecord)); + } + return out.toByteArray(); + } catch (InstantiationException | IllegalAccessException e) { + throw new IllegalStateException("Failed to initialize serializer", e); + } + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java index a3dc9807c..d5046c657 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java @@ -35,7 +35,7 @@ import org.apache.parquet.io.InputFile; import org.apache.parquet.io.SeekableInputStream; -final class ParquetUtils { +public final class ParquetUtils { public static final String TMP_DIR = "/tmp"; public static final int BUFFER_SIZE = 8192; @@ -43,7 +43,7 @@ final class 
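
ParquetWriter above delegates the actual decoding to the ParquetUtils helper (whose visibility is widened in the adjacent hunk). Parquet needs a seekable, length-aware input, which is presumably why ParquetUtils spools the stream to a temporary file under TMP_DIR before reading. A standalone sketch of that read side, assuming parquet-avro together with the hadoop-common classes it is usually paired with (the connector's own ParquetUtils appears to use a custom InputFile instead):

```java
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

// Standalone sketch of reading a Parquet file into Avro GenericRecords. Parquet requires a
// seekable input, hence reading from a local file path rather than the raw S3 InputStream.
public final class ParquetReadSketch {

    public static List<GenericRecord> readAll(final String localParquetFilePath) throws IOException {
        final List<GenericRecord> records = new ArrayList<>();
        final HadoopInputFile inputFile = HadoopInputFile
                .fromPath(new Path(localParquetFilePath), new Configuration());
        try (ParquetReader<GenericRecord> reader = AvroParquetReader.<GenericRecord>builder(inputFile).build()) {
            GenericRecord record;
            while ((record = reader.read()) != null) {
                records.add(record);
            }
        }
        return records;
    }
}
```
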
ParquetUtils { private ParquetUtils() { /* hide constructor */ } - static List<GenericRecord> getRecords(final InputStream inputStream, final String topic) throws IOException { + public static List<GenericRecord> getRecords(final InputStream inputStream, final String topic) throws IOException { final Path tmpDir = Paths.get(TMP_DIR); final String timestamp = String.valueOf(Instant.now().toEpochMilli()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 871c1cd69..23e1f4e13 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -16,11 +16,6 @@ package io.aiven.kafka.connect.s3.source.utils; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.PARQUET_OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; - import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; @@ -34,6 +29,7 @@ import org.apache.kafka.connect.storage.Converter; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.output.OutputWriter; public final class RecordProcessor { @@ -45,7 +41,7 @@ private RecordProcessor() { public static List<SourceRecord> processRecords(final Iterator<List<AivenS3SourceRecord>> sourceRecordIterator, final List<SourceRecord> results, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, - final AtomicBoolean connectorStopped) { + final AtomicBoolean connectorStopped, final OutputWriter outputWriter) { final Map<String, String> conversionConfig = new HashMap<>(); final int maxPollRecords = s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS); @@ -53,7 +49,7 @@ public static List<SourceRecord> processRecords(final Iterator<List<AivenS3Sourc for (int i = 0; sourceRecordIterator.hasNext() && i < maxPollRecords && !connectorStopped.get(); i++) { final List<AivenS3SourceRecord> recordList = sourceRecordIterator.next(); final List<SourceRecord> sourceRecords = createSourceRecords(recordList, s3SourceConfig, keyConverter, - valueConverter, conversionConfig); + valueConverter, conversionConfig, outputWriter); results.addAll(sourceRecords); } @@ -63,7 +59,7 @@ public static List<SourceRecord> processRecords(final Iterator<List<AivenS3Sourc @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") private static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRecord> aivenS3SourceRecordList, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, - final Map<String, String> conversionConfig) { + final Map<String, String> conversionConfig, final OutputWriter outputWriter) { final List<SourceRecord> sourceRecordList = new ArrayList<>(); for (final AivenS3SourceRecord aivenS3SourceRecord : aivenS3SourceRecordList) { @@ -71,8 +67,7 @@ private static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRe final Optional<SchemaAndValue> keyData = keyConverter .map(c -> c.toConnectData(topic, aivenS3SourceRecord.key())); - 
configureValueConverter(s3SourceConfig.getString(S3SourceConfig.OUTPUT_FORMAT), conversionConfig, - s3SourceConfig); + outputWriter.configureValueConverter(conversionConfig, s3SourceConfig); valueConverter.configure(conversionConfig, false); final SchemaAndValue value = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); @@ -86,12 +81,12 @@ private static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRe return sourceRecordList; } - private static void configureValueConverter(final String outputFormat, final Map<String, String> config, - final S3SourceConfig s3SourceConfig) { - if (AVRO_OUTPUT_FORMAT.equals(outputFormat) || PARQUET_OUTPUT_FORMAT.equals(outputFormat)) { - config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); - } else if (JSON_OUTPUT_FORMAT.equals(outputFormat)) { - config.put(SCHEMAS_ENABLE, "false"); - } - } + // private static void configureValueConverter(final String outputFormat, final Map<String, String> config, + // final S3SourceConfig s3SourceConfig) { + // if (AVRO_OUTPUT_FORMAT.equals(outputFormat) || PARQUET_OUTPUT_FORMAT.equals(outputFormat)) { + // config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + // } else if (JSON_OUTPUT_FORMAT.equals(outputFormat)) { + // config.put(SCHEMAS_ENABLE, "false"); + // } + // } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index d69b2601c..abe04265a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -19,14 +19,7 @@ import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET; import static io.aiven.kafka.connect.s3.source.S3SourceTask.PARTITION; import static io.aiven.kafka.connect.s3.source.S3SourceTask.TOPIC; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.PARQUET_OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; - -import java.io.ByteArrayOutputStream; + import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; @@ -44,21 +37,13 @@ import org.apache.kafka.clients.consumer.ConsumerRecord; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.output.OutputWriter; import com.amazonaws.AmazonClientException; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectSummary; -import com.amazonaws.util.IOUtils; -import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import io.confluent.kafka.serializers.KafkaAvroSerializer; -import org.apache.avro.file.DataFileReader; -import org.apache.avro.file.SeekableByteArrayInput; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.DatumReader; -import 
org.apache.avro.io.DecoderFactory; /** * Iterator that processes S3 files and creates Kafka source records. Supports different output formats (Avro, JSON, @@ -86,12 +71,15 @@ public final class SourceRecordIterator implements Iterator<List<AivenS3SourceRe private final FileReader fileReader; + private final OutputWriter outputWriter; + public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, - final OffsetManager offsetManager) { + final OffsetManager offsetManager, final OutputWriter outputWriter) { this.s3SourceConfig = s3SourceConfig; this.offsetManager = offsetManager; this.s3Client = s3Client; this.bucketName = bucketName; + this.outputWriter = outputWriter; this.fileReader = new FileReader(s3SourceConfig, bucketName); try { final List<S3ObjectSummary> chunks = fileReader.fetchObjectSummaries(s3Client); @@ -118,7 +106,7 @@ private void nextS3Object() { private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentFile() throws IOException { final S3Object s3Object = s3Client.getObject(bucketName, currentKey); - try (InputStream content = fileReader.getContent(s3Object)) { + try (InputStream inputStream = fileReader.getContent(s3Object)) { final Matcher matcher = DEFAULT_PATTERN.matcher(currentKey); String topic = null; int partition = 0; @@ -133,38 +121,40 @@ private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentF final int finalPartition = partition; final long finalStartOffset = startOffset; - switch (s3SourceConfig.getString(OUTPUT_FORMAT)) { - case AVRO_OUTPUT_FORMAT : - final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); - DecoderFactory.get().binaryDecoder(content, null); - return getObjectIterator(content, finalTopic, finalPartition, finalStartOffset, datumReader, - AVRO_OUTPUT_FORMAT); - case PARQUET_OUTPUT_FORMAT : - return getObjectIterator(content, finalTopic, finalPartition, finalStartOffset, null, - PARQUET_OUTPUT_FORMAT); - case JSON_OUTPUT_FORMAT : - return getObjectIterator(content, finalTopic, finalPartition, finalStartOffset, null, - JSON_OUTPUT_FORMAT); - default : - return getObjectIterator(content, finalTopic, finalPartition, finalStartOffset, null, ""); - } + return getObjectIterator(inputStream, finalTopic, finalPartition, finalStartOffset, outputWriter); + + // switch (s3SourceConfig.getString(OUTPUT_FORMAT)) { + // case AVRO_OUTPUT_FORMAT : + // + // return getObjectIterator(inputStream, finalTopic, finalPartition, finalStartOffset, datumReader, + // AVRO_OUTPUT_FORMAT); + // case PARQUET_OUTPUT_FORMAT : + // return getObjectIterator(inputStream, finalTopic, finalPartition, finalStartOffset, null, + // PARQUET_OUTPUT_FORMAT); + // case JSON_OUTPUT_FORMAT : + // return getObjectIterator(inputStream, finalTopic, finalPartition, finalStartOffset, null, + // JSON_OUTPUT_FORMAT); + // default : + // return getObjectIterator(inputStream, finalTopic, finalPartition, finalStartOffset, null, ""); + // } } } @SuppressWarnings("PMD.CognitiveComplexity") - private Iterator<List<ConsumerRecord<byte[], byte[]>>> getObjectIterator(final InputStream inputStream, - final String topic, final int topicPartition, final long startOffset, - final DatumReader<GenericRecord> datumReader, final String fileFormat) { + private Iterator<List<ConsumerRecord<byte[], byte[]>>> getObjectIterator(final InputStream valueInputStream, + final String topic, final int topicPartition, final long startOffset, final OutputWriter outputWriter) { return new Iterator<>() { private 
Map<Map<String, Object>, Long> currentOffsets = new HashMap<>(); private List<ConsumerRecord<byte[], byte[]>> nextRecord = readNext(); private List<ConsumerRecord<byte[], byte[]>> readNext() { try { - final Optional<byte[]> key = Optional.ofNullable(currentKey) + final Optional<byte[]> optionalKeyBytes = Optional.ofNullable(currentKey) .map(k -> k.getBytes(StandardCharsets.UTF_8)); final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = new ArrayList<>(); - handleValueData(key, consumerRecordList); + // handleValueData(optionalKeyBytes, consumerRecordList); + outputWriter.handleValueData(optionalKeyBytes, valueInputStream, topic, consumerRecordList, + s3SourceConfig, topicPartition, startOffset, offsetManager, currentOffsets); return consumerRecordList; @@ -174,57 +164,61 @@ private List<ConsumerRecord<byte[], byte[]>> readNext() { } } - private void handleValueData(final Optional<byte[]> key, - final List<ConsumerRecord<byte[], byte[]>> consumerRecordList) throws IOException { - switch (fileFormat) { - case PARQUET_OUTPUT_FORMAT : { - final List<GenericRecord> records = ParquetUtils.getRecords(inputStream, topic); - for (final GenericRecord record : records) { - final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), - topic); - consumerRecordList.add(getConsumerRecord(key, valueBytes)); - } - break; - } - case AVRO_OUTPUT_FORMAT : { - final List<GenericRecord> records = readAvroRecords(inputStream, datumReader); - for (final GenericRecord record : records) { - final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), - topic); - consumerRecordList.add(getConsumerRecord(key, valueBytes)); - } - break; - } - case JSON_OUTPUT_FORMAT : - consumerRecordList.add(getConsumerRecord(key, serializeJsonData(inputStream))); - break; - default : - consumerRecordList.add(getConsumerRecord(key, IOUtils.toByteArray(inputStream))); - break; - } - } - - private ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> key, final byte[] value) { - final Map<String, Object> partitionMap = new HashMap<>(); - partitionMap.put(BUCKET, bucketName); - partitionMap.put(TOPIC, topic); - partitionMap.put(PARTITION, topicPartition); - - long currentOffset; - - if (offsetManager.getOffsets().containsKey(partitionMap)) { - currentOffset = offsetManager.getIncrementedOffsetForPartition(partitionMap); - } else { - currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); - } - - final ConsumerRecord<byte[], byte[]> record = new ConsumerRecord<>(topic, topicPartition, currentOffset, - key.orElse(null), value); - - offsetManager.updateOffset(partitionMap, currentOffset); - - return record; - } + // private void handleValueData(final Optional<byte[]> key, + // final List<ConsumerRecord<byte[], byte[]>> consumerRecordList) throws IOException { + // + // switch (fileFormat) { + // case PARQUET_OUTPUT_FORMAT : { + // final List<GenericRecord> records = ParquetUtils.getRecords(valueInputStream, topic); + // for (final GenericRecord record : records) { + // final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), + // topic); + // consumerRecordList.add(getConsumerRecord(key, valueBytes)); + // } + // break; + // } + // case AVRO_OUTPUT_FORMAT : { + // final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); + // DecoderFactory.get().binaryDecoder(valueInputStream, null); + // final List<GenericRecord> records = readAvroRecords(valueInputStream, datumReader); + // for (final 
GenericRecord record : records) { + // final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), + // topic); + // consumerRecordList.add(getConsumerRecord(key, valueBytes)); + // } + // break; + // } + // case JSON_OUTPUT_FORMAT : + // consumerRecordList.add(getConsumerRecord(key, serializeJsonData(valueInputStream))); + // break; + // default : + // consumerRecordList.add(getConsumerRecord(key, IOUtils.toByteArray(valueInputStream))); + // break; + // } + // } + + // private ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> key, final byte[] value) + // { + // final Map<String, Object> partitionMap = new HashMap<>(); + // partitionMap.put(BUCKET, bucketName); + // partitionMap.put(TOPIC, topic); + // partitionMap.put(PARTITION, topicPartition); + // + // long currentOffset; + // + // if (offsetManager.getOffsets().containsKey(partitionMap)) { + // currentOffset = offsetManager.getIncrementedOffsetForPartition(partitionMap); + // } else { + // currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); + // } + // + // final ConsumerRecord<byte[], byte[]> record = new ConsumerRecord<>(topic, topicPartition, currentOffset, + // key.orElse(null), value); + // + // offsetManager.updateOffset(partitionMap, currentOffset); + // + // return record; + // } @Override public boolean hasNext() { @@ -243,39 +237,40 @@ public List<ConsumerRecord<byte[], byte[]>> next() { }; } - private List<GenericRecord> readAvroRecords(final InputStream content, final DatumReader<GenericRecord> datumReader) - throws IOException { - final List<GenericRecord> records = new ArrayList<>(); - try (SeekableByteArrayInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { - try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { - reader.forEach(records::add); - } - } - return records; - } - - private byte[] serializeJsonData(final InputStream inputStream) throws IOException { - final JsonNode jsonNode = objectMapper.readTree(inputStream); - return objectMapper.writeValueAsBytes(jsonNode); - } - - @Deprecated - private byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String topic) - throws IOException { - final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, - s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); - - try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) - .newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { - avroSerializer.configure(config, false); - for (final GenericRecord avroRecord : avroRecords) { - out.write(avroSerializer.serialize(topic, avroRecord)); - } - return out.toByteArray(); - } catch (InstantiationException | IllegalAccessException e) { - throw new IllegalStateException("Failed to initialize serializer", e); - } - } + // private List<GenericRecord> readAvroRecords(final InputStream content, final DatumReader<GenericRecord> + // datumReader) + // throws IOException { + // final List<GenericRecord> records = new ArrayList<>(); + // try (SeekableByteArrayInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { + // try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { + // reader.forEach(records::add); + // } + // } + // return records; + // } + + // private byte[] serializeJsonData(final InputStream inputStream) throws IOException { + // final JsonNode jsonNode = objectMapper.readTree(inputStream); + // return 
objectMapper.writeValueAsBytes(jsonNode); + // } + + // @Deprecated + // private byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String topic) + // throws IOException { + // final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, + // s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + // + // try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) + // .newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { + // avroSerializer.configure(config, false); + // for (final GenericRecord avroRecord : avroRecords) { + // out.write(avroSerializer.serialize(topic, avroRecord)); + // } + // return out.toByteArray(); + // } catch (InstantiationException | IllegalAccessException e) { + // throw new IllegalStateException("Failed to initialize serializer", e); + // } + // } @Override public boolean hasNext() { From 4dabbf629de5b5845cf91bdf687b98a6e8271845 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Sat, 5 Oct 2024 23:57:42 +0200 Subject: [PATCH 25/90] Topic derive if not found --- .../connect/s3/source/output/AvroWriter.java | 24 +--- .../s3/source/output/OutputWriter.java | 25 ++++ .../s3/source/output/ParquetWriter.java | 24 +--- .../s3/source/utils/OffsetManager.java | 8 ++ .../s3/source/utils/RecordProcessor.java | 9 -- .../s3/source/utils/SourceRecordIterator.java | 111 +----------------- 6 files changed, 40 insertions(+), 161 deletions(-) diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java index 36fb4ae2f..666bad98f 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java @@ -17,9 +17,7 @@ package io.aiven.kafka.connect.s3.source.output; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; @@ -34,7 +32,6 @@ import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import com.amazonaws.util.IOUtils; -import io.confluent.kafka.serializers.KafkaAvroSerializer; import org.apache.avro.file.DataFileReader; import org.apache.avro.file.SeekableByteArrayInput; import org.apache.avro.generic.GenericDatumReader; @@ -64,7 +61,8 @@ public void handleValueData(final Optional<byte[]> optionalKeyBytes, final Input DecoderFactory.get().binaryDecoder(inputStream, null); final List<GenericRecord> records = readAvroRecords(inputStream, datumReader); for (final GenericRecord record : records) { - final byte[] valueBytes = serializeRecordToBytes(Collections.singletonList(record), topic, s3SourceConfig); + final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), topic, + s3SourceConfig); consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, this.bucketName, topic, topicPartition, offsetManager, currentOffsets, startOffset)); } @@ -80,22 +78,4 @@ private List<GenericRecord> readAvroRecords(final InputStream content, final Dat } return records; } - - @Deprecated - private byte[] serializeRecordToBytes(final List<GenericRecord> avroRecords, final String topic, - final S3SourceConfig 
s3SourceConfig) throws IOException { - final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, - s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); - - try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) - .newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { - avroSerializer.configure(config, false); - for (final GenericRecord avroRecord : avroRecords) { - out.write(avroSerializer.serialize(topic, avroRecord)); - } - return out.toByteArray(); - } catch (InstantiationException | IllegalAccessException e) { - throw new IllegalStateException("Failed to initialize serializer", e); - } - } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java index 6e9b87631..97fef9bef 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java @@ -19,9 +19,13 @@ import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET; import static io.aiven.kafka.connect.s3.source.S3SourceTask.PARTITION; import static io.aiven.kafka.connect.s3.source.S3SourceTask.TOPIC; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -32,6 +36,9 @@ import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; +import io.confluent.kafka.serializers.KafkaAvroSerializer; +import org.apache.avro.generic.GenericRecord; + public interface OutputWriter { void configureValueConverter(Map<String, String> config, S3SourceConfig s3SourceConfig); @@ -63,4 +70,22 @@ default ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> return record; } + + @Deprecated + default byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String topic, + final S3SourceConfig s3SourceConfig) throws IOException { + final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, + s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + + try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) + .newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { + avroSerializer.configure(config, false); + for (final GenericRecord avroRecord : avroRecords) { + out.write(avroSerializer.serialize(topic, avroRecord)); + } + return out.toByteArray(); + } catch (InstantiationException | IllegalAccessException e) { + throw new IllegalStateException("Failed to initialize serializer", e); + } + } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java index 407c2ab32..4276808f8 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java @@ -17,9 +17,7 @@ package io.aiven.kafka.connect.s3.source.output; import 
static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.Collections; @@ -33,7 +31,6 @@ import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import io.aiven.kafka.connect.s3.source.utils.ParquetUtils; -import io.confluent.kafka.serializers.KafkaAvroSerializer; import org.apache.avro.generic.GenericRecord; public class ParquetWriter implements OutputWriter { @@ -55,27 +52,10 @@ public void handleValueData(final Optional<byte[]> optionalKeyBytes, final Input final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) throws IOException { final List<GenericRecord> records = ParquetUtils.getRecords(inputStream, topic); for (final GenericRecord record : records) { - final byte[] valueBytes = serializeRecordToBytes(Collections.singletonList(record), topic, s3SourceConfig); + final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), topic, + s3SourceConfig); consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, this.bucketName, topic, topicPartition, offsetManager, currentOffsets, startOffset)); } } - - @Deprecated - private byte[] serializeRecordToBytes(final List<GenericRecord> avroRecords, final String topic, - final S3SourceConfig s3SourceConfig) throws IOException { - final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, - s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); - - try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) - .newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { - avroSerializer.configure(config, false); - for (final GenericRecord avroRecord : avroRecords) { - out.write(avroSerializer.serialize(topic, avroRecord)); - } - return out.toByteArray(); - } catch (InstantiationException | IllegalAccessException e) { - throw new IllegalStateException("Failed to initialize serializer", e); - } - } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index 3343702c2..f499313e4 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -22,6 +22,7 @@ import static io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator.OFFSET_KEY; import static java.util.stream.Collectors.toMap; +import java.net.ConnectException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -112,6 +113,13 @@ private static Set<String> parseTopics(final S3SourceConfig s3SourceConfig) { return Arrays.stream(topicString.split(",")).collect(Collectors.toSet()); } + String getFirstConfiguredTopic(final S3SourceConfig s3SourceConfig) throws ConnectException { + final String topicString = s3SourceConfig.getString(S3SourceConfig.TARGET_TOPICS); + return Arrays.stream(topicString.split(",")) + .findFirst() + .orElseThrow(() -> new ConnectException("Topic could not be derived")); + } + /** * Builds partition keys to be used for offset retrieval. 
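
The getFirstConfiguredTopic(...) helper added here backs the fallback used just below in SourceRecordIterator: when an object key does not match the expected naming pattern, the topic is taken from the first entry of the configured topic list. Note that the import added here is java.net.ConnectException, which compiles but may not be the intended org.apache.kafka.connect.errors.ConnectException. A standalone sketch of the parse-or-fallback behaviour; the regular expression below is illustrative only, since the connector's real DEFAULT_PATTERN is defined elsewhere and not shown in this patch.

```java
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Sketch: derive the topic from the object key when possible, otherwise fall back to the first
// configured topic. The key pattern here is an assumption modelled on the integration tests'
// "<topic>-<partition>-<offset>.txt" naming, not the connector's actual DEFAULT_PATTERN.
public final class TopicDeriveSketch {

    private static final Pattern KEY_PATTERN = Pattern
            .compile("(?<topicName>[^/]+)-(?<partitionId>\\d{5})-(?<offsetId>\\d+)\\.txt$");

    static String deriveTopic(final String objectKey, final String targetTopicsConfig) {
        final Matcher matcher = KEY_PATTERN.matcher(objectKey);
        if (matcher.find()) {
            return matcher.group("topicName");
        }
        return Arrays.stream(targetTopicsConfig.split(","))
                .findFirst()
                .orElseThrow(() -> new IllegalStateException("Topic could not be derived"));
    }

    public static void main(final String[] args) {
        System.out.println(deriveTopic("topic-a-00001-000000000121.txt", "topic-a,topic-b")); // topic-a
        System.out.println(deriveTopic("some/other/key.bin", "topic-a,topic-b"));             // topic-a
    }
}
```
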
* diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 23e1f4e13..fad361049 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -80,13 +80,4 @@ private static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRe return sourceRecordList; } - - // private static void configureValueConverter(final String outputFormat, final Map<String, String> config, - // final S3SourceConfig s3SourceConfig) { - // if (AVRO_OUTPUT_FORMAT.equals(outputFormat) || PARQUET_OUTPUT_FORMAT.equals(outputFormat)) { - // config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); - // } else if (JSON_OUTPUT_FORMAT.equals(outputFormat)) { - // config.put(SCHEMAS_ENABLE, "false"); - // } - // } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index abe04265a..1c958e2a4 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -108,13 +108,15 @@ private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentF final S3Object s3Object = s3Client.getObject(bucketName, currentKey); try (InputStream inputStream = fileReader.getContent(s3Object)) { final Matcher matcher = DEFAULT_PATTERN.matcher(currentKey); - String topic = null; + String topic; int partition = 0; long startOffset = 0L; if (matcher.find()) { topic = matcher.group(PATTERN_TOPIC_KEY); partition = Integer.parseInt(matcher.group(PATTERN_PARTITION_KEY)); startOffset = Long.parseLong(matcher.group(OFFSET_KEY)); + } else { + topic = offsetManager.getFirstConfiguredTopic(s3SourceConfig); } final String finalTopic = topic; @@ -122,21 +124,6 @@ private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentF final long finalStartOffset = startOffset; return getObjectIterator(inputStream, finalTopic, finalPartition, finalStartOffset, outputWriter); - - // switch (s3SourceConfig.getString(OUTPUT_FORMAT)) { - // case AVRO_OUTPUT_FORMAT : - // - // return getObjectIterator(inputStream, finalTopic, finalPartition, finalStartOffset, datumReader, - // AVRO_OUTPUT_FORMAT); - // case PARQUET_OUTPUT_FORMAT : - // return getObjectIterator(inputStream, finalTopic, finalPartition, finalStartOffset, null, - // PARQUET_OUTPUT_FORMAT); - // case JSON_OUTPUT_FORMAT : - // return getObjectIterator(inputStream, finalTopic, finalPartition, finalStartOffset, null, - // JSON_OUTPUT_FORMAT); - // default : - // return getObjectIterator(inputStream, finalTopic, finalPartition, finalStartOffset, null, ""); - // } } } @@ -152,7 +139,6 @@ private List<ConsumerRecord<byte[], byte[]>> readNext() { final Optional<byte[]> optionalKeyBytes = Optional.ofNullable(currentKey) .map(k -> k.getBytes(StandardCharsets.UTF_8)); final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = new ArrayList<>(); - // handleValueData(optionalKeyBytes, consumerRecordList); outputWriter.handleValueData(optionalKeyBytes, valueInputStream, topic, consumerRecordList, s3SourceConfig, topicPartition, startOffset, 
offsetManager, currentOffsets); @@ -164,62 +150,6 @@ private List<ConsumerRecord<byte[], byte[]>> readNext() { } } - // private void handleValueData(final Optional<byte[]> key, - // final List<ConsumerRecord<byte[], byte[]>> consumerRecordList) throws IOException { - // - // switch (fileFormat) { - // case PARQUET_OUTPUT_FORMAT : { - // final List<GenericRecord> records = ParquetUtils.getRecords(valueInputStream, topic); - // for (final GenericRecord record : records) { - // final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), - // topic); - // consumerRecordList.add(getConsumerRecord(key, valueBytes)); - // } - // break; - // } - // case AVRO_OUTPUT_FORMAT : { - // final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); - // DecoderFactory.get().binaryDecoder(valueInputStream, null); - // final List<GenericRecord> records = readAvroRecords(valueInputStream, datumReader); - // for (final GenericRecord record : records) { - // final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), - // topic); - // consumerRecordList.add(getConsumerRecord(key, valueBytes)); - // } - // break; - // } - // case JSON_OUTPUT_FORMAT : - // consumerRecordList.add(getConsumerRecord(key, serializeJsonData(valueInputStream))); - // break; - // default : - // consumerRecordList.add(getConsumerRecord(key, IOUtils.toByteArray(valueInputStream))); - // break; - // } - // } - - // private ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> key, final byte[] value) - // { - // final Map<String, Object> partitionMap = new HashMap<>(); - // partitionMap.put(BUCKET, bucketName); - // partitionMap.put(TOPIC, topic); - // partitionMap.put(PARTITION, topicPartition); - // - // long currentOffset; - // - // if (offsetManager.getOffsets().containsKey(partitionMap)) { - // currentOffset = offsetManager.getIncrementedOffsetForPartition(partitionMap); - // } else { - // currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); - // } - // - // final ConsumerRecord<byte[], byte[]> record = new ConsumerRecord<>(topic, topicPartition, currentOffset, - // key.orElse(null), value); - // - // offsetManager.updateOffset(partitionMap, currentOffset); - // - // return record; - // } - @Override public boolean hasNext() { return !nextRecord.isEmpty(); @@ -237,41 +167,6 @@ public List<ConsumerRecord<byte[], byte[]>> next() { }; } - // private List<GenericRecord> readAvroRecords(final InputStream content, final DatumReader<GenericRecord> - // datumReader) - // throws IOException { - // final List<GenericRecord> records = new ArrayList<>(); - // try (SeekableByteArrayInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { - // try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { - // reader.forEach(records::add); - // } - // } - // return records; - // } - - // private byte[] serializeJsonData(final InputStream inputStream) throws IOException { - // final JsonNode jsonNode = objectMapper.readTree(inputStream); - // return objectMapper.writeValueAsBytes(jsonNode); - // } - - // @Deprecated - // private byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String topic) - // throws IOException { - // final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, - // s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); - // - // try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) - // 
.newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { - // avroSerializer.configure(config, false); - // for (final GenericRecord avroRecord : avroRecords) { - // out.write(avroSerializer.serialize(topic, avroRecord)); - // } - // return out.toByteArray(); - // } catch (InstantiationException | IllegalAccessException e) { - // throw new IllegalStateException("Failed to initialize serializer", e); - // } - // } - @Override public boolean hasNext() { return recordIterator.hasNext() || nextFileIterator.hasNext(); From f9aed28a69424774ba8f9fa75d15b6f4c8073f6c Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Sun, 6 Oct 2024 13:21:19 +0200 Subject: [PATCH 26/90] Add enum for output formats --- .../connect/s3/source/IntegrationTest.java | 40 +++++----- .../kafka/connect/s3/source/S3SourceTask.java | 4 +- .../s3/source/config/S3SourceConfig.java | 74 ++++++------------- .../connect/s3/source/output/JsonWriter.java | 2 +- .../s3/source/output/OutputFormat.java | 45 +++++++++++ .../s3/source/output/OutputWriterFactory.java | 18 ++--- .../s3/source/utils/OffsetManager.java | 45 ----------- .../s3/source/utils/RecordProcessor.java | 2 - .../s3/source/utils/SourceRecordIterator.java | 46 ++++++------ 9 files changed, 118 insertions(+), 158 deletions(-) create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index c3a16c069..3ad82e761 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -16,16 +16,12 @@ package io.aiven.kafka.connect.s3.source; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_ACCESS_KEY_ID_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_ENDPOINT_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.BYTE_OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.PARQUET_OUTPUT_FORMAT; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT_KEY; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; @@ -50,6 +46,7 @@ import org.apache.kafka.clients.admin.AdminClient; +import io.aiven.kafka.connect.s3.source.output.OutputFormat; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.testutils.S3OutputStream; @@ -150,19 +147,19 @@ void tearDown() { } @Test - void 
basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { + void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectRunner.createConnector(connectorConfig); - connectorConfig.put(OUTPUT_FORMAT, BYTE_OUTPUT_FORMAT); + connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.BYTES.getFormat()); final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; final String testData2 = "Hello, Kafka Connect S3 Source! object 2"; // write 2 objects to s3 - writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), 1); - writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), 2); + writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00001"); + writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00002"); final List<String> objects = testBucketAccessor.listObjects(); assertThat(objects.size()).isEqualTo(2); @@ -178,14 +175,13 @@ void basicTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx } @Test - void multiPartUploadTest(final TestInfo testInfo) throws ExecutionException, InterruptedException { + void multiPartUploadBytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectRunner.createConnector(connectorConfig); final String partition = "00001"; - final String offset = "000000000121"; - final String key = topicName + "-" + partition + "-" + offset + ".txt"; + final String key = topicName + "-" + partition + ".txt"; multipartUpload(TEST_BUCKET_NAME, key); // Poll messages from the Kafka topic and verify the consumed data final List<String> records = IntegrationBase.consumeMessages(topicName, 1, KAFKA_CONTAINER); @@ -196,7 +192,7 @@ void multiPartUploadTest(final TestInfo testInfo) throws ExecutionException, Int void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectorConfig.put(OUTPUT_FORMAT, AVRO_OUTPUT_FORMAT); + connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.AVRO.getFormat()); connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); connectorConfig.put("value.converter.schema.registry.url", SCHEMA_REGISTRY.getSchemaRegistryUrl()); @@ -214,8 +210,8 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc final ByteArrayOutputStream outputStream1 = getAvroRecord(schema, 1); final ByteArrayOutputStream outputStream2 = getAvroRecord(schema, 2); - writeToS3(topicName, outputStream1.toByteArray(), 1); - writeToS3(topicName, outputStream2.toByteArray(), 2); + writeToS3(topicName, outputStream1.toByteArray(), "00001"); + writeToS3(topicName, outputStream2.toByteArray(), "00002"); final List<String> objects = testBucketAccessor.listObjects(); assertThat(objects.size()).isEqualTo(2); @@ -238,7 +234,7 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc void parquetTest(final TestInfo testInfo) throws 
ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectorConfig.put(OUTPUT_FORMAT, PARQUET_OUTPUT_FORMAT); + connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.PARQUET.getFormat()); connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); connectorConfig.put("value.converter.schema.registry.url", SCHEMA_REGISTRY.getSchemaRegistryUrl()); @@ -269,13 +265,13 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted void jsonTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectorConfig.put(OUTPUT_FORMAT, JSON_OUTPUT_FORMAT); + connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.JSON.getFormat()); connectorConfig.put("value.converter", "org.apache.kafka.connect.json.JsonConverter"); connectRunner.createConnector(connectorConfig); final String testMessage = "This is a test"; final String jsonContent = "{\"message\": \"" + testMessage + "\", \"id\":\"1\"}"; - writeToS3(topicName, jsonContent.getBytes(StandardCharsets.UTF_8), 7); + writeToS3(topicName, jsonContent.getBytes(StandardCharsets.UTF_8), "00001"); // Poll Json messages from the Kafka topic and deserialize them final List<JsonNode> records = IntegrationBase.consumeJsonMessages(topicName, 1, KAFKA_CONTAINER); @@ -302,11 +298,11 @@ private static ByteArrayOutputStream getAvroRecord(final Schema schema, final in return outputStream; } - private static void writeToS3(final String topicName, final byte[] testDataBytes, final int offsetId) + private static void writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) throws IOException { - final String partition = "00000"; - final String offset = "00000000012" + offsetId; - final String fileName = topicName + "-" + partition + "-" + offset + ".txt"; + // final String partition = "00000"; + // final String offset = "00000000012" + offsetId; + final String fileName = topicName + "-" + partitionId + ".txt"; final Path testFilePath = Paths.get("/tmp/" + fileName); Files.write(testFilePath, testDataBytes); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index a6e5ab5ac..3227f8e62 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -18,7 +18,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT_KEY; import java.util.ArrayList; import java.util.Iterator; @@ -93,7 +93,7 @@ public void start(final Map<String, String> props) { initializeConverters(); initializeS3Client(); this.s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); - this.outputWriter = 
OutputWriterFactory.getWriter(s3SourceConfig.getString(OUTPUT_FORMAT), this.s3Bucket); + this.outputWriter = OutputWriterFactory.getWriter(s3SourceConfig.getString(OUTPUT_FORMAT_KEY), this.s3Bucket); prepareReaderFromOffsetStorageReader(); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 4fe938bce..51be2bc7b 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -30,6 +30,7 @@ import io.aiven.kafka.connect.common.config.validators.NonEmptyPassword; import io.aiven.kafka.connect.common.config.validators.UrlValidator; +import io.aiven.kafka.connect.s3.source.output.OutputFormat; import com.amazonaws.auth.AWSCredentialsProvider; import com.amazonaws.regions.Region; @@ -47,6 +48,21 @@ final public class S3SourceConfig extends AbstractConfig { @Deprecated public static final String AWS_ACCESS_KEY_ID = "aws_access_key_id"; + @Deprecated + public static final String AWS_S3_ENDPOINT = "aws_s3_endpoint"; + + @Deprecated + public static final String AWS_S3_REGION = "aws_s3_region"; + + @Deprecated + public static final String AWS_SECRET_ACCESS_KEY = "aws_secret_access_key"; + + @Deprecated + public static final String AWS_S3_PREFIX_CONFIG = "aws.s3.prefix"; + + @Deprecated + public static final String AWS_S3_PREFIX = "aws_s3_prefix"; + public static final String AWS_S3_RETRY_BACKOFF_DELAY_MS_CONFIG = "aws.s3.backoff.delay.ms"; public static final String AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_CONFIG = "aws.s3.backoff.max.delay.ms"; @@ -57,80 +73,37 @@ final public class S3SourceConfig extends AbstractConfig { public static final String AWS_S3_ENDPOINT_CONFIG = "aws.s3.endpoint"; - @Deprecated - public static final String AWS_S3_ENDPOINT = "aws_s3_endpoint"; - - @Deprecated - public static final String AWS_S3_REGION = "aws_s3_region"; - public static final String AWS_STS_ROLE_ARN = "aws.sts.role.arn"; - public static final String AWS_STS_ROLE_EXTERNAL_ID = "aws.sts.role.external.id"; - public static final String AWS_STS_ROLE_SESSION_NAME = "aws.sts.role.session.name"; public static final String AWS_STS_ROLE_SESSION_DURATION = "aws.sts.role.session.duration"; public static final String AWS_STS_CONFIG_ENDPOINT = "aws.sts.config.endpoint"; - private static final String GROUP_AWS = "AWS"; private static final String GROUP_AWS_STS = "AWS_STS"; private static final String GROUP_OTHER = "OTHER_CFG"; - private static final String GROUP_OFFSET_TOPIC = "OFFSET_TOPIC"; - - private static final String GROUP_FILE = "FILE_SPECIFIC"; - private static final String GROUP_S3_RETRY_BACKOFF_POLICY = "S3 retry backoff policy"; // Default values from AWS SDK, since they are hidden public static final int AWS_S3_RETRY_BACKOFF_DELAY_MS_DEFAULT = 100; public static final int AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_DEFAULT = 20_000; - - @Deprecated - public static final String AWS_SECRET_ACCESS_KEY = "aws_secret_access_key"; - - @Deprecated - public static final String AWS_S3_PREFIX_CONFIG = "aws.s3.prefix"; - - @Deprecated - public static final String AWS_S3_PREFIX = "aws_s3_prefix"; - public static final String SCHEMA_REGISTRY_URL = "schema.registry.url"; - - public static final String VALUE_SERIALIZER = "value.serializer"; // ex : - // io.confluent.kafka.serializers.KafkaAvroSerializer - + public static final String 
VALUE_SERIALIZER = "value.serializer"; public static final String AWS_ACCESS_KEY_ID_CONFIG = "aws.access.key.id"; public static final String AWS_SECRET_ACCESS_KEY_CONFIG = "aws.secret.access.key"; - public static final String AWS_CREDENTIALS_PROVIDER_CONFIG = "aws.credentials.provider"; - public static final String AWS_CREDENTIAL_PROVIDER_DEFAULT = "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"; - public static final String AWS_S3_BUCKET_NAME_CONFIG = "aws.s3.bucket.name"; - public static final String AWS_S3_SSE_ALGORITHM_CONFIG = "aws.s3.sse.algorithm"; - public static final String TARGET_TOPIC_PARTITIONS = "topic.partitions"; public static final String TARGET_TOPICS = "topics"; - - public static final String START_MARKER_KEY = "aws.s3.start.marker"; public static final String FETCH_PAGE_SIZE = "aws.s3.fetch.page.size"; - public static final String MAX_POLL_RECORDS = "max.poll.records"; - public static final String KEY_CONVERTER = "key.converter"; public static final String VALUE_CONVERTER = "value.converter"; - public static final int S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT = 3; - public static final String OUTPUT_FORMAT = "output.format"; - - public static final String AVRO_OUTPUT_FORMAT = "avro"; - - public static final String PARQUET_OUTPUT_FORMAT = "parquet"; - - public static final String JSON_OUTPUT_FORMAT = "json"; - - public static final String BYTE_OUTPUT_FORMAT = "bytes"; + public static final String OUTPUT_FORMAT_KEY = "output.format"; + public static final String SCHEMAS_ENABLE = "schemas.enable"; public S3SourceConfig(final Map<String, String> properties) { super(configDef(), preprocessProperties(properties)); @@ -184,11 +157,12 @@ public static ConfigDef configDef() { private static void addSchemaRegistryGroup(final ConfigDef configDef) { int srCounter = 0; configDef.define(SCHEMA_REGISTRY_URL, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", GROUP_AWS, srCounter++, ConfigDef.Width.NONE, + ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", GROUP_OTHER, srCounter++, ConfigDef.Width.NONE, SCHEMA_REGISTRY_URL); - configDef.define(OUTPUT_FORMAT, ConfigDef.Type.STRING, BYTE_OUTPUT_FORMAT, new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "Output format avro/bytearray", GROUP_AWS, srCounter++, // NOPMD - ConfigDef.Width.NONE, OUTPUT_FORMAT); + configDef.define(OUTPUT_FORMAT_KEY, ConfigDef.Type.STRING, OutputFormat.BYTES.getFormat(), + new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "Output format avro/json/parquet/bytes", + GROUP_OTHER, srCounter++, // NOPMD + ConfigDef.Width.NONE, OUTPUT_FORMAT_KEY); configDef.define(VALUE_SERIALIZER, ConfigDef.Type.CLASS, "io.confluent.kafka.serializers.KafkaAvroSerializer", ConfigDef.Importance.MEDIUM, "Value serializer", GROUP_OTHER, srCounter++, // NOPMD diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java index 0655f1b51..f756d68a9 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java @@ -16,7 +16,7 @@ package io.aiven.kafka.connect.s3.source.output; -import static io.aiven.kafka.connect.s3.source.utils.RecordProcessor.SCHEMAS_ENABLE; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMAS_ENABLE; import java.io.IOException; import 
java.io.InputStream; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java new file mode 100644 index 000000000..3b734eee8 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java @@ -0,0 +1,45 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.output; + +public enum OutputFormat { + AVRO("avro"), PARQUET("parquet"), JSON("json"), BYTES("bytes"); + + private final String format; + + OutputFormat(final String format) { + this.format = format; + } + + public String getFormat() { + return format; + } + + public static OutputFormat valueOfFormat(final String outFormat) { + for (final OutputFormat outputFormat : values()) { + if (outputFormat.format.equalsIgnoreCase(outFormat)) { + return outputFormat; + } + } + throw new IllegalArgumentException("Unknown outFormat: " + outFormat); + } + + @Override + public String toString() { + return format; + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java index e0b0d2e2d..df7f86338 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java @@ -16,25 +16,21 @@ package io.aiven.kafka.connect.s3.source.output; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AVRO_OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.BYTE_OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.JSON_OUTPUT_FORMAT; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.PARQUET_OUTPUT_FORMAT; - public final class OutputWriterFactory { private OutputWriterFactory() { - throw new UnsupportedOperationException("Class cannot be instantiated"); + // hidden } public static OutputWriter getWriter(final String outputFormat, final String bucket) { - switch (outputFormat) { - case AVRO_OUTPUT_FORMAT : + final OutputFormat outputFormatEnum = OutputFormat.valueOfFormat(outputFormat); + switch (outputFormatEnum) { + case AVRO : return new AvroWriter(bucket); - case PARQUET_OUTPUT_FORMAT : + case PARQUET : return new ParquetWriter(bucket); - case JSON_OUTPUT_FORMAT : + case JSON : return new JsonWriter(bucket); - case BYTE_OUTPUT_FORMAT : + case BYTES : return new ByteArrayWriter(bucket); default : throw new IllegalArgumentException("Unknown output format: " + outputFormat); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index 
f499313e4..03e562085 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -38,15 +38,6 @@ public class OffsetManager { private final Map<Map<String, Object>, Map<String, Object>> offsets; - /** - * Constructor for OffsetManager. Initializes with the task context and S3 source configuration, and retrieves - * offsets. - * - * @param context - * SourceTaskContext that provides access to the offset storage - * @param s3SourceConfig - * S3SourceConfig that contains the source configuration details - */ public OffsetManager(final SourceTaskContext context, final S3SourceConfig s3SourceConfig) { final String s3Bucket = s3SourceConfig.getString(S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG); final Set<Integer> partitions = parsePartitions(s3SourceConfig); @@ -63,11 +54,6 @@ public OffsetManager(final SourceTaskContext context, final S3SourceConfig s3Sou .collect(toMap(entry -> new HashMap<>(entry.getKey()), entry -> new HashMap<>(entry.getValue()))); } - /** - * Fetches all offsets for the current partitions and topics from the context. - * - * @return Map of partition keys and their corresponding offsets - */ public Map<Map<String, Object>, Map<String, Object>> getOffsets() { return offsets; } @@ -76,12 +62,6 @@ public long getIncrementedOffsetForPartition(final Map<String, Object> partition return (long) (offsets.get(partitionMap)).get(OFFSET_KEY) + 1L; } - /** - * Updates the offset for a specific partition. - * - * @param partitionMap - * The partition map. - */ public void updateOffset(final Map<String, Object> partitionMap, final long currentOffset) { final Map<String, Object> newOffset = new HashMap<>(); // increment offset id by 1 @@ -89,25 +69,11 @@ public void updateOffset(final Map<String, Object> partitionMap, final long curr offsets.put(partitionMap, newOffset); } - /** - * Helper method to parse partitions from the configuration. - * - * @param s3SourceConfig - * The S3 source configuration. - * @return Set of partitions. - */ private static Set<Integer> parsePartitions(final S3SourceConfig s3SourceConfig) { final String partitionString = s3SourceConfig.getString(S3SourceConfig.TARGET_TOPIC_PARTITIONS); return Arrays.stream(partitionString.split(",")).map(Integer::parseInt).collect(Collectors.toSet()); } - /** - * Helper method to parse topics from the configuration. - * - * @param s3SourceConfig - * The S3 source configuration. - * @return Set of topics. - */ private static Set<String> parseTopics(final S3SourceConfig s3SourceConfig) { final String topicString = s3SourceConfig.getString(S3SourceConfig.TARGET_TOPICS); return Arrays.stream(topicString.split(",")).collect(Collectors.toSet()); @@ -120,17 +86,6 @@ String getFirstConfiguredTopic(final S3SourceConfig s3SourceConfig) throws Conne .orElseThrow(() -> new ConnectException("Topic could not be derived")); } - /** - * Builds partition keys to be used for offset retrieval. - * - * @param bucket - * The S3 bucket name. - * @param partitions - * The set of partitions. - * @param topics - * The set of topics. - * @return List of partition keys (maps) used for fetching offsets. 
- */ private static List<Map<String, Object>> buildPartitionKeys(final String bucket, final Set<Integer> partitions, final Set<String> topics) { final List<Map<String, Object>> partitionKeys = new ArrayList<>(); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index fad361049..9356dede2 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -33,8 +33,6 @@ public final class RecordProcessor { - public static final String SCHEMAS_ENABLE = "schemas.enable"; - private RecordProcessor() { } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 1c958e2a4..64069bf3e 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -43,23 +43,21 @@ import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectSummary; -import com.fasterxml.jackson.databind.ObjectMapper; /** * Iterator that processes S3 files and creates Kafka source records. Supports different output formats (Avro, JSON, * Parquet). */ -@SuppressWarnings("PMD.ExcessiveImports") public final class SourceRecordIterator implements Iterator<List<AivenS3SourceRecord>> { - - public static final Pattern DEFAULT_PATTERN = Pattern - .compile("(?<topic>[^/]+?)-" + "(?<partition>\\d{5})-" + "(?<offset>\\d{12})" + "\\.(?<extension>[^.]+)$"); - public static final String PATTERN_TOPIC_KEY = "topic"; - public static final String PATTERN_PARTITION_KEY = "partition"; + public static final String PATTERN_TOPIC_KEY = "topicName"; + public static final String PATTERN_PARTITION_KEY = "partitionId"; public static final String OFFSET_KEY = "offset"; - private String currentKey; - final ObjectMapper objectMapper = new ObjectMapper(); + public static final Pattern FILE_DEFAULT_PATTERN = Pattern + .compile("(?<topicName>[^/]+?)-" + "(?<partitionId>\\d{5})" + "\\.(?<fileExtension>[^.]+)$"); // ex : + // topic-00001.txt + private String currentObjectKey; + private Iterator<S3ObjectSummary> nextFileIterator; private Iterator<List<ConsumerRecord<byte[], byte[]>>> recordIterator = Collections.emptyIterator(); @@ -97,7 +95,7 @@ private void nextS3Object() { try { final S3ObjectSummary file = nextFileIterator.next(); - currentKey = file.getKey(); + currentObjectKey = file.getKey(); recordIterator = createIteratorForCurrentFile(); } catch (IOException e) { throw new AmazonClientException(e); @@ -105,25 +103,23 @@ private void nextS3Object() { } private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentFile() throws IOException { - final S3Object s3Object = s3Client.getObject(bucketName, currentKey); + final S3Object s3Object = s3Client.getObject(bucketName, currentObjectKey); try (InputStream inputStream = fileReader.getContent(s3Object)) { - final Matcher matcher = DEFAULT_PATTERN.matcher(currentKey); - String topic; - int partition = 0; - long startOffset = 0L; - if (matcher.find()) { - topic = matcher.group(PATTERN_TOPIC_KEY); - partition = 
Integer.parseInt(matcher.group(PATTERN_PARTITION_KEY)); - startOffset = Long.parseLong(matcher.group(OFFSET_KEY)); + String topicName; + int defaultPartitionId = 0; + final long defaultStartOffsetId = 0L; + + final Matcher fileMatcher = FILE_DEFAULT_PATTERN.matcher(currentObjectKey); + if (fileMatcher.find()) { + topicName = fileMatcher.group(PATTERN_TOPIC_KEY); + defaultPartitionId = Integer.parseInt(fileMatcher.group(PATTERN_PARTITION_KEY)); } else { - topic = offsetManager.getFirstConfiguredTopic(s3SourceConfig); + topicName = offsetManager.getFirstConfiguredTopic(s3SourceConfig); } - final String finalTopic = topic; - final int finalPartition = partition; - final long finalStartOffset = startOffset; + final String finalTopic = topicName; - return getObjectIterator(inputStream, finalTopic, finalPartition, finalStartOffset, outputWriter); + return getObjectIterator(inputStream, finalTopic, defaultPartitionId, defaultStartOffsetId, outputWriter); } } @@ -136,7 +132,7 @@ private Iterator<List<ConsumerRecord<byte[], byte[]>>> getObjectIterator(final I private List<ConsumerRecord<byte[], byte[]>> readNext() { try { - final Optional<byte[]> optionalKeyBytes = Optional.ofNullable(currentKey) + final Optional<byte[]> optionalKeyBytes = Optional.ofNullable(currentObjectKey) .map(k -> k.getBytes(StandardCharsets.UTF_8)); final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = new ArrayList<>(); outputWriter.handleValueData(optionalKeyBytes, valueInputStream, topic, consumerRecordList, From 97551a5a78725f6bf739164eae07ed626e103bc9 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 7 Oct 2024 11:24:33 +0200 Subject: [PATCH 27/90] Update from review --- gradle-config/spotbugs-exclude.xml | 17 ----- .../s3/source/config/S3SourceConfig.java | 54 ++++------------ .../connect/s3/source/output/AvroWriter.java | 19 ++++-- .../s3/source/output/ByteArrayWriter.java | 13 +++- .../connect/s3/source/output/JsonWriter.java | 21 +++++-- .../s3/source/output/OutputWriter.java | 28 +++++---- .../s3/source/output/ParquetWriter.java | 9 +-- .../s3/source/utils/AivenS3SourceRecord.java | 22 +++++-- .../connect/s3/source/utils/FileReader.java | 6 -- .../s3/source/utils/OffsetManager.java | 22 ++++--- .../connect/s3/source/utils/ParquetUtils.java | 63 ++++++++++++------- .../s3/source/utils/RecordProcessor.java | 8 +-- .../s3/source/utils/SourceRecordIterator.java | 28 +++------ 13 files changed, 153 insertions(+), 157 deletions(-) diff --git a/gradle-config/spotbugs-exclude.xml b/gradle-config/spotbugs-exclude.xml index 0ab87e082..48e98ab56 100644 --- a/gradle-config/spotbugs-exclude.xml +++ b/gradle-config/spotbugs-exclude.xml @@ -19,27 +19,10 @@ <Class name="io.aiven.kafka.connect.common.output.parquet.ParquetOutputWriterTest$ParquetInputFile" /> <Bug pattern="CT_CONSTRUCTOR_THROW" /> </Match> - <Match> - <Class name="io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator$1" /> - <Bug pattern="CT_CONSTRUCTOR_THROW" /> - </Match> <Match> <Class name="io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator" /> <Bug pattern="EI_EXPOSE_REP2" /> </Match> - <Match> - <Class name="io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord" /> - <Bug pattern="EI_EXPOSE_REP2" /> - </Match> - <Match> - <Class name="io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord" /> - <Bug pattern="EI_EXPOSE_REP" /> - </Match> - <Match> - <Class name="io.aiven.kafka.connect.s3.source.utils.OffsetManager" /> - <Bug pattern="EI_EXPOSE_REP" /> - </Match> - <!-- Test classes --> 
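Editorial note (not part of the patch): the SpotBugs `EI_EXPOSE_REP`/`EI_EXPOSE_REP2` exclusions for `AivenS3SourceRecord` and `OffsetManager` can be removed above because the hunks that follow in this same commit stop exposing internal mutable state: maps are copied on construction and returned as unmodifiable views. A minimal, illustrative sketch of that defensive-copy pattern is shown below; the class name is a stand-in, only the field and accessor shape mirror `AivenS3SourceRecord`.

```java
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

// Illustrative stand-in showing the pattern applied in this patch:
// copy mutable input on the way in, expose an unmodifiable view on the way out.
class RecordState {
    private final Map<String, Object> partitionMap;

    RecordState(final Map<String, Object> partitionMap) {
        // defensive copy: later changes by the caller cannot leak into this object
        this.partitionMap = new HashMap<>(partitionMap);
    }

    Map<String, Object> getPartitionMap() {
        // unmodifiable view: callers cannot mutate internal state through the getter
        return Collections.unmodifiableMap(partitionMap);
    }
}
```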
diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 51be2bc7b..07633efee 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -45,24 +45,8 @@ final public class S3SourceConfig extends AbstractConfig { public static final Logger LOGGER = LoggerFactory.getLogger(S3SourceConfig.class); - @Deprecated - public static final String AWS_ACCESS_KEY_ID = "aws_access_key_id"; - - @Deprecated - public static final String AWS_S3_ENDPOINT = "aws_s3_endpoint"; - - @Deprecated - public static final String AWS_S3_REGION = "aws_s3_region"; - - @Deprecated - public static final String AWS_SECRET_ACCESS_KEY = "aws_secret_access_key"; - - @Deprecated public static final String AWS_S3_PREFIX_CONFIG = "aws.s3.prefix"; - @Deprecated - public static final String AWS_S3_PREFIX = "aws_s3_prefix"; - public static final String AWS_S3_RETRY_BACKOFF_DELAY_MS_CONFIG = "aws.s3.backoff.delay.ms"; public static final String AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_CONFIG = "aws.s3.backoff.max.delay.ms"; @@ -116,12 +100,12 @@ static Map<String, String> preprocessProperties(final Map<String, String> proper } private static Map<String, String> handleDeprecatedYyyyUppercase(final Map<String, String> properties) { - if (!properties.containsKey(AWS_S3_PREFIX_CONFIG) && !properties.containsKey(AWS_S3_PREFIX)) { + if (!properties.containsKey(AWS_S3_PREFIX_CONFIG)) { return properties; } final var result = new HashMap<>(properties); - for (final var prop : List.of(AWS_S3_PREFIX_CONFIG, AWS_S3_PREFIX)) { + for (final var prop : List.of(AWS_S3_PREFIX_CONFIG)) { if (properties.containsKey(prop)) { String template = properties.get(prop); final String originalTemplate = template; @@ -298,10 +282,10 @@ private static void addAwsConfigGroup(final ConfigDef configDef) { ConfigDef.Importance.MEDIUM, "AWS S3 Region, e.g. us-east-1", GROUP_AWS, awsGroupCounter++, // NOPMD // UnusedAssignment ConfigDef.Width.NONE, AWS_S3_REGION_CONFIG); - configDef.define(AWS_S3_REGION, ConfigDef.Type.STRING, null, new AwsRegionValidator(), + configDef.define(AWS_S3_REGION_CONFIG, ConfigDef.Type.STRING, null, new AwsRegionValidator(), ConfigDef.Importance.MEDIUM, "AWS S3 Region, e.g. us-east-1", GROUP_AWS, awsGroupCounter++, // NOPMD // UnusedAssignment - ConfigDef.Width.NONE, AWS_S3_REGION); + ConfigDef.Width.NONE, AWS_S3_REGION_CONFIG); } protected static class AwsRegionValidator implements ConfigDef.Validator { @@ -350,54 +334,42 @@ public int getS3RetryBackoffMaxRetries() { return getInt(AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG); } - public Region getAwsS3Region() { + Region getAwsS3Region() { // we have priority of properties if old one not set or both old and new one set // the new property value will be selected if (Objects.nonNull(getString(AWS_S3_REGION_CONFIG))) { return RegionUtils.getRegion(getString(AWS_S3_REGION_CONFIG)); - } else if (Objects.nonNull(getString(AWS_S3_REGION))) { - return RegionUtils.getRegion(getString(AWS_S3_REGION)); } else { return RegionUtils.getRegion(Regions.US_EAST_1.getName()); } } - public String getAwsS3EndPoint() { - return Objects.nonNull(getString(AWS_S3_ENDPOINT_CONFIG)) - ? 
getString(AWS_S3_ENDPOINT_CONFIG) - : getString(AWS_S3_ENDPOINT); + String getAwsS3EndPoint() { + return getString(AWS_S3_ENDPOINT_CONFIG); } - public boolean hasAwsStsRole() { + boolean hasAwsStsRole() { return getStsRole().isValid(); } - public AwsStsRole getStsRole() { + AwsStsRole getStsRole() { return new AwsStsRole(getString(AWS_STS_ROLE_ARN), getString(AWS_STS_ROLE_EXTERNAL_ID), getString(AWS_STS_ROLE_SESSION_NAME), getInt(AWS_STS_ROLE_SESSION_DURATION)); } - public boolean hasStsEndpointConfig() { + boolean hasStsEndpointConfig() { return getStsEndpointConfig().isValid(); } - public AwsStsEndpointConfig getStsEndpointConfig() { + AwsStsEndpointConfig getStsEndpointConfig() { return new AwsStsEndpointConfig(getString(AWS_STS_CONFIG_ENDPOINT), getString(AWS_S3_REGION_CONFIG)); } - public AwsAccessSecret getAwsCredentials() { - return getNewAwsCredentials().isValid() ? getNewAwsCredentials() : getOldAwsCredentials(); - } - - public AwsAccessSecret getNewAwsCredentials() { + AwsAccessSecret getAwsCredentials() { return new AwsAccessSecret(getPassword(AWS_ACCESS_KEY_ID_CONFIG), getPassword(AWS_SECRET_ACCESS_KEY_CONFIG)); } - public AwsAccessSecret getOldAwsCredentials() { - return new AwsAccessSecret(getPassword(AWS_ACCESS_KEY_ID), getPassword(AWS_SECRET_ACCESS_KEY)); - } - - public AWSCredentialsProvider getCustomCredentialsProvider() { + AWSCredentialsProvider getCustomCredentialsProvider() { return getConfiguredInstance(AWS_CREDENTIALS_PROVIDER_CONFIG, AWSCredentialsProvider.class); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java index 666bad98f..c9dfc16f1 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java @@ -38,9 +38,12 @@ import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumReader; import org.apache.avro.io.DecoderFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class AvroWriter implements OutputWriter { + private static final Logger LOGGER = LoggerFactory.getLogger(AvroWriter.class); private final String bucketName; public AvroWriter(final String bucketName) { @@ -56,25 +59,31 @@ public void configureValueConverter(final Map<String, String> config, final S3So public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, - final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) throws IOException { + final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) { final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); DecoderFactory.get().binaryDecoder(inputStream, null); final List<GenericRecord> records = readAvroRecords(inputStream, datumReader); for (final GenericRecord record : records) { final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), topic, s3SourceConfig); - consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, this.bucketName, topic, - topicPartition, offsetManager, currentOffsets, startOffset)); + if (valueBytes.length > 0) { + consumerRecordList.add(getConsumerRecord(optionalKeyBytes, 
valueBytes, this.bucketName, topic, + topicPartition, offsetManager, currentOffsets, startOffset)); + } } } - private List<GenericRecord> readAvroRecords(final InputStream content, final DatumReader<GenericRecord> datumReader) - throws IOException { + private List<GenericRecord> readAvroRecords(final InputStream content, + final DatumReader<GenericRecord> datumReader) { final List<GenericRecord> records = new ArrayList<>(); try (SeekableByteArrayInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { reader.forEach(records::add); + } catch (IOException e) { + LOGGER.error("Error in reading s3 object stream " + e.getMessage()); } + } catch (IOException e) { + LOGGER.error("Error in reading s3 object stream " + e.getMessage()); } return records; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java index 8b9a33d08..c1a43faa0 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java @@ -28,8 +28,11 @@ import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import com.amazonaws.util.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class ByteArrayWriter implements OutputWriter { + private static final Logger LOGGER = LoggerFactory.getLogger(AvroWriter.class); private final String bucketName; @@ -46,8 +49,12 @@ public void configureValueConverter(final Map<String, String> config, final S3So public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, - final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) throws IOException { - consumerRecordList.add(getConsumerRecord(optionalKeyBytes, IOUtils.toByteArray(inputStream), this.bucketName, - topic, topicPartition, offsetManager, currentOffsets, startOffset)); + final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) { + try { + consumerRecordList.add(getConsumerRecord(optionalKeyBytes, IOUtils.toByteArray(inputStream), + this.bucketName, topic, topicPartition, offsetManager, currentOffsets, startOffset)); + } catch (IOException e) { + LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + } } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java index f756d68a9..a734d3b1b 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java @@ -50,13 +50,22 @@ public void configureValueConverter(final Map<String, String> config, final S3So public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, - final OffsetManager offsetManager, final Map<Map<String, 
Object>, Long> currentOffsets) throws IOException { - consumerRecordList.add(getConsumerRecord(optionalKeyBytes, serializeJsonData(inputStream), this.bucketName, - topic, topicPartition, offsetManager, currentOffsets, startOffset)); + final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) { + final byte[] valueBytes = serializeJsonData(inputStream); + if (valueBytes.length > 0) { + consumerRecordList.add(getConsumerRecord(optionalKeyBytes, serializeJsonData(inputStream), this.bucketName, + topic, topicPartition, offsetManager, currentOffsets, startOffset)); + } } - private byte[] serializeJsonData(final InputStream inputStream) throws IOException { - final JsonNode jsonNode = objectMapper.readTree(inputStream); - return objectMapper.writeValueAsBytes(jsonNode); + private byte[] serializeJsonData(final InputStream inputStream) { + final JsonNode jsonNode; + try { + jsonNode = objectMapper.readTree(inputStream); + return objectMapper.writeValueAsBytes(jsonNode); + } catch (IOException e) { + LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + } + return new byte[0]; } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java index 97fef9bef..11cbd8081 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java @@ -25,6 +25,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.lang.reflect.InvocationTargetException; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -38,14 +39,17 @@ import io.confluent.kafka.serializers.KafkaAvroSerializer; import org.apache.avro.generic.GenericRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public interface OutputWriter { + Logger LOGGER = LoggerFactory.getLogger(AvroWriter.class); + void configureValueConverter(Map<String, String> config, S3SourceConfig s3SourceConfig); void handleValueData(Optional<byte[]> optionalKeyBytes, InputStream inputStream, String topic, List<ConsumerRecord<byte[], byte[]>> consumerRecordList, S3SourceConfig s3SourceConfig, int topicPartition, - long startOffset, OffsetManager offsetManager, Map<Map<String, Object>, Long> currentOffsets) - throws IOException; + long startOffset, OffsetManager offsetManager, Map<Map<String, Object>, Long> currentOffsets); default ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> key, final byte[] value, final String bucketName, final String topic, final int topicPartition, final OffsetManager offsetManager, @@ -58,34 +62,32 @@ default ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> long currentOffset; if (offsetManager.getOffsets().containsKey(partitionMap)) { - currentOffset = offsetManager.getIncrementedOffsetForPartition(partitionMap); + currentOffset = offsetManager.incrementAndUpdateOffsetMap(partitionMap); } else { currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); + currentOffsets.put(partitionMap, currentOffset + 1); } - final ConsumerRecord<byte[], byte[]> record = new ConsumerRecord<>(topic, topicPartition, currentOffset, - key.orElse(null), value); - - offsetManager.updateOffset(partitionMap, currentOffset); - - return record; + return new ConsumerRecord<>(topic, topicPartition, 
currentOffset, key.orElse(null), value); } - @Deprecated default byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String topic, - final S3SourceConfig s3SourceConfig) throws IOException { + final S3SourceConfig s3SourceConfig) { final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) + .getDeclaredConstructor() .newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { avroSerializer.configure(config, false); for (final GenericRecord avroRecord : avroRecords) { out.write(avroSerializer.serialize(topic, avroRecord)); } return out.toByteArray(); - } catch (InstantiationException | IllegalAccessException e) { - throw new IllegalStateException("Failed to initialize serializer", e); + } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException + | IOException e) { + LOGGER.error("Error in reading s3 object stream for topic " + topic + " with error : " + e.getMessage()); } + return new byte[0]; } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java index 4276808f8..a1ac3f14d 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java @@ -18,7 +18,6 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; -import java.io.IOException; import java.io.InputStream; import java.util.Collections; import java.util.List; @@ -49,13 +48,15 @@ public void configureValueConverter(final Map<String, String> config, final S3So public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, - final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) throws IOException { + final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) { final List<GenericRecord> records = ParquetUtils.getRecords(inputStream, topic); for (final GenericRecord record : records) { final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), topic, s3SourceConfig); - consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, this.bucketName, topic, - topicPartition, offsetManager, currentOffsets, startOffset)); + if (valueBytes.length > 0) { + consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, this.bucketName, topic, + topicPartition, offsetManager, currentOffsets, startOffset)); + } } } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java index e3ea1bc77..00e924e9f 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java @@ -17,7 +17,13 @@ package io.aiven.kafka.connect.s3.source.utils; import java.util.Arrays; 
+import java.util.Collections; +import java.util.HashMap; import java.util.Map; +import java.util.Optional; + +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.source.SourceRecord; public class AivenS3SourceRecord { private final Map<String, Object> partitionMap; @@ -29,8 +35,9 @@ public class AivenS3SourceRecord { public AivenS3SourceRecord(final Map<String, Object> partitionMap, final Map<String, Object> offsetMap, final String toTopic, final int topicPartition, final byte[] recordKey, final byte[] recordValue) { - this.partitionMap = partitionMap; - this.offsetMap = offsetMap; + this.partitionMap = new HashMap<>(partitionMap); + this.offsetMap = new HashMap<>(offsetMap); + this.toTopic = toTopic; this.topicPartition = topicPartition; this.recordKey = Arrays.copyOf(recordKey, recordKey.length); @@ -38,11 +45,11 @@ public AivenS3SourceRecord(final Map<String, Object> partitionMap, final Map<Str } public Map<String, Object> getPartitionMap() { - return partitionMap; + return Collections.unmodifiableMap(partitionMap); } public Map<String, Object> getOffsetMap() { - return offsetMap; + return Collections.unmodifiableMap(offsetMap); } public String getToTopic() { @@ -60,4 +67,11 @@ public byte[] key() { public byte[] value() { return recordValue.clone(); } + + public SourceRecord getSourceRecord(final String topic, final Optional<SchemaAndValue> keyData, + final SchemaAndValue schemaAndValue) { + return new SourceRecord(getPartitionMap(), getOffsetMap(), topic, partition(), + keyData.map(SchemaAndValue::schema).orElse(null), keyData.map(SchemaAndValue::value).orElse(null), + schemaAndValue.schema(), schemaAndValue.value()); + } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index 2dfcad3ea..2b881001c 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -19,7 +19,6 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; import java.io.IOException; -import java.io.InputStream; import java.util.ArrayList; import java.util.List; @@ -28,7 +27,6 @@ import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.ListObjectsRequest; import com.amazonaws.services.s3.model.ObjectListing; -import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectSummary; public class FileReader { @@ -48,8 +46,4 @@ List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOExc return new ArrayList<>(objectListing.getObjectSummaries()); } - InputStream getContent(final S3Object object) { - return object.getObjectContent(); - } - } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index 03e562085..05cc8cbdd 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -25,6 +25,7 @@ import java.net.ConnectException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -55,18 +56,19 @@ public 
OffsetManager(final SourceTaskContext context, final S3SourceConfig s3Sou } public Map<Map<String, Object>, Map<String, Object>> getOffsets() { - return offsets; + return Collections.unmodifiableMap(offsets); } - public long getIncrementedOffsetForPartition(final Map<String, Object> partitionMap) { - return (long) (offsets.get(partitionMap)).get(OFFSET_KEY) + 1L; - } - - public void updateOffset(final Map<String, Object> partitionMap, final long currentOffset) { - final Map<String, Object> newOffset = new HashMap<>(); - // increment offset id by 1 - newOffset.put(OFFSET_KEY, currentOffset + 1); - offsets.put(partitionMap, newOffset); + public long incrementAndUpdateOffsetMap(final Map<String, Object> partitionMap) { + if (offsets.containsKey(partitionMap)) { + final Map<String, Object> offsetValue = offsets.get(partitionMap); + if (offsetValue.containsKey(OFFSET_KEY)) { + final long newOffsetVal = (long) offsetValue.get(OFFSET_KEY) + 1L; + offsetValue.put(OFFSET_KEY, newOffsetVal); + return newOffsetVal; + } + } + return 0L; } private static Set<Integer> parsePartitions(final S3SourceConfig s3SourceConfig) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java index d5046c657..d01276409 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java @@ -20,6 +20,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.channels.Channels; +import java.nio.channels.SeekableByteChannel; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -27,23 +28,27 @@ import java.util.ArrayList; import java.util.List; -import org.apache.kafka.connect.errors.ConnectException; +import io.aiven.kafka.connect.s3.source.output.AvroWriter; import org.apache.avro.generic.GenericRecord; import org.apache.parquet.avro.AvroParquetReader; import org.apache.parquet.io.DelegatingSeekableInputStream; import org.apache.parquet.io.InputFile; import org.apache.parquet.io.SeekableInputStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public final class ParquetUtils { + private static final Logger LOGGER = LoggerFactory.getLogger(AvroWriter.class); + public static final String TMP_DIR = "/tmp"; public static final int BUFFER_SIZE = 8192; private ParquetUtils() { /* hide constructor */ } - public static List<GenericRecord> getRecords(final InputStream inputStream, final String topic) throws IOException { + public static List<GenericRecord> getRecords(final InputStream inputStream, final String topic) { final Path tmpDir = Paths.get(TMP_DIR); final String timestamp = String.valueOf(Instant.now().toEpochMilli()); @@ -59,44 +64,54 @@ public static List<GenericRecord> getRecords(final InputStream inputStream, fina bytesRead = inputStream.read(buffer); } } catch (IOException e) { - throw new ConnectException("Error writing tmp parquet file", e); + LOGGER.error("Error in reading s3 object stream for topic " + topic + " with error : " + e.getMessage()); } - final var records = new ArrayList<GenericRecord>(); - final var seekableByteChannel = Files.newByteChannel(parquetFile); - try (var parquetReader = AvroParquetReader.<GenericRecord>builder(new InputFile() { - @Override - public long getLength() throws IOException { - return seekableByteChannel.size(); - } - - @Override - public 
SeekableInputStream newStream() { - return new DelegatingSeekableInputStream(Channels.newInputStream(seekableByteChannel)) { + try (SeekableByteChannel seekableByteChannel = Files.newByteChannel(parquetFile); + var parquetReader = AvroParquetReader.<GenericRecord>builder(new InputFile() { @Override - public long getPos() throws IOException { - return seekableByteChannel.position(); + public long getLength() throws IOException { + return seekableByteChannel.size(); } @Override - public void seek(final long value) throws IOException { - seekableByteChannel.position(value); + public SeekableInputStream newStream() { + return new DelegatingSeekableInputStream(Channels.newInputStream(seekableByteChannel)) { + @Override + public long getPos() throws IOException { + return seekableByteChannel.position(); + } + + @Override + public void seek(final long value) throws IOException { + seekableByteChannel.position(value); + } + }; } - }; - } - }).withCompatibility(false).build()) { + }).withCompatibility(false).build()) { var record = parquetReader.read(); while (record != null) { records.add(record); record = parquetReader.read(); } + } catch (IOException e) { + LOGGER.error("Error in reading s3 object stream " + e.getMessage()); } - if (Files.exists(parquetFile)) { - Files.delete(parquetFile); - } + + deleteTmpFile(parquetFile); return records; } + private static void deleteTmpFile(final Path parquetFile) { + if (Files.exists(parquetFile)) { + try { + Files.delete(parquetFile); + } catch (IOException e) { + LOGGER.error("Error in deleting tmp file " + e.getMessage()); + } + } + } + } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 9356dede2..10ae8fc67 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -67,13 +67,9 @@ private static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRe outputWriter.configureValueConverter(conversionConfig, s3SourceConfig); valueConverter.configure(conversionConfig, false); - final SchemaAndValue value = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); + final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); - final SourceRecord sourceRecord = new SourceRecord(aivenS3SourceRecord.getPartitionMap(), - aivenS3SourceRecord.getOffsetMap(), topic, aivenS3SourceRecord.partition(), - keyData.map(SchemaAndValue::schema).orElse(null), keyData.map(SchemaAndValue::value).orElse(null), - value.schema(), value.value()); - sourceRecordList.add(sourceRecord); + sourceRecordList.add(aivenS3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue)); } return sourceRecordList; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 64069bf3e..dd2902802 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -67,8 +67,6 @@ public final class SourceRecordIterator implements Iterator<List<AivenS3SourceRe private final String bucketName; private final AmazonS3 s3Client; - 
private final FileReader fileReader; - private final OutputWriter outputWriter; public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, @@ -78,7 +76,7 @@ public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 this.s3Client = s3Client; this.bucketName = bucketName; this.outputWriter = outputWriter; - this.fileReader = new FileReader(s3SourceConfig, bucketName); + final FileReader fileReader = new FileReader(s3SourceConfig, bucketName); try { final List<S3ObjectSummary> chunks = fileReader.fetchObjectSummaries(s3Client); nextFileIterator = chunks.iterator(); @@ -103,8 +101,8 @@ private void nextS3Object() { } private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentFile() throws IOException { - final S3Object s3Object = s3Client.getObject(bucketName, currentObjectKey); - try (InputStream inputStream = fileReader.getContent(s3Object)) { + try (S3Object s3Object = s3Client.getObject(bucketName, currentObjectKey); + InputStream inputStream = s3Object.getObjectContent()) { String topicName; int defaultPartitionId = 0; final long defaultStartOffsetId = 0L; @@ -131,19 +129,13 @@ private Iterator<List<ConsumerRecord<byte[], byte[]>>> getObjectIterator(final I private List<ConsumerRecord<byte[], byte[]>> nextRecord = readNext(); private List<ConsumerRecord<byte[], byte[]>> readNext() { - try { - final Optional<byte[]> optionalKeyBytes = Optional.ofNullable(currentObjectKey) - .map(k -> k.getBytes(StandardCharsets.UTF_8)); - final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = new ArrayList<>(); - outputWriter.handleValueData(optionalKeyBytes, valueInputStream, topic, consumerRecordList, - s3SourceConfig, topicPartition, startOffset, offsetManager, currentOffsets); - - return consumerRecordList; - - } catch (IOException e) { - throw new org.apache.kafka.connect.errors.ConnectException( - "Connect converters could not be instantiated.", e); - } + final Optional<byte[]> optionalKeyBytes = Optional.ofNullable(currentObjectKey) + .map(k -> k.getBytes(StandardCharsets.UTF_8)); + final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = new ArrayList<>(); + outputWriter.handleValueData(optionalKeyBytes, valueInputStream, topic, consumerRecordList, + s3SourceConfig, topicPartition, startOffset, offsetManager, currentOffsets); + + return consumerRecordList; } @Override From 58bcc39b6ffc13f3b6c1d6089ee22f43659c796c Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 7 Oct 2024 12:07:55 +0200 Subject: [PATCH 28/90] update deprecated code --- .../kafka/connect/s3/source/IntegrationTest.java | 1 - .../aiven/kafka/connect/s3/source/S3SourceTask.java | 12 ++++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 3ad82e761..849bbedc0 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -310,7 +310,6 @@ private static void writeToS3(final String topicName, final byte[] testDataBytes Files.delete(testFilePath); } - @Deprecated private Map<String, String> getConfig(final Map<String, String> config, final String topics) { config.put("connector.class", 
AivenKafkaConnectS3SourceConnector.class.getName()); config.put(AWS_ACCESS_KEY_ID_CONFIG, S3_ACCESS_KEY_ID); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 3227f8e62..e60046d19 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -20,6 +20,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT_KEY; +import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -97,12 +98,15 @@ public void start(final Map<String, String> props) { prepareReaderFromOffsetStorageReader(); } - @Deprecated private void initializeConverters() { try { - keyConverter = Optional.of((Converter) s3SourceConfig.getClass("key.converter").newInstance()); - valueConverter = (Converter) s3SourceConfig.getClass("value.converter").newInstance(); - } catch (InstantiationException | IllegalAccessException e) { + keyConverter = Optional + .of((Converter) s3SourceConfig.getClass("key.converter").getDeclaredConstructor().newInstance()); + valueConverter = (Converter) s3SourceConfig.getClass("value.converter") + .getDeclaredConstructor() + .newInstance(); + } catch (InstantiationException | IllegalAccessException | InvocationTargetException + | NoSuchMethodException e) { throw new ConnectException("Connect converters could not be instantiated.", e); } } From 0568832085628a47339b9119277d178ae5bd1a1e Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Tue, 8 Oct 2024 12:35:14 +0200 Subject: [PATCH 29/90] s3-source-connector/build.gradle.kts --- .../connect/s3/source/IntegrationTest.java | 34 +++-- .../s3/source/SchemaRegistryContainer.java | 2 +- .../kafka/connect/s3/source/S3SourceTask.java | 2 +- .../s3/source/config/S3SourceConfig.java | 4 - .../connect/s3/source/output/AvroWriter.java | 20 +-- .../s3/source/output/ByteArrayWriter.java | 16 +-- .../connect/s3/source/output/JsonWriter.java | 13 +- .../connect/s3/source/output/OutputUtils.java | 82 ++++++++++++ .../s3/source/output/OutputWriter.java | 47 ++----- .../s3/source/output/OutputWriterFactory.java | 10 +- .../s3/source/output/ParquetWriter.java | 98 ++++++++++++--- .../connect/s3/source/utils/ConnectUtils.java | 39 ++++++ .../connect/s3/source/utils/FileReader.java | 3 +- .../s3/source/utils/OffsetManager.java | 9 +- .../connect/s3/source/utils/ParquetUtils.java | 117 ------------------ .../s3/source/utils/SourceRecordIterator.java | 11 +- 16 files changed, 266 insertions(+), 241 deletions(-) create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 849bbedc0..cecbfdbd2 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ 
b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -240,21 +240,21 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted connectorConfig.put("value.converter.schema.registry.url", SCHEMA_REGISTRY.getSchemaRegistryUrl()); final String partition = "00000"; - final String offset = "000000000123"; - final String fileName = topicName + "-" + partition + "-" + offset + ".txt"; - - connectRunner.createConnector(connectorConfig); - final String tmpFilePath = "/tmp/users.parquet"; + final String fileName = topicName + "-" + partition + ".txt"; final String name1 = "testuser1"; final String name2 = "testuser2"; - writeParquetFile(tmpFilePath, name1, name2); - final Path path = Paths.get(tmpFilePath); + + connectRunner.createConnector(connectorConfig); + final Path path = getTmpFilePath(name1, name2); + try { s3Client.putObject(TEST_BUCKET_NAME, fileName, Files.newInputStream(path), null); } catch (final Exception e) { // NOPMD broad exception caught LOGGER.error("Error in reading file" + e.getMessage()); + } finally { + Files.delete(path); } - Files.delete(path); + final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 2, KAFKA_CONTAINER, SCHEMA_REGISTRY.getSchemaRegistryUrl()); assertThat(2).isEqualTo(records.size()); @@ -300,13 +300,12 @@ private static ByteArrayOutputStream getAvroRecord(final Schema schema, final in private static void writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) throws IOException { - // final String partition = "00000"; - // final String offset = "00000000012" + offsetId; - final String fileName = topicName + "-" + partitionId + ".txt"; - final Path testFilePath = Paths.get("/tmp/" + fileName); + final String filePrefix = topicName + "-" + partitionId; + final String fileSuffix = ".txt"; + final Path testFilePath = File.createTempFile(filePrefix, fileSuffix).toPath(); Files.write(testFilePath, testDataBytes); - saveToS3(TEST_BUCKET_NAME, "", fileName, testFilePath.toFile()); + saveToS3(TEST_BUCKET_NAME, "", filePrefix + fileSuffix, testFilePath.toFile()); Files.delete(testFilePath); } @@ -367,6 +366,15 @@ public static void writeParquetFile(final String tempFilePath, final String name } } + private static Path getTmpFilePath(final String name1, final String name2) throws IOException { + final String tmpFile = "users.parquet"; + final Path parquetFileDir = Files.createTempDirectory("parquet_tests"); + final String parquetFilePath = parquetFileDir.toAbsolutePath() + "/" + tmpFile; + + writeParquetFile(parquetFilePath, name1, name2); + return Paths.get(parquetFilePath); + } + private static void writeParquetFile(final String outputPath, final Schema schema, final String name1, final String name2) throws IOException { diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java index a27864718..e53f0a88b 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java @@ -1,5 +1,5 @@ /* - * Copyright 2020 Aiven Oy + * Copyright 2024 Aiven Oy * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
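
The temp-file handling in the integration-test hunks above (write the payload to a temporary file, upload it, then delete the local file in a `finally` block even if the upload fails) is a general pattern rather than anything specific to this connector. A minimal standalone sketch of that pattern, assuming an AWS SDK v1 `AmazonS3` client and illustrative bucket/key/prefix names (not the connector's own test code):

```java
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import com.amazonaws.services.s3.AmazonS3;

public final class TempFileUploadSketch {

    private TempFileUploadSketch() {
        // utility class, not meant to be instantiated
    }

    /**
     * Writes the given bytes to a temporary file, uploads it to S3 and always
     * deletes the local copy afterwards. Bucket and key are illustrative.
     */
    public static void uploadViaTempFile(final AmazonS3 s3Client, final String bucket, final String key,
            final byte[] payload) throws IOException {
        final Path tempFile = Files.createTempFile("s3-source-test-", ".txt");
        try {
            Files.write(tempFile, payload);
            s3Client.putObject(bucket, key, tempFile.toFile());
        } finally {
            // Clean up the local copy even when the upload throws.
            Files.delete(tempFile);
        }
    }
}
```

Deleting the file in `finally` keeps repeated test runs from leaking files in the system temp directory, which is also why the hunks above move the `Files.delete(...)` calls into `finally` blocks.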
diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index e60046d19..653a910b5 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -94,7 +94,7 @@ public void start(final Map<String, String> props) { initializeConverters(); initializeS3Client(); this.s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); - this.outputWriter = OutputWriterFactory.getWriter(s3SourceConfig.getString(OUTPUT_FORMAT_KEY), this.s3Bucket); + this.outputWriter = OutputWriterFactory.getWriter(s3SourceConfig.getString(OUTPUT_FORMAT_KEY)); prepareReaderFromOffsetStorageReader(); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 07633efee..ff12cc29a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -282,10 +282,6 @@ private static void addAwsConfigGroup(final ConfigDef configDef) { ConfigDef.Importance.MEDIUM, "AWS S3 Region, e.g. us-east-1", GROUP_AWS, awsGroupCounter++, // NOPMD // UnusedAssignment ConfigDef.Width.NONE, AWS_S3_REGION_CONFIG); - configDef.define(AWS_S3_REGION_CONFIG, ConfigDef.Type.STRING, null, new AwsRegionValidator(), - ConfigDef.Importance.MEDIUM, "AWS S3 Region, e.g. us-east-1", GROUP_AWS, awsGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, AWS_S3_REGION_CONFIG); } protected static class AwsRegionValidator implements ConfigDef.Validator { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java index c9dfc16f1..43b64ef7e 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java @@ -21,7 +21,6 @@ import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Optional; @@ -44,11 +43,6 @@ public class AvroWriter implements OutputWriter { private static final Logger LOGGER = LoggerFactory.getLogger(AvroWriter.class); - private final String bucketName; - - public AvroWriter(final String bucketName) { - this.bucketName = bucketName; - } @Override public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { @@ -56,21 +50,17 @@ public void configureValueConverter(final Map<String, String> config, final S3So } @Override + @SuppressWarnings("PMD.ExcessiveParameterList") public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, - final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) { + final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets, + final Map<String, Object> partitionMap) { final 
DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); DecoderFactory.get().binaryDecoder(inputStream, null); final List<GenericRecord> records = readAvroRecords(inputStream, datumReader); - for (final GenericRecord record : records) { - final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), topic, - s3SourceConfig); - if (valueBytes.length > 0) { - consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, this.bucketName, topic, - topicPartition, offsetManager, currentOffsets, startOffset)); - } - } + OutputUtils.buildConsumerRecordList(this, optionalKeyBytes, topic, consumerRecordList, s3SourceConfig, + topicPartition, startOffset, offsetManager, currentOffsets, records, partitionMap); } private List<GenericRecord> readAvroRecords(final InputStream content, diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java index c1a43faa0..269dd061f 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java @@ -32,13 +32,7 @@ import org.slf4j.LoggerFactory; public class ByteArrayWriter implements OutputWriter { - private static final Logger LOGGER = LoggerFactory.getLogger(AvroWriter.class); - - private final String bucketName; - - public ByteArrayWriter(final String bucketName) { - this.bucketName = bucketName; - } + private static final Logger LOGGER = LoggerFactory.getLogger(ByteArrayWriter.class); @Override public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { @@ -46,13 +40,15 @@ public void configureValueConverter(final Map<String, String> config, final S3So } @Override + @SuppressWarnings("PMD.ExcessiveParameterList") public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, - final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) { + final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets, + final Map<String, Object> partitionMap) { try { - consumerRecordList.add(getConsumerRecord(optionalKeyBytes, IOUtils.toByteArray(inputStream), - this.bucketName, topic, topicPartition, offsetManager, currentOffsets, startOffset)); + consumerRecordList.add(getConsumerRecord(optionalKeyBytes, IOUtils.toByteArray(inputStream), topic, + topicPartition, offsetManager, currentOffsets, startOffset, partitionMap)); } catch (IOException e) { LOGGER.error("Error in reading s3 object stream " + e.getMessage()); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java index a734d3b1b..5c8b0a382 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java @@ -34,27 +34,24 @@ public class JsonWriter implements OutputWriter { - private final String bucketName; final ObjectMapper objectMapper = new ObjectMapper(); - public JsonWriter(final String bucketName) { - 
this.bucketName = bucketName; - } - @Override public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { config.put(SCHEMAS_ENABLE, "false"); } @Override + @SuppressWarnings("PMD.ExcessiveParameterList") public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, - final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) { + final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets, + final Map<String, Object> partitionMap) { final byte[] valueBytes = serializeJsonData(inputStream); if (valueBytes.length > 0) { - consumerRecordList.add(getConsumerRecord(optionalKeyBytes, serializeJsonData(inputStream), this.bucketName, - topic, topicPartition, offsetManager, currentOffsets, startOffset)); + consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, topic, topicPartition, offsetManager, + currentOffsets, startOffset, partitionMap)); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java new file mode 100644 index 000000000..5ee31e387 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java @@ -0,0 +1,82 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source.output; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import org.apache.kafka.clients.consumer.ConsumerRecord; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; + +import io.confluent.kafka.serializers.KafkaAvroSerializer; +import org.apache.avro.generic.GenericRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +final public class OutputUtils { + private static final Logger LOGGER = LoggerFactory.getLogger(OutputUtils.class); + + private OutputUtils() { + // hidden + } + + static byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String topic, + final S3SourceConfig s3SourceConfig) { + final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, + s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + + try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) + .getDeclaredConstructor() + .newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { + avroSerializer.configure(config, false); + for (final GenericRecord avroRecord : avroRecords) { + out.write(avroSerializer.serialize(topic, avroRecord)); + } + return out.toByteArray(); + } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException + | IOException e) { + LOGGER.error("Error in reading s3 object stream for topic " + topic + " with error : " + e.getMessage()); + } + return new byte[0]; + } + + @SuppressWarnings("PMD.ExcessiveParameterList") + static void buildConsumerRecordList(final OutputWriter outputWriter, final Optional<byte[]> optionalKeyBytes, + final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, + final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, + final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets, + final List<GenericRecord> records, final Map<String, Object> partitionMap) { + for (final GenericRecord record : records) { + final byte[] valueBytes = OutputUtils.serializeAvroRecordToBytes(Collections.singletonList(record), topic, + s3SourceConfig); + if (valueBytes.length > 0) { + consumerRecordList.add(outputWriter.getConsumerRecord(optionalKeyBytes, valueBytes, topic, + topicPartition, offsetManager, currentOffsets, startOffset, partitionMap)); + } + } + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java index 11cbd8081..210f81b4d 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java @@ -16,18 +16,7 @@ package io.aiven.kafka.connect.s3.source.output; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.PARTITION; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.TOPIC; -import 
static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; import java.io.InputStream; -import java.lang.reflect.InvocationTargetException; -import java.util.Collections; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -37,8 +26,6 @@ import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; -import io.confluent.kafka.serializers.KafkaAvroSerializer; -import org.apache.avro.generic.GenericRecord; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,17 +34,17 @@ public interface OutputWriter { Logger LOGGER = LoggerFactory.getLogger(AvroWriter.class); void configureValueConverter(Map<String, String> config, S3SourceConfig s3SourceConfig); + + @SuppressWarnings("PMD.ExcessiveParameterList") void handleValueData(Optional<byte[]> optionalKeyBytes, InputStream inputStream, String topic, List<ConsumerRecord<byte[], byte[]>> consumerRecordList, S3SourceConfig s3SourceConfig, int topicPartition, - long startOffset, OffsetManager offsetManager, Map<Map<String, Object>, Long> currentOffsets); + long startOffset, OffsetManager offsetManager, Map<Map<String, Object>, Long> currentOffsets, + Map<String, Object> partitionMap); default ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> key, final byte[] value, - final String bucketName, final String topic, final int topicPartition, final OffsetManager offsetManager, - final Map<Map<String, Object>, Long> currentOffsets, final long startOffset) { - final Map<String, Object> partitionMap = new HashMap<>(); - partitionMap.put(BUCKET, bucketName); - partitionMap.put(TOPIC, topic); - partitionMap.put(PARTITION, topicPartition); + final String topic, final int topicPartition, final OffsetManager offsetManager, + final Map<Map<String, Object>, Long> currentOffsets, final long startOffset, + final Map<String, Object> partitionMap) { long currentOffset; @@ -70,24 +57,4 @@ default ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> return new ConsumerRecord<>(topic, topicPartition, currentOffset, key.orElse(null), value); } - - default byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String topic, - final S3SourceConfig s3SourceConfig) { - final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, - s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); - - try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) - .getDeclaredConstructor() - .newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { - avroSerializer.configure(config, false); - for (final GenericRecord avroRecord : avroRecords) { - out.write(avroSerializer.serialize(topic, avroRecord)); - } - return out.toByteArray(); - } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException - | IOException e) { - LOGGER.error("Error in reading s3 object stream for topic " + topic + " with error : " + e.getMessage()); - } - return new byte[0]; - } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java index df7f86338..b7f988cea 100644 --- 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java @@ -21,17 +21,17 @@ public final class OutputWriterFactory { private OutputWriterFactory() { // hidden } - public static OutputWriter getWriter(final String outputFormat, final String bucket) { + public static OutputWriter getWriter(final String outputFormat) { final OutputFormat outputFormatEnum = OutputFormat.valueOfFormat(outputFormat); switch (outputFormatEnum) { case AVRO : - return new AvroWriter(bucket); + return new AvroWriter(); case PARQUET : - return new ParquetWriter(bucket); + return new ParquetWriter(); case JSON : - return new JsonWriter(bucket); + return new JsonWriter(); case BYTES : - return new ByteArrayWriter(bucket); + return new ByteArrayWriter(); default : throw new IllegalArgumentException("Unknown output format: " + outputFormat); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java index a1ac3f14d..7a3070611 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java @@ -18,8 +18,16 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; +import java.io.File; +import java.io.IOException; import java.io.InputStream; -import java.util.Collections; +import java.io.OutputStream; +import java.nio.channels.Channels; +import java.nio.channels.SeekableByteChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Optional; @@ -28,34 +36,94 @@ import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; -import io.aiven.kafka.connect.s3.source.utils.ParquetUtils; import org.apache.avro.generic.GenericRecord; +import org.apache.commons.compress.utils.IOUtils; +import org.apache.parquet.avro.AvroParquetReader; +import org.apache.parquet.io.DelegatingSeekableInputStream; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.SeekableInputStream; public class ParquetWriter implements OutputWriter { - private final String bucketName; - - public ParquetWriter(final String bucketName) { - this.bucketName = bucketName; - } @Override public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); } @Override + @SuppressWarnings("PMD.ExcessiveParameterList") public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, - final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets) { - final List<GenericRecord> records = ParquetUtils.getRecords(inputStream, topic); - for (final GenericRecord record : records) { - final byte[] valueBytes = serializeAvroRecordToBytes(Collections.singletonList(record), topic, - s3SourceConfig); - if (valueBytes.length > 0) { - 
consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, this.bucketName, topic, - topicPartition, offsetManager, currentOffsets, startOffset)); + final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets, + final Map<String, Object> partitionMap) { + final List<GenericRecord> records = getRecords(inputStream, topic, topicPartition); + OutputUtils.buildConsumerRecordList(this, optionalKeyBytes, topic, consumerRecordList, s3SourceConfig, + topicPartition, startOffset, offsetManager, currentOffsets, records, partitionMap); + } + + public static List<GenericRecord> getRecords(final InputStream inputStream, final String topic, + final int topicPartition) { + final String timestamp = String.valueOf(Instant.now().toEpochMilli()); + File parquetFile; + final var records = new ArrayList<GenericRecord>(); + try { + parquetFile = File.createTempFile(topic + "_" + topicPartition + "_" + timestamp, ".parquet"); + } catch (IOException e) { + LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + return records; + } + + try (OutputStream outputStream = Files.newOutputStream(parquetFile.toPath())) { + // write to a local file + IOUtils.copy(inputStream, outputStream); + + try (SeekableByteChannel seekableByteChannel = Files.newByteChannel(parquetFile.toPath()); + var parquetReader = AvroParquetReader.<GenericRecord>builder(new InputFile() { + @Override + public long getLength() throws IOException { + return seekableByteChannel.size(); + } + + @Override + public SeekableInputStream newStream() { + return new DelegatingSeekableInputStream(Channels.newInputStream(seekableByteChannel)) { + @Override + public long getPos() throws IOException { + return seekableByteChannel.position(); + } + + @Override + public void seek(final long value) throws IOException { + seekableByteChannel.position(value); + } + }; + } + + }).withCompatibility(false).build()) { + var record = parquetReader.read(); + while (record != null) { + records.add(record); + record = parquetReader.read(); + } + } catch (IOException e) { + LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + } finally { + deleteTmpFile(parquetFile.toPath()); + } + } catch (IOException e) { + LOGGER.error("Error in reading s3 object stream for topic " + topic + " with error : " + e.getMessage()); + } + return records; + } + + private static void deleteTmpFile(final Path parquetFile) { + if (Files.exists(parquetFile)) { + try { + Files.delete(parquetFile); + } catch (IOException e) { + LOGGER.error("Error in deleting tmp file " + e.getMessage()); } } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java new file mode 100644 index 000000000..6c7d1e3d3 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java @@ -0,0 +1,39 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.utils; + +import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.PARTITION; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.TOPIC; + +import java.util.HashMap; +import java.util.Map; + +final public class ConnectUtils { + + private ConnectUtils() { + // hidden + } + static Map<String, Object> getPartitionMap(final String topicName, final int defaultPartitionId, + final String bucketName) { + final Map<String, Object> partitionMap = new HashMap<>(); + partitionMap.put(BUCKET, bucketName); + partitionMap.put(TOPIC, topicName); + partitionMap.put(PARTITION, defaultPartitionId); + return partitionMap; + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index 2b881001c..2661ed396 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -31,6 +31,7 @@ public class FileReader { + public static final int PAGE_SIZE_FACTOR = 2; private final S3SourceConfig s3SourceConfig; private final String bucketName; @@ -41,7 +42,7 @@ public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName) List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOException { final ObjectListing objectListing = s3Client.listObjects(new ListObjectsRequest().withBucketName(bucketName) - .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * 2)); + .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR)); return new ArrayList<>(objectListing.getObjectSummaries()); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index 05cc8cbdd..92a72593a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -16,9 +16,6 @@ package io.aiven.kafka.connect.s3.source.utils; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.PARTITION; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.TOPIC; import static io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator.OFFSET_KEY; import static java.util.stream.Collectors.toMap; @@ -92,11 +89,7 @@ private static List<Map<String, Object>> buildPartitionKeys(final String bucket, final Set<String> topics) { final List<Map<String, Object>> partitionKeys = new ArrayList<>(); partitions.forEach(partition -> topics.forEach(topic -> { - final Map<String, Object> partitionMap = new HashMap<>(); - partitionMap.put(BUCKET, bucket); - partitionMap.put(TOPIC, topic); - partitionMap.put(PARTITION, partition); - partitionKeys.add(partitionMap); + partitionKeys.add(ConnectUtils.getPartitionMap(topic, partition, bucket)); })); return partitionKeys; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java deleted file 
mode 100644 index d01276409..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ParquetUtils.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright 2021 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.s3.source.utils; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.nio.channels.Channels; -import java.nio.channels.SeekableByteChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.time.Instant; -import java.util.ArrayList; -import java.util.List; - -import io.aiven.kafka.connect.s3.source.output.AvroWriter; - -import org.apache.avro.generic.GenericRecord; -import org.apache.parquet.avro.AvroParquetReader; -import org.apache.parquet.io.DelegatingSeekableInputStream; -import org.apache.parquet.io.InputFile; -import org.apache.parquet.io.SeekableInputStream; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public final class ParquetUtils { - - private static final Logger LOGGER = LoggerFactory.getLogger(AvroWriter.class); - - public static final String TMP_DIR = "/tmp"; - public static final int BUFFER_SIZE = 8192; - - private ParquetUtils() { - /* hide constructor */ } - - public static List<GenericRecord> getRecords(final InputStream inputStream, final String topic) { - final Path tmpDir = Paths.get(TMP_DIR); - - final String timestamp = String.valueOf(Instant.now().toEpochMilli()); - final Path parquetFile = tmpDir.resolve(topic + "_" + timestamp + ".parquet"); - - // Write the byte array to a file - try (OutputStream outputStream = Files.newOutputStream(parquetFile)) { - final byte[] buffer = new byte[BUFFER_SIZE]; - - int bytesRead = inputStream.read(buffer); - while (bytesRead != -1) { - outputStream.write(buffer, 0, bytesRead); // Write buffer to file - bytesRead = inputStream.read(buffer); - } - } catch (IOException e) { - LOGGER.error("Error in reading s3 object stream for topic " + topic + " with error : " + e.getMessage()); - } - final var records = new ArrayList<GenericRecord>(); - try (SeekableByteChannel seekableByteChannel = Files.newByteChannel(parquetFile); - var parquetReader = AvroParquetReader.<GenericRecord>builder(new InputFile() { - @Override - public long getLength() throws IOException { - return seekableByteChannel.size(); - } - - @Override - public SeekableInputStream newStream() { - return new DelegatingSeekableInputStream(Channels.newInputStream(seekableByteChannel)) { - @Override - public long getPos() throws IOException { - return seekableByteChannel.position(); - } - - @Override - public void seek(final long value) throws IOException { - seekableByteChannel.position(value); - } - }; - } - - }).withCompatibility(false).build()) { - var record = parquetReader.read(); - while (record != null) { - records.add(record); - record = parquetReader.read(); - } - } catch (IOException e) { - LOGGER.error("Error in reading s3 object stream " + e.getMessage()); - } - 
- deleteTmpFile(parquetFile); - - return records; - } - - private static void deleteTmpFile(final Path parquetFile) { - if (Files.exists(parquetFile)) { - try { - Files.delete(parquetFile); - } catch (IOException e) { - LOGGER.error("Error in deleting tmp file " + e.getMessage()); - } - } - } - -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index dd2902802..1396e8b1c 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -116,24 +116,29 @@ private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentF } final String finalTopic = topicName; + final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(topicName, defaultPartitionId, + bucketName); - return getObjectIterator(inputStream, finalTopic, defaultPartitionId, defaultStartOffsetId, outputWriter); + return getObjectIterator(inputStream, finalTopic, defaultPartitionId, defaultStartOffsetId, outputWriter, + partitionMap); } } @SuppressWarnings("PMD.CognitiveComplexity") private Iterator<List<ConsumerRecord<byte[], byte[]>>> getObjectIterator(final InputStream valueInputStream, - final String topic, final int topicPartition, final long startOffset, final OutputWriter outputWriter) { + final String topic, final int topicPartition, final long startOffset, final OutputWriter outputWriter, + final Map<String, Object> partitionMap) { return new Iterator<>() { private Map<Map<String, Object>, Long> currentOffsets = new HashMap<>(); private List<ConsumerRecord<byte[], byte[]>> nextRecord = readNext(); private List<ConsumerRecord<byte[], byte[]>> readNext() { + final Optional<byte[]> optionalKeyBytes = Optional.ofNullable(currentObjectKey) .map(k -> k.getBytes(StandardCharsets.UTF_8)); final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = new ArrayList<>(); outputWriter.handleValueData(optionalKeyBytes, valueInputStream, topic, consumerRecordList, - s3SourceConfig, topicPartition, startOffset, offsetManager, currentOffsets); + s3SourceConfig, topicPartition, startOffset, offsetManager, currentOffsets, partitionMap); return consumerRecordList; } From daf9bdb739c98025c9664876d393001e1d661e61 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Tue, 8 Oct 2024 12:35:55 +0200 Subject: [PATCH 30/90] From review, refactor, improve --- s3-source-connector/build.gradle.kts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts index 73c952f42..3498d2370 100644 --- a/s3-source-connector/build.gradle.kts +++ b/s3-source-connector/build.gradle.kts @@ -20,7 +20,7 @@ plugins { id("aiven-apache-kafka-connectors-all.java-conventions") } val amazonS3Version by extra("1.12.729") val amazonSTSVersion by extra("1.12.729") -val parquetVersion by extra("1.11.2") +val parquetVersion by extra("1.14.3") val integrationTest: SourceSet = sourceSets.create("integrationTest") { From 00ca2939951129c31f10da866e403ddfe6240742 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Tue, 8 Oct 2024 16:31:59 +0200 Subject: [PATCH 31/90] Refactor based on review --- .../connect/s3/source/IntegrationTest.java | 16 +++--- .../connect/s3/source/output/AvroWriter.java | 4 +- 
.../s3/source/output/ByteArrayWriter.java | 4 +- .../connect/s3/source/output/JsonWriter.java | 4 +- .../s3/source/output/OutputFormat.java | 13 ++--- .../connect/s3/source/output/OutputUtils.java | 31 +++++++++--- .../s3/source/output/OutputWriter.java | 17 ------- .../s3/source/output/OutputWriterFactory.java | 4 +- .../s3/source/output/ParquetWriter.java | 4 +- .../connect/s3/source/utils/FileReader.java | 49 +++++++++++++++++-- 10 files changed, 91 insertions(+), 55 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index cecbfdbd2..d4021f789 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -63,7 +63,6 @@ import org.apache.parquet.avro.AvroParquetWriter; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.metadata.CompressionCodecName; -import org.junit.Ignore; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; @@ -76,7 +75,6 @@ import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; -@Ignore @Testcontainers @SuppressWarnings("PMD.ExcessiveImports") final class IntegrationTest implements IntegrationBase { @@ -160,9 +158,10 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx // write 2 objects to s3 writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00001"); writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00002"); + writeToS3(topicName, new byte[0], "00003"); // this should be ignored. 
final List<String> objects = testBucketAccessor.listObjects(); - assertThat(objects.size()).isEqualTo(2); + assertThat(objects.size()).isEqualTo(3); // Verify that the connector is correctly set up assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); @@ -302,11 +301,14 @@ private static void writeToS3(final String topicName, final byte[] testDataBytes throws IOException { final String filePrefix = topicName + "-" + partitionId; final String fileSuffix = ".txt"; - final Path testFilePath = File.createTempFile(filePrefix, fileSuffix).toPath(); - Files.write(testFilePath, testDataBytes); - saveToS3(TEST_BUCKET_NAME, "", filePrefix + fileSuffix, testFilePath.toFile()); - Files.delete(testFilePath); + final Path testFilePath = File.createTempFile(filePrefix, fileSuffix).toPath(); + try { + Files.write(testFilePath, testDataBytes); + saveToS3(TEST_BUCKET_NAME, "", filePrefix + fileSuffix, testFilePath.toFile()); + } finally { + Files.delete(testFilePath); + } } private Map<String, String> getConfig(final Map<String, String> config, final String topics) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java index 43b64ef7e..9117dd184 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java @@ -59,8 +59,8 @@ public void handleValueData(final Optional<byte[]> optionalKeyBytes, final Input final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); DecoderFactory.get().binaryDecoder(inputStream, null); final List<GenericRecord> records = readAvroRecords(inputStream, datumReader); - OutputUtils.buildConsumerRecordList(this, optionalKeyBytes, topic, consumerRecordList, s3SourceConfig, - topicPartition, startOffset, offsetManager, currentOffsets, records, partitionMap); + OutputUtils.buildConsumerRecordList(optionalKeyBytes, topic, consumerRecordList, s3SourceConfig, topicPartition, + startOffset, offsetManager, currentOffsets, records, partitionMap); } private List<GenericRecord> readAvroRecords(final InputStream content, diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java index 269dd061f..0120a3771 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java @@ -47,8 +47,8 @@ public void handleValueData(final Optional<byte[]> optionalKeyBytes, final Input final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets, final Map<String, Object> partitionMap) { try { - consumerRecordList.add(getConsumerRecord(optionalKeyBytes, IOUtils.toByteArray(inputStream), topic, - topicPartition, offsetManager, currentOffsets, startOffset, partitionMap)); + consumerRecordList.add(OutputUtils.getConsumerRecord(optionalKeyBytes, IOUtils.toByteArray(inputStream), + topic, topicPartition, offsetManager, currentOffsets, startOffset, partitionMap)); } catch (IOException e) { LOGGER.error("Error in reading s3 object stream " + e.getMessage()); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java 
b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java index 5c8b0a382..cbaa7518c 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java @@ -50,8 +50,8 @@ public void handleValueData(final Optional<byte[]> optionalKeyBytes, final Input final Map<String, Object> partitionMap) { final byte[] valueBytes = serializeJsonData(inputStream); if (valueBytes.length > 0) { - consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, topic, topicPartition, offsetManager, - currentOffsets, startOffset, partitionMap)); + consumerRecordList.add(OutputUtils.getConsumerRecord(optionalKeyBytes, valueBytes, topic, topicPartition, + offsetManager, currentOffsets, startOffset, partitionMap)); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java index 3b734eee8..7d29af4fd 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java @@ -16,6 +16,8 @@ package io.aiven.kafka.connect.s3.source.output; +import java.util.Locale; + public enum OutputFormat { AVRO("avro"), PARQUET("parquet"), JSON("json"), BYTES("bytes"); @@ -26,16 +28,7 @@ public enum OutputFormat { } public String getFormat() { - return format; - } - - public static OutputFormat valueOfFormat(final String outFormat) { - for (final OutputFormat outputFormat : values()) { - if (outputFormat.format.equalsIgnoreCase(outFormat)) { - return outputFormat; - } - } - throw new IllegalArgumentException("Unknown outFormat: " + outFormat); + return format.toLowerCase(Locale.ROOT); } @Override diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java index 5ee31e387..c185935dc 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java @@ -65,18 +65,35 @@ static byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, } @SuppressWarnings("PMD.ExcessiveParameterList") - static void buildConsumerRecordList(final OutputWriter outputWriter, final Optional<byte[]> optionalKeyBytes, - final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, - final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, - final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets, - final List<GenericRecord> records, final Map<String, Object> partitionMap) { + static void buildConsumerRecordList(final Optional<byte[]> optionalKeyBytes, final String topic, + final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, final S3SourceConfig s3SourceConfig, + final int topicPartition, final long startOffset, final OffsetManager offsetManager, + final Map<Map<String, Object>, Long> currentOffsets, final List<GenericRecord> records, + final Map<String, Object> partitionMap) { for (final GenericRecord record : records) { final byte[] valueBytes = OutputUtils.serializeAvroRecordToBytes(Collections.singletonList(record), topic, s3SourceConfig); if 
(valueBytes.length > 0) { - consumerRecordList.add(outputWriter.getConsumerRecord(optionalKeyBytes, valueBytes, topic, - topicPartition, offsetManager, currentOffsets, startOffset, partitionMap)); + consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, topic, topicPartition, + offsetManager, currentOffsets, startOffset, partitionMap)); } } } + + static ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> key, final byte[] value, + final String topic, final int topicPartition, final OffsetManager offsetManager, + final Map<Map<String, Object>, Long> currentOffsets, final long startOffset, + final Map<String, Object> partitionMap) { + + long currentOffset; + + if (offsetManager.getOffsets().containsKey(partitionMap)) { + currentOffset = offsetManager.incrementAndUpdateOffsetMap(partitionMap); + } else { + currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); + currentOffsets.put(partitionMap, currentOffset + 1); + } + + return new ConsumerRecord<>(topic, topicPartition, currentOffset, key.orElse(null), value); + } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java index 210f81b4d..68c20d537 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java @@ -40,21 +40,4 @@ void handleValueData(Optional<byte[]> optionalKeyBytes, InputStream inputStream, List<ConsumerRecord<byte[], byte[]>> consumerRecordList, S3SourceConfig s3SourceConfig, int topicPartition, long startOffset, OffsetManager offsetManager, Map<Map<String, Object>, Long> currentOffsets, Map<String, Object> partitionMap); - - default ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> key, final byte[] value, - final String topic, final int topicPartition, final OffsetManager offsetManager, - final Map<Map<String, Object>, Long> currentOffsets, final long startOffset, - final Map<String, Object> partitionMap) { - - long currentOffset; - - if (offsetManager.getOffsets().containsKey(partitionMap)) { - currentOffset = offsetManager.incrementAndUpdateOffsetMap(partitionMap); - } else { - currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); - currentOffsets.put(partitionMap, currentOffset + 1); - } - - return new ConsumerRecord<>(topic, topicPartition, currentOffset, key.orElse(null), value); - } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java index b7f988cea..b6c0aedad 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java @@ -16,13 +16,15 @@ package io.aiven.kafka.connect.s3.source.output; +import java.util.Locale; + public final class OutputWriterFactory { private OutputWriterFactory() { // hidden } public static OutputWriter getWriter(final String outputFormat) { - final OutputFormat outputFormatEnum = OutputFormat.valueOfFormat(outputFormat); + final OutputFormat outputFormatEnum = OutputFormat.valueOf(outputFormat.toUpperCase(Locale.ROOT)); switch (outputFormatEnum) { case AVRO : return new AvroWriter(); diff --git 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java index 7a3070611..8c2db5f20 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java @@ -59,8 +59,8 @@ public void handleValueData(final Optional<byte[]> optionalKeyBytes, final Input final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets, final Map<String, Object> partitionMap) { final List<GenericRecord> records = getRecords(inputStream, topic, topicPartition); - OutputUtils.buildConsumerRecordList(this, optionalKeyBytes, topic, consumerRecordList, s3SourceConfig, - topicPartition, startOffset, offsetManager, currentOffsets, records, partitionMap); + OutputUtils.buildConsumerRecordList(optionalKeyBytes, topic, consumerRecordList, s3SourceConfig, topicPartition, + startOffset, offsetManager, currentOffsets, records, partitionMap); } public static List<GenericRecord> getRecords(final InputStream inputStream, final String topic, diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index 2661ed396..52846963a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -21,12 +21,13 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.model.ListObjectsRequest; -import com.amazonaws.services.s3.model.ObjectListing; +import com.amazonaws.services.s3.model.ListObjectsV2Request; +import com.amazonaws.services.s3.model.ListObjectsV2Result; import com.amazonaws.services.s3.model.S3ObjectSummary; public class FileReader { @@ -40,11 +41,49 @@ public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName) this.bucketName = bucketName; } + List<S3ObjectSummary> fetchObjectSummaries1(final AmazonS3 s3Client) throws IOException { + // final ObjectListing objectListing = s3Client.listObjects(new ListObjectsRequest().withBucketName(bucketName) + // .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR)); + + final ListObjectsV2Result objectListing = s3Client + .listObjectsV2(new ListObjectsV2Request().withBucketName(bucketName) + .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR)); + + // filtering zero byte objects + return objectListing.getObjectSummaries() + .stream() + .filter(objectSummary -> objectSummary.getSize() > 0) + .collect(Collectors.toList()); + } + + @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOException { - final ObjectListing objectListing = s3Client.listObjects(new ListObjectsRequest().withBucketName(bucketName) - .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR)); + final List<S3ObjectSummary> allSummaries = new ArrayList<>(); + String continuationToken = null; + ListObjectsV2Result objectListing; + + do { + // Create the request for listing objects + final ListObjectsV2Request request 
= new ListObjectsV2Request().withBucketName(bucketName) + .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR) + .withContinuationToken(continuationToken); // Set continuation token for pagination + + // List objects from S3 + objectListing = s3Client.listObjectsV2(request); + + // Filter out zero-byte objects and add to the list + final List<S3ObjectSummary> filteredSummaries = objectListing.getObjectSummaries() + .stream() + .filter(objectSummary -> objectSummary.getSize() > 0) + .collect(Collectors.toList()); + + allSummaries.addAll(filteredSummaries); // Add the filtered summaries to the main list + + // Check if there are more objects to fetch + continuationToken = objectListing.getNextContinuationToken(); + } while (objectListing.isTruncated()); // Continue fetching if the result is truncated - return new ArrayList<>(objectListing.getObjectSummaries()); + return allSummaries; } } From 25e520845de21a0a4d0de98e8f0bfc606e80abbf Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Wed, 9 Oct 2024 10:30:20 +0200 Subject: [PATCH 32/90] Adding unit tests for sourcetask --- s3-source-connector/build.gradle.kts | 2 + .../connect/s3/source/IntegrationTest.java | 8 +- .../kafka/connect/s3/source/S3SourceTask.java | 21 +++ .../s3/source/config/S3SourceConfig.java | 2 +- .../s3/source/output/OutputFormat.java | 2 +- .../connect/s3/source/utils/FileReader.java | 15 -- .../s3/source/utils/SourceRecordIterator.java | 4 +- .../connect/s3/source/S3SourceTaskTest.java | 156 ++++++++++++++++++ 8 files changed, 187 insertions(+), 23 deletions(-) create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts index 3498d2370..27a8d92f9 100644 --- a/s3-source-connector/build.gradle.kts +++ b/s3-source-connector/build.gradle.kts @@ -20,6 +20,7 @@ plugins { id("aiven-apache-kafka-connectors-all.java-conventions") } val amazonS3Version by extra("1.12.729") val amazonSTSVersion by extra("1.12.729") +val s3mockVersion by extra("0.2.6") val parquetVersion by extra("1.14.3") val integrationTest: SourceSet = @@ -92,6 +93,7 @@ dependencies { testImplementation(testinglibs.assertj.core) testImplementation(testinglibs.mockito.core) + testImplementation("io.findify:s3mock_2.11:$s3mockVersion") testRuntimeOnly(testinglibs.junit.jupiter.engine) testImplementation(testinglibs.mockito.junit.jupiter) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index d4021f789..46004bf1f 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -150,7 +150,7 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectRunner.createConnector(connectorConfig); - connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.BYTES.getFormat()); + connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.BYTES.getValue()); final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; final String testData2 = "Hello, Kafka Connect S3 Source! 
object 2"; @@ -191,7 +191,7 @@ void multiPartUploadBytesTest(final TestInfo testInfo) throws ExecutionException void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.AVRO.getFormat()); + connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.AVRO.getValue()); connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); connectorConfig.put("value.converter.schema.registry.url", SCHEMA_REGISTRY.getSchemaRegistryUrl()); @@ -233,7 +233,7 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc void parquetTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.PARQUET.getFormat()); + connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.PARQUET.getValue()); connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); connectorConfig.put("value.converter.schema.registry.url", SCHEMA_REGISTRY.getSchemaRegistryUrl()); @@ -264,7 +264,7 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted void jsonTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.JSON.getFormat()); + connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.JSON.getValue()); connectorConfig.put("value.converter", "org.apache.kafka.connect.json.JsonConverter"); connectRunner.createConnector(connectorConfig); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 653a910b5..f25de0f56 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -68,12 +68,15 @@ public class S3SourceTask extends SourceTask { private Iterator<List<AivenS3SourceRecord>> sourceRecordIterator; private Optional<Converter> keyConverter; + private Converter valueConverter; private OutputWriter outputWriter; private String s3Bucket; + private boolean taskInitialized; + private final AtomicBoolean connectorStopped = new AtomicBoolean(); private final S3ClientFactory s3ClientFactory = new S3ClientFactory(); @@ -96,6 +99,7 @@ public void start(final Map<String, String> props) { this.s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); this.outputWriter = OutputWriterFactory.getWriter(s3SourceConfig.getString(OUTPUT_FORMAT_KEY)); prepareReaderFromOffsetStorageReader(); + this.taskInitialized = true; } private void initializeConverters() { @@ -171,4 +175,21 @@ private void handleS3Exception(final AmazonS3Exception amazonS3Exception) throws public void stop() { 
this.connectorStopped.set(true); } + + // below for visibility in tests + public Optional<Converter> getKeyConverter() { + return keyConverter; + } + + public Converter getValueConverter() { + return valueConverter; + } + + public OutputWriter getOutputWriter() { + return outputWriter; + } + + public boolean isTaskInitialized() { + return taskInitialized; + } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index ff12cc29a..6d462e822 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -143,7 +143,7 @@ private static void addSchemaRegistryGroup(final ConfigDef configDef) { configDef.define(SCHEMA_REGISTRY_URL, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", GROUP_OTHER, srCounter++, ConfigDef.Width.NONE, SCHEMA_REGISTRY_URL); - configDef.define(OUTPUT_FORMAT_KEY, ConfigDef.Type.STRING, OutputFormat.BYTES.getFormat(), + configDef.define(OUTPUT_FORMAT_KEY, ConfigDef.Type.STRING, OutputFormat.BYTES.getValue(), new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "Output format avro/json/parquet/bytes", GROUP_OTHER, srCounter++, // NOPMD ConfigDef.Width.NONE, OUTPUT_FORMAT_KEY); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java index 7d29af4fd..16bca89a4 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java @@ -27,7 +27,7 @@ public enum OutputFormat { this.format = format; } - public String getFormat() { + public String getValue() { return format.toLowerCase(Locale.ROOT); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index 52846963a..0269f084a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -41,21 +41,6 @@ public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName) this.bucketName = bucketName; } - List<S3ObjectSummary> fetchObjectSummaries1(final AmazonS3 s3Client) throws IOException { - // final ObjectListing objectListing = s3Client.listObjects(new ListObjectsRequest().withBucketName(bucketName) - // .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR)); - - final ListObjectsV2Result objectListing = s3Client - .listObjectsV2(new ListObjectsV2Request().withBucketName(bucketName) - .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR)); - - // filtering zero byte objects - return objectListing.getObjectSummaries() - .stream() - .filter(objectSummary -> objectSummary.getSize() > 0) - .collect(Collectors.toList()); - } - @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOException { final List<S3ObjectSummary> allSummaries = new ArrayList<>(); diff --git 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 1396e8b1c..c734cd845 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -197,12 +197,12 @@ public List<AivenS3SourceRecord> next() { aivenS3SourceRecordList.add(aivenS3SourceRecord); } - return aivenS3SourceRecordList; + return Collections.unmodifiableList(aivenS3SourceRecordList); } @Override public void remove() { - throw new UnsupportedOperationException(); + throw new UnsupportedOperationException("This iterator is unmodifiable"); } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java new file mode 100644 index 000000000..fc20a7e2c --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -0,0 +1,156 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; +import java.util.Random; + +import org.apache.kafka.connect.converters.ByteArrayConverter; +import org.apache.kafka.connect.source.SourceTaskContext; +import org.apache.kafka.connect.storage.Converter; +import org.apache.kafka.connect.storage.OffsetStorageReader; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.output.ByteArrayWriter; +import io.aiven.kafka.connect.s3.source.output.OutputFormat; +import io.aiven.kafka.connect.s3.source.output.OutputWriter; +import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; + +import com.amazonaws.auth.AWSStaticCredentialsProvider; +import com.amazonaws.auth.BasicAWSCredentials; +import com.amazonaws.client.builder.AwsClientBuilder; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.AmazonS3ClientBuilder; +import io.findify.s3mock.S3Mock; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; + +final class S3SourceTaskTest { + + private static final Random RANDOM = new Random(); + private Map<String, String> properties; + + private static BucketAccessor testBucketAccessor; + private static final String TEST_BUCKET = "test-bucket"; + + private static S3Mock s3Api; + private static AmazonS3 s3Client; + + private static Map<String, String> commonProperties; + + @Mock + private SourceTaskContext mockedSourceTaskContext; + + @Mock + private OffsetStorageReader mockedOffsetStorageReader; + + @BeforeAll + public static void setUpClass() { + final int s3Port = RANDOM.nextInt(10_000) + 10_000; + + s3Api = new S3Mock.Builder().withPort(s3Port).withInMemoryBackend().build(); + s3Api.start(); + + commonProperties = Map.of(S3SourceConfig.AWS_ACCESS_KEY_ID_CONFIG, "test_key_id", + S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG, "test_secret_key", + S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET, S3SourceConfig.AWS_S3_ENDPOINT_CONFIG, + "http://localhost:" + s3Port, S3SourceConfig.AWS_S3_REGION_CONFIG, "us-west-2"); + + final AmazonS3ClientBuilder builder = AmazonS3ClientBuilder.standard(); + final BasicAWSCredentials awsCreds = new BasicAWSCredentials( + commonProperties.get(S3SourceConfig.AWS_ACCESS_KEY_ID_CONFIG), + commonProperties.get(S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG)); + builder.withCredentials(new AWSStaticCredentialsProvider(awsCreds)); + builder.withEndpointConfiguration( + new AwsClientBuilder.EndpointConfiguration(commonProperties.get(S3SourceConfig.AWS_S3_ENDPOINT_CONFIG), + commonProperties.get(S3SourceConfig.AWS_S3_REGION_CONFIG))); + builder.withPathStyleAccessEnabled(true); + + s3Client = builder.build(); + + testBucketAccessor = new BucketAccessor(s3Client, TEST_BUCKET); + testBucketAccessor.createBucket(); + } + + @AfterAll + public static void tearDownClass() { + s3Api.stop(); + } + + @BeforeEach + public void setUp() { + properties = new HashMap<>(commonProperties); + s3Client.createBucket(TEST_BUCKET); + 
mockedSourceTaskContext = mock(SourceTaskContext.class); + mockedOffsetStorageReader = mock(OffsetStorageReader.class); + } + + @AfterEach + public void tearDown() { + s3Client.deleteBucket(TEST_BUCKET); + } + + @Test + void testS3SourceTaskInitialization() { + final S3SourceTask s3SourceTask = new S3SourceTask(); + startSourceTask(s3SourceTask); + + final Optional<Converter> keyConverter = s3SourceTask.getKeyConverter(); + assertThat(keyConverter).isPresent(); + assertThat(keyConverter.get()).isInstanceOf(ByteArrayConverter.class); + + final Converter valueConverter = s3SourceTask.getValueConverter(); + assertThat(valueConverter).isInstanceOf(ByteArrayConverter.class); + + final OutputWriter outputWriter = s3SourceTask.getOutputWriter(); + assertThat(outputWriter).isInstanceOf(ByteArrayWriter.class); + + final boolean taskInitialized = s3SourceTask.isTaskInitialized(); + assertThat(taskInitialized).isTrue(); + } + + private void startSourceTask(final S3SourceTask s3SourceTask) { + s3SourceTask.initialize(mockedSourceTaskContext); + when(mockedSourceTaskContext.offsetStorageReader()).thenReturn(mockedOffsetStorageReader); + + setBasicProperties(); + s3SourceTask.start(properties); + } + + private void setBasicProperties() { + properties.put(S3SourceConfig.OUTPUT_FORMAT_KEY, OutputFormat.BYTES.getValue()); + properties.put("name", "test_source_connector"); + properties.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + properties.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + properties.put("tasks.max", "1"); + properties.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); + properties.put(TARGET_TOPIC_PARTITIONS, "0,1"); + properties.put(TARGET_TOPICS, "testtopic"); + } +} From 6d307ff0e0ebd89d9c1459f0ab2a4534b743ce79 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Wed, 9 Oct 2024 12:53:15 +0200 Subject: [PATCH 33/90] Add update unit tests --- .../kafka/connect/s3/source/S3SourceTask.java | 8 ++- .../s3/source/config/S3SourceConfig.java | 21 ++++++ .../s3/source/output/OutputWriterFactory.java | 11 +-- .../connect/s3/source/S3SourceTaskTest.java | 55 +++++++++++++++ .../s3/source/config/S3SourceConfigTest.java | 70 +++++++++++++++++++ 5 files changed, 159 insertions(+), 6 deletions(-) create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index f25de0f56..8eb75ff8c 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -18,7 +18,6 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT_KEY; import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; @@ -97,7 +96,7 @@ public void start(final Map<String, String> props) { initializeConverters(); initializeS3Client(); this.s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); - this.outputWriter = OutputWriterFactory.getWriter(s3SourceConfig.getString(OUTPUT_FORMAT_KEY)); + this.outputWriter = 
OutputWriterFactory.getWriter(s3SourceConfig); prepareReaderFromOffsetStorageReader(); this.taskInitialized = true; } @@ -173,6 +172,7 @@ private void handleS3Exception(final AmazonS3Exception amazonS3Exception) throws @Override public void stop() { + this.taskInitialized = false; this.connectorStopped.set(true); } @@ -192,4 +192,8 @@ public OutputWriter getOutputWriter() { public boolean isTaskInitialized() { return taskInitialized; } + + public AtomicBoolean getConnectorStopped() { + return new AtomicBoolean(connectorStopped.get()); + } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 6d462e822..4d1e8d853 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -19,6 +19,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.regex.Pattern; @@ -330,6 +331,14 @@ public int getS3RetryBackoffMaxRetries() { return getInt(AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG); } + public String getAwsS3BucketName() { + return getString(AWS_S3_BUCKET_NAME_CONFIG); + } + + public OutputFormat getOutputFormat() { + return OutputFormat.valueOf(getString(OUTPUT_FORMAT_KEY).toUpperCase(Locale.ROOT)); + } + Region getAwsS3Region() { // we have priority of properties if old one not set or both old and new one set // the new property value will be selected @@ -368,4 +377,16 @@ AwsAccessSecret getAwsCredentials() { AWSCredentialsProvider getCustomCredentialsProvider() { return getConfiguredInstance(AWS_CREDENTIALS_PROVIDER_CONFIG, AWSCredentialsProvider.class); } + + String getTargetTopics() { + return getString(TARGET_TOPICS); + } + + String getTargetTopicPartitions() { + return getString(TARGET_TOPIC_PARTITIONS); + } + + String getSchemaRegistryUrl() { + return getString(SCHEMA_REGISTRY_URL); + } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java index b6c0aedad..c3f7d3b3a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java @@ -16,15 +16,17 @@ package io.aiven.kafka.connect.s3.source.output; -import java.util.Locale; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT_KEY; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; public final class OutputWriterFactory { private OutputWriterFactory() { // hidden } - public static OutputWriter getWriter(final String outputFormat) { - final OutputFormat outputFormatEnum = OutputFormat.valueOf(outputFormat.toUpperCase(Locale.ROOT)); + public static OutputWriter getWriter(final S3SourceConfig s3SourceConfig) { + final OutputFormat outputFormatEnum = s3SourceConfig.getOutputFormat(); switch (outputFormatEnum) { case AVRO : return new AvroWriter(); @@ -35,7 +37,8 @@ public static OutputWriter getWriter(final String outputFormat) { case BYTES : return new ByteArrayWriter(); default : - throw new IllegalArgumentException("Unknown output format: " + outputFormat); + throw new 
IllegalArgumentException( + "Unknown output format " + s3SourceConfig.getString(OUTPUT_FORMAT_KEY)); } } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index fc20a7e2c..5d6b42efa 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -22,12 +22,16 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +import java.lang.reflect.Field; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Random; import org.apache.kafka.connect.converters.ByteArrayConverter; +import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.source.SourceTaskContext; import org.apache.kafka.connect.storage.Converter; import org.apache.kafka.connect.storage.OffsetStorageReader; @@ -37,6 +41,8 @@ import io.aiven.kafka.connect.s3.source.output.OutputFormat; import io.aiven.kafka.connect.s3.source.output.OutputWriter; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; +import io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord; +import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; import com.amazonaws.auth.AWSStaticCredentialsProvider; import com.amazonaws.auth.BasicAWSCredentials; @@ -135,6 +141,55 @@ void testS3SourceTaskInitialization() { assertThat(taskInitialized).isTrue(); } + @Test + void testPoll() throws Exception { + final S3SourceTask s3SourceTask = new S3SourceTask(); + startSourceTask(s3SourceTask); + + SourceRecordIterator mockSourceRecordIterator; + + mockSourceRecordIterator = mock(SourceRecordIterator.class); + setPrivateField(s3SourceTask, "sourceRecordIterator", mockSourceRecordIterator); + when(mockSourceRecordIterator.hasNext()).thenReturn(true).thenReturn(true).thenReturn(false); + + final List<AivenS3SourceRecord> aivenS3SourceRecordList = getAivenS3SourceRecords(); + when(mockSourceRecordIterator.next()).thenReturn(aivenS3SourceRecordList); + + final List<SourceRecord> sourceRecordList = s3SourceTask.poll(); + assertThat(sourceRecordList).hasSize(2); + } + + @Test + void testStop() { + final S3SourceTask s3SourceTask = new S3SourceTask(); + startSourceTask(s3SourceTask); + s3SourceTask.stop(); + + final boolean taskInitialized = s3SourceTask.isTaskInitialized(); + assertThat(taskInitialized).isFalse(); + assertThat(s3SourceTask.getConnectorStopped()).isTrue(); + } + + private static List<AivenS3SourceRecord> getAivenS3SourceRecords() { + final List<AivenS3SourceRecord> aivenS3SourceRecordList = new ArrayList<>(); + final AivenS3SourceRecord aivenS3SourceRecord1 = new AivenS3SourceRecord(new HashMap<>(), new HashMap<>(), + "testtopic", 0, new byte[0], new byte[0]); + aivenS3SourceRecordList.add(aivenS3SourceRecord1); + final AivenS3SourceRecord aivenS3SourceRecord2 = new AivenS3SourceRecord(new HashMap<>(), new HashMap<>(), + "testtopic", 1, new byte[0], new byte[0]); + aivenS3SourceRecordList.add(aivenS3SourceRecord2); + return aivenS3SourceRecordList; + } + + @SuppressWarnings("PMD.AvoidAccessibilityAlteration") + private void setPrivateField(final Object object, final String fieldName, final Object value) + throws NoSuchFieldException, IllegalAccessException { + Field field; + field = 
object.getClass().getDeclaredField(fieldName); + field.setAccessible(true); + field.set(object, value); + } + private void startSourceTask(final S3SourceTask s3SourceTask) { s3SourceTask.initialize(mockedSourceTaskContext); when(mockedSourceTaskContext.offsetStorageReader()).thenReturn(mockedOffsetStorageReader); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java new file mode 100644 index 000000000..e27604ce9 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java @@ -0,0 +1,70 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.config; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.HashMap; + +import io.aiven.kafka.connect.s3.source.output.OutputFormat; + +import com.amazonaws.regions.RegionUtils; +import com.amazonaws.regions.Regions; +import org.junit.jupiter.api.Test; + +final class S3SourceConfigTest { + @Test + void correctFullConfig() { + final var props = new HashMap<String, String>(); + + // aws props + props.put(S3SourceConfig.AWS_ACCESS_KEY_ID_CONFIG, "AWS_ACCESS_KEY_ID"); + props.put(S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG, "AWS_SECRET_ACCESS_KEY"); + props.put(S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG, "the-bucket"); + props.put(S3SourceConfig.AWS_S3_ENDPOINT_CONFIG, "AWS_S3_ENDPOINT"); + props.put(S3SourceConfig.AWS_S3_PREFIX_CONFIG, "AWS_S3_PREFIX"); + props.put(S3SourceConfig.AWS_S3_REGION_CONFIG, Regions.US_EAST_1.getName()); + + // record, topic specific props + props.put(S3SourceConfig.OUTPUT_FORMAT_KEY, OutputFormat.AVRO.getValue()); + props.put(TARGET_TOPIC_PARTITIONS, "0,1"); + props.put(TARGET_TOPICS, "testtopic"); + props.put(SCHEMA_REGISTRY_URL, "localhost:8081"); + + final var conf = new S3SourceConfig(props); + final var awsCredentials = conf.getAwsCredentials(); + + assertThat(awsCredentials.getAccessKeyId().value()).isEqualTo("AWS_ACCESS_KEY_ID"); + assertThat(awsCredentials.getSecretAccessKey().value()).isEqualTo("AWS_SECRET_ACCESS_KEY"); + assertThat(conf.getAwsS3BucketName()).isEqualTo("the-bucket"); + assertThat(conf.getAwsS3EndPoint()).isEqualTo("AWS_S3_ENDPOINT"); + assertThat(conf.getAwsS3Region()).isEqualTo(RegionUtils.getRegion("us-east-1")); + + assertThat(conf.getOutputFormat()).isEqualTo(OutputFormat.AVRO); + assertThat(conf.getTargetTopics()).isEqualTo("testtopic"); + assertThat(conf.getTargetTopicPartitions()).isEqualTo("0,1"); + assertThat(conf.getSchemaRegistryUrl()).isEqualTo("localhost:8081"); + + 
assertThat(conf.getS3RetryBackoffDelayMs()).isEqualTo(S3SourceConfig.AWS_S3_RETRY_BACKOFF_DELAY_MS_DEFAULT); + assertThat(conf.getS3RetryBackoffMaxDelayMs()) + .isEqualTo(S3SourceConfig.AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_DEFAULT); + assertThat(conf.getS3RetryBackoffMaxRetries()).isEqualTo(S3SourceConfig.S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT); + } +} From 7c6743b503b2a17c524f71fdad5470e82a97bae9 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Wed, 9 Oct 2024 13:49:23 +0200 Subject: [PATCH 34/90] Add tests for FileReader, OffsetMgr --- .../s3/source/utils/FileReaderTest.java | 140 +++++++++++++++++ .../s3/source/utils/OffsetManagerTest.java | 144 ++++++++++++++++++ 2 files changed, 284 insertions(+) create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java new file mode 100644 index 000000000..360dfc5a6 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java @@ -0,0 +1,140 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source.utils; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import io.aiven.kafka.connect.s3.source.AivenKafkaConnectS3SourceConnector; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.output.OutputFormat; + +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.ListObjectsV2Request; +import com.amazonaws.services.s3.model.ListObjectsV2Result; +import com.amazonaws.services.s3.model.S3ObjectSummary; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; + +class FileReaderTest { + + private static final String TEST_BUCKET = "test-bucket"; + @Mock + private AmazonS3 s3Client; + + private FileReader fileReader; + + private Map<String, String> properties; + + @BeforeEach + public void setUp() { + properties = new HashMap<>(); + setBasicProperties(); + final S3SourceConfig s3SourceConfig = new S3SourceConfig(properties); + fileReader = new FileReader(s3SourceConfig, TEST_BUCKET); + s3Client = mock(AmazonS3.class); + } + + @Test + void testFetchObjectSummariesWithNoObjects() throws IOException { + final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result(Collections.emptyList(), null); + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); + final List<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); + assertThat(summaries.size()).isEqualTo(0); + } + + @Test + void testFetchObjectSummariesWithOneNonZeroByteObject() throws IOException { + final S3ObjectSummary objectSummary = createObjectSummary(1); + final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result( + Collections.singletonList(objectSummary), null); + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); + + final List<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); + + assertThat(summaries.size()).isEqualTo(1); + assertThat(summaries.get(0).getSize()).isEqualTo(1); + } + + @Test + void testFetchObjectSummariesWithZeroByteObject() throws IOException { + final S3ObjectSummary zeroByteObject = createObjectSummary(0); + final S3ObjectSummary nonZeroByteObject = createObjectSummary(1); + final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result( + List.of(zeroByteObject, nonZeroByteObject), null); + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); + + final List<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); + + assertThat(summaries.size()).isEqualTo(1); + assertThat(summaries.get(0).getSize()).isEqualTo(1); + } + + @Test + void testFetchObjectSummariesWithPagination() throws IOException { + final S3ObjectSummary object1 = createObjectSummary(1); + final S3ObjectSummary object2 = createObjectSummary(2); + final List<S3ObjectSummary> firstBatch = List.of(object1); + final List<S3ObjectSummary> secondBatch = List.of(object2); + + final ListObjectsV2Result firstResult = 
createListObjectsV2Result(firstBatch, "nextToken"); + final ListObjectsV2Result secondResult = createListObjectsV2Result(secondBatch, null); + + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstResult).thenReturn(secondResult); + + final List<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); + + assertThat(summaries.size()).isEqualTo(2); + } + + private ListObjectsV2Result createListObjectsV2Result(final List<S3ObjectSummary> summaries, + final String nextToken) { + final ListObjectsV2Result result = mock(ListObjectsV2Result.class); + when(result.getObjectSummaries()).thenReturn(summaries); + when(result.getNextContinuationToken()).thenReturn(nextToken); + when(result.isTruncated()).thenReturn(nextToken != null); + return result; + } + + private S3ObjectSummary createObjectSummary(final long sizeOfObject) { + final S3ObjectSummary summary = mock(S3ObjectSummary.class); + when(summary.getSize()).thenReturn(sizeOfObject); + return summary; + } + + private void setBasicProperties() { + properties.put(S3SourceConfig.OUTPUT_FORMAT_KEY, OutputFormat.BYTES.getValue()); + properties.put("name", "test_source_connector"); + properties.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + properties.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + properties.put("tasks.max", "1"); + properties.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); + properties.put(TARGET_TOPIC_PARTITIONS, "0,1"); + properties.put(TARGET_TOPICS, "testtopic"); + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java new file mode 100644 index 000000000..3bfca42ad --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java @@ -0,0 +1,144 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source.utils; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; +import static io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator.OFFSET_KEY; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.kafka.connect.source.SourceTaskContext; +import org.apache.kafka.connect.storage.OffsetStorageReader; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; + +final class OffsetManagerTest { + + private Map<String, String> properties; + private static final String TEST_BUCKET = "test-bucket"; + + @Mock + private SourceTaskContext sourceTaskContext; + + private S3SourceConfig s3SourceConfig; + + private OffsetManager offsetManager; + + @BeforeEach + public void setUp() { + properties = new HashMap<>(); + setBasicProperties(); + s3SourceConfig = new S3SourceConfig(properties); + } + + @Test + void testWithOffsets() { + sourceTaskContext = mock(SourceTaskContext.class); + final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); + when(sourceTaskContext.offsetStorageReader()).thenReturn(offsetStorageReader); + + final Map<String, Object> partitionKey = new HashMap<>(); + partitionKey.put("topic", "topic1"); + partitionKey.put("partition", 0); + partitionKey.put("bucket", TEST_BUCKET); + + final Map<String, Object> offsetValue = new HashMap<>(); + offsetValue.put(OFFSET_KEY, 5L); + final Map<Map<String, Object>, Map<String, Object>> offsets = new HashMap<>(); + offsets.put(partitionKey, offsetValue); + + when(offsetStorageReader.offsets(any())).thenReturn(offsets); + + offsetManager = new OffsetManager(sourceTaskContext, s3SourceConfig); + + final Map<Map<String, Object>, Map<String, Object>> retrievedOffsets = offsetManager.getOffsets(); + assertThat(retrievedOffsets.size()).isEqualTo(1); + assertThat(retrievedOffsets.values().iterator().next().get(OFFSET_KEY)).isEqualTo(5L); + } + + @Test + void testIncrementAndUpdateOffsetMapExistingOffset() { + sourceTaskContext = mock(SourceTaskContext.class); + final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); + when(sourceTaskContext.offsetStorageReader()).thenReturn(offsetStorageReader); + + final Map<String, Object> partitionKey = new HashMap<>(); + partitionKey.put("topic", "topic1"); + partitionKey.put("partition", 0); + + final Map<String, Object> offsetValue = new HashMap<>(); + offsetValue.put(OFFSET_KEY, 1L); + final Map<Map<String, Object>, Map<String, Object>> offsets = new HashMap<>(); + offsets.put(partitionKey, offsetValue); + + when(offsetStorageReader.offsets(any())).thenReturn(offsets); + + offsetManager = new OffsetManager(sourceTaskContext, s3SourceConfig); + final long newOffset = offsetManager.incrementAndUpdateOffsetMap(partitionKey); + + assertThat(newOffset).isEqualTo(2L); + assertThat(offsetManager.getOffsets().get(partitionKey).get(OFFSET_KEY)).isEqualTo(2L); + } + + @Test + void testIncrementAndUpdateOffsetMapNonExistingOffset() { + sourceTaskContext = mock(SourceTaskContext.class); + final 
OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); + when(sourceTaskContext.offsetStorageReader()).thenReturn(offsetStorageReader); + + final Map<String, Object> partitionKey = new HashMap<>(); + partitionKey.put("topic", "topic1"); + partitionKey.put("partition", 0); + + when(offsetStorageReader.offsets(any())).thenReturn(Collections.emptyMap()); + + offsetManager = new OffsetManager(sourceTaskContext, s3SourceConfig); + final long newOffset = offsetManager.incrementAndUpdateOffsetMap(partitionKey); + + assertThat(newOffset).isEqualTo(0L); + } + + @Test + void testGetFirstConfiguredTopic() throws Exception { + sourceTaskContext = mock(SourceTaskContext.class); + final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); + when(sourceTaskContext.offsetStorageReader()).thenReturn(offsetStorageReader); + + offsetManager = new OffsetManager(sourceTaskContext, s3SourceConfig); + + final String firstTopic = offsetManager.getFirstConfiguredTopic(s3SourceConfig); + assertEquals("topic1", firstTopic); + } + + private void setBasicProperties() { + properties.put(S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET); + properties.put(TARGET_TOPIC_PARTITIONS, "0,1"); + properties.put(TARGET_TOPICS, "topic1,topic2"); + } +} From e1e15bfa2680c1823529a66df1a332ed77626e86 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Wed, 9 Oct 2024 16:33:48 +0200 Subject: [PATCH 35/90] Adding tests --- .../connect/s3/source/output/AvroWriter.java | 3 +- .../s3/source/utils/RecordProcessor.java | 2 +- .../s3/source/output/AvroWriterTest.java | 132 +++++++++++++++ .../s3/source/utils/OffsetManagerTest.java | 3 +- .../s3/source/utils/RecordProcessorTest.java | 151 ++++++++++++++++++ .../src/test/resources/sample1.parquet | Bin 0 -> 1308 bytes 6 files changed, 286 insertions(+), 5 deletions(-) create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java create mode 100644 s3-source-connector/src/test/resources/sample1.parquet diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java index 9117dd184..d20548bda 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java @@ -63,8 +63,7 @@ public void handleValueData(final Optional<byte[]> optionalKeyBytes, final Input startOffset, offsetManager, currentOffsets, records, partitionMap); } - private List<GenericRecord> readAvroRecords(final InputStream content, - final DatumReader<GenericRecord> datumReader) { + List<GenericRecord> readAvroRecords(final InputStream content, final DatumReader<GenericRecord> datumReader) { final List<GenericRecord> records = new ArrayList<>(); try (SeekableByteArrayInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 10ae8fc67..c311dd952 100644 --- 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -55,7 +55,7 @@ public static List<SourceRecord> processRecords(final Iterator<List<AivenS3Sourc } @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") - private static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRecord> aivenS3SourceRecordList, + static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRecord> aivenS3SourceRecordList, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, final Map<String, String> conversionConfig, final OutputWriter outputWriter) { diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java new file mode 100644 index 000000000..ab077cbf4 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java @@ -0,0 +1,132 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.output; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.when; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import org.apache.kafka.clients.consumer.ConsumerRecord; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; + +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumReader; +import org.apache.avro.io.DatumWriter; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +final class AvroWriterTest { + + @Mock + private S3SourceConfig s3SourceConfig; + @Mock + private OffsetManager offsetManager; + + private AvroWriter avroWriter; + private Map<String, String> config; + + @BeforeEach + void setUp() { + MockitoAnnotations.openMocks(this); + avroWriter = new AvroWriter(); + config = new HashMap<>(); + } + + @Test + void testConfigureValueConverter() { + final String value = "http://localhost:8081"; + 
when(s3SourceConfig.getString(SCHEMA_REGISTRY_URL)).thenReturn(value); + avroWriter.configureValueConverter(config, s3SourceConfig); + assertThat(config.get(SCHEMA_REGISTRY_URL)).isEqualTo("http://localhost:8081") + .describedAs("The schema registry URL should be correctly set in the config."); + } + + @Test + void testHandleValueData() throws Exception { + final InputStream inputStream = new ByteArrayInputStream("mock-avro-data".getBytes(StandardCharsets.UTF_8)); + final Optional<byte[]> optionalKeyBytes = Optional.of("key".getBytes(StandardCharsets.UTF_8)); + final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = new ArrayList<>(); + final Map<Map<String, Object>, Long> currentOffsets = new HashMap<>(); + final Map<String, Object> partitionMap = new HashMap<>(); + + avroWriter.handleValueData(optionalKeyBytes, inputStream, "test-topic", consumerRecordList, s3SourceConfig, 0, + 0L, offsetManager, currentOffsets, partitionMap); + + assertTrue(consumerRecordList.isEmpty(), "Consumer record list should be populated by the OutputUtils method."); + } + + @Test + void testReadAvroRecords() throws Exception { + final ByteArrayOutputStream avroData = generateMockAvroData(); + final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); + + final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); + final List<GenericRecord> records = avroWriter.readAvroRecords(inputStream, datumReader); + + assertThat(records.size()).isEqualTo(2); + } + + ByteArrayOutputStream generateMockAvroData() throws IOException { + final String schemaJson = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" + + " \"fields\": [\n" + " {\"name\": \"message\", \"type\": \"string\"},\n" + + " {\"name\": \"id\", \"type\": \"int\"}\n" + " ]\n" + "}"; + final Schema.Parser parser = new Schema.Parser(); + final Schema schema = parser.parse(schemaJson); + + return getAvroRecord(schema, 2); + } + + private static ByteArrayOutputStream getAvroRecord(final Schema schema, final int messageId) throws IOException { + // Create Avro records + final GenericRecord avroRecord = new GenericData.Record(schema); + avroRecord.put("message", "Hello, Kafka Connect S3 Source! 
object " + messageId); + avroRecord.put("id", messageId); + + // Serialize Avro records to byte arrays + final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema); + try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) { + dataFileWriter.create(schema, outputStream); + dataFileWriter.append(avroRecord); // record 1 + dataFileWriter.append(avroRecord); // record 2 + dataFileWriter.flush(); + } + outputStream.close(); + return outputStream; + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java index 3bfca42ad..0ff30eb71 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java @@ -20,7 +20,6 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; import static io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator.OFFSET_KEY; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.jupiter.api.Assertions.assertEquals; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -133,7 +132,7 @@ void testGetFirstConfiguredTopic() throws Exception { offsetManager = new OffsetManager(sourceTaskContext, s3SourceConfig); final String firstTopic = offsetManager.getFirstConfiguredTopic(s3SourceConfig); - assertEquals("topic1", firstTopic); + assertThat(firstTopic).isEqualTo("topic1"); } private void setBasicProperties() { diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java new file mode 100644 index 000000000..1831fc610 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -0,0 +1,151 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source.utils; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; +import static org.mockito.internal.verification.VerificationModeFactory.times; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.source.SourceRecord; +import org.apache.kafka.connect.storage.Converter; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.output.OutputWriter; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +class RecordProcessorTest { + + @Mock + private S3SourceConfig s3SourceConfig; + @Mock + private Converter valueConverter; + @Mock + private OutputWriter outputWriter; + @Mock + private Converter keyConverter; + + private AtomicBoolean connectorStopped; + private Iterator<List<AivenS3SourceRecord>> sourceRecordIterator; + + @BeforeEach + void setUp() { + MockitoAnnotations.openMocks(this); + connectorStopped = new AtomicBoolean(false); + sourceRecordIterator = mock(Iterator.class); + } + + @Test + void testProcessRecordsNoRecords() { + when(s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS)).thenReturn(5); + when(sourceRecordIterator.hasNext()).thenReturn(false); + + final List<SourceRecord> results = new ArrayList<>(); + final List<SourceRecord> processedRecords = RecordProcessor.processRecords( + sourceRecordIterator, + results, + s3SourceConfig, + Optional.of(keyConverter), + valueConverter, + connectorStopped, + outputWriter + ); + + assertTrue(processedRecords.isEmpty(), "Processed records should be empty when there are no records."); + } + + @Test + void testProcessRecordsWithRecords() { + when(s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS)).thenReturn(5); + when(sourceRecordIterator.hasNext()).thenReturn(true, false); // One iteration with records + + final AivenS3SourceRecord mockRecord = mock(AivenS3SourceRecord.class); + final List<AivenS3SourceRecord> recordList = Collections.singletonList(mockRecord); + when(sourceRecordIterator.next()).thenReturn(recordList); + + final List<SourceRecord> results = new ArrayList<>(); + RecordProcessor.processRecords( + sourceRecordIterator, + results, + s3SourceConfig, + Optional.of(keyConverter), + valueConverter, + connectorStopped, + outputWriter + ); + + assertThat(results.size()).isEqualTo(1); + verify(sourceRecordIterator, times(1)).next(); + } + + @Test + void testProcessRecordsConnectorStopped() { + when(s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS)).thenReturn(5); + connectorStopped.set(true); // Simulate connector stopped + + final List<SourceRecord> results = new ArrayList<>(); + final List<SourceRecord> processedRecords = RecordProcessor.processRecords( + sourceRecordIterator, + results, + s3SourceConfig, + Optional.of(keyConverter), + valueConverter, + connectorStopped, + outputWriter + ); + + 
assertTrue(processedRecords.isEmpty(), "Processed records should be empty when connector is stopped."); + verify(sourceRecordIterator, never()).next(); + } + + @Test + void testCreateSourceRecords() { + final AivenS3SourceRecord mockRecord = mock(AivenS3SourceRecord.class); + when(mockRecord.getToTopic()).thenReturn("test-topic"); + when(mockRecord.key()).thenReturn("mock-key".getBytes(StandardCharsets.UTF_8)); + when(mockRecord.value()).thenReturn("mock-value".getBytes(StandardCharsets.UTF_8)); + + when(valueConverter.toConnectData(anyString(), any())) + .thenReturn(new SchemaAndValue(null, "mock-value-converted")); + when(mockRecord.getSourceRecord(anyString(), any(), any())).thenReturn(mock(SourceRecord.class)); + + final List<AivenS3SourceRecord> recordList = Collections.singletonList(mockRecord); + final List<SourceRecord> sourceRecords = RecordProcessor.createSourceRecords(recordList, s3SourceConfig, + Optional.of(keyConverter), valueConverter, new HashMap<>(), outputWriter); + + assertThat(sourceRecords.size()).isEqualTo(1); + } +} diff --git a/s3-source-connector/src/test/resources/sample1.parquet b/s3-source-connector/src/test/resources/sample1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8<h@2kbEU3>gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvM<l1Z}{$Di<VTzp?b`Qdqyxk?)j2Oi#(KY`Qa&cmloMp}=KHAdw19tv6bEUr+Y z?uy<CC(hGcd;%x0E@0JXTo9$PEFiUH&BD07zt?Rpd*3O_79mk$5LNFn%jdO6D0fhH z(b26BmN#C_>D|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWV<FEJY4bF-c(wnkj7 zyZBeZ?RJNX$v&t5H5nQGaklz=dX;NI*B-0+pPMhgVB56NPvZ8`B}>EJpR{CJ{Fy0- zE8ux@_5^8x!<w<v%fCL;c&PJs?+?DM8BZH>?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(I<xrA!nIL)-H#sBm2b<v zyjvzg$!p$w!}-F@uPeSV#+c22IV)Y8y+ql+f3=-R_;f${)b5vuC%*hU>rH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43<N8&XSbmN1%Zq~hB6nbq-g9n>QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<<tA1bk1fe$<H&s4s+_Y&%zj~{|nXL}@f9ITO++Dxs?4FP* z-VAJ?csr+9-~P<yF5>)~?``+iT<fQ2-$`Z2U-&$y`AaJAZH7ytl`C~W<;4FMTI*@` z{^2?o6Tjwc5u2Hxk6ygVyyaTp(Pf=a+T?#Z>xh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{<k`F~;CHFxnWo|XEEYfXdH7SHZ~G<&wOu+{!& z6_1zHsTFsYHvbfgwKOc8(ZxRDR{uYZSc&}Fybq!4rYJp_a60mZ;`vj*+KY;QUpRK_ z`mUxWSDuQ#Dtz{HpYN-Rl2ZF6{qri8u4K5Hma$q>eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? 
literal 0 HcmV?d00001 From 8bcfa72894b0c17bd6295f72c0a28d8bc22de6b9 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Wed, 9 Oct 2024 19:04:50 +0200 Subject: [PATCH 36/90] Adding parquet unit tests --- s3-source-connector/build.gradle.kts | 24 ++-- .../connect/s3/source/IntegrationTest.java | 62 +--------- .../s3/source/output/ParquetWriter.java | 51 +++----- .../s3/source/output/ParquetWriterTest.java | 116 ++++++++++++++++++ .../s3/source/testutils/ContentUtils.java | 88 +++++++++++++ .../src/test/resources/sample1.parquet | Bin 1308 -> 0 bytes 6 files changed, 235 insertions(+), 106 deletions(-) create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java delete mode 100644 s3-source-connector/src/test/resources/sample1.parquet diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts index 27a8d92f9..d5517b4cc 100644 --- a/s3-source-connector/build.gradle.kts +++ b/s3-source-connector/build.gradle.kts @@ -69,18 +69,27 @@ dependencies { implementation("com.amazonaws:aws-java-sdk-s3:$amazonS3Version") implementation("com.amazonaws:aws-java-sdk-sts:$amazonSTSVersion") + implementation("org.apache.parquet:parquet-hadoop:$parquetVersion") + testImplementation("org.apache.parquet:parquet-hadoop:$parquetVersion") + integrationTestImplementation("org.apache.parquet:parquet-hadoop:$parquetVersion") + + implementation("org.apache.parquet:parquet-avro:$parquetVersion") { + exclude(group = "org.xerial.snappy", module = "snappy-java") + exclude(group = "org.slf4j", module = "slf4j-api") + exclude(group = "org.apache.avro", module = "avro") + } + testImplementation("org.apache.parquet:parquet-avro:$parquetVersion") { + exclude(group = "org.xerial.snappy", module = "snappy-java") + exclude(group = "org.slf4j", module = "slf4j-api") + exclude(group = "org.apache.avro", module = "avro") + } + implementation(tools.spotbugs.annotations) implementation(logginglibs.slf4j) implementation(apache.avro) implementation(confluent.kafka.connect.avro.converter) { exclude(group = "org.apache.kafka", module = "kafka-clients") } - implementation(apache.parquet.tools) - implementation(apache.parquet.avro) { - exclude(group = "org.xerial.snappy", module = "snappy-java") - exclude(group = "org.slf4j", module = "slf4j-api") - exclude(group = "org.apache.avro", module = "avro") - } testImplementation(compressionlibs.snappy) testImplementation(compressionlibs.zstd.jni) @@ -150,9 +159,6 @@ dependencies { exclude(group = "org.apache.kafka", module = "kafka-clients") } - integrationTestImplementation(apache.avro) - - testImplementation(apache.parquet.tools) { exclude(group = "org.slf4j", module = "slf4j-api") } testImplementation(apache.hadoop.mapreduce.client.core) { exclude(group = "org.apache.hadoop", module = "hadoop-yarn-client") exclude(group = "org.apache.hadoop.thirdparty", module = "hadoop-shaded-protobuf_3_7") diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 46004bf1f..d58eb10cc 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -32,11 +32,9 @@ 
import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.net.ConnectException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.util.HashMap; @@ -48,6 +46,7 @@ import io.aiven.kafka.connect.s3.source.output.OutputFormat; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; +import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; import io.aiven.kafka.connect.s3.source.testutils.S3OutputStream; import com.amazonaws.services.s3.AmazonS3; @@ -60,9 +59,6 @@ import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumWriter; import org.apache.commons.io.IOUtils; -import org.apache.parquet.avro.AvroParquetWriter; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; @@ -244,7 +240,7 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted final String name2 = "testuser2"; connectRunner.createConnector(connectorConfig); - final Path path = getTmpFilePath(name1, name2); + final Path path = ContentUtils.getTmpFilePath(name1, name2); try { s3Client.putObject(TEST_BUCKET_NAME, fileName, Files.newInputStream(path), null); @@ -351,58 +347,4 @@ public void multipartUpload(final String bucketName, final String key) { LOGGER.error(e.getMessage()); } } - - public static void writeParquetFile(final String tempFilePath, final String name1, final String name2) - throws IOException { - // Define the Avro schema - final String schemaString = "{" + "\"type\":\"record\"," + "\"name\":\"User\"," + "\"fields\":[" - + "{\"name\":\"name\",\"type\":\"string\"}," + "{\"name\":\"age\",\"type\":\"int\"}," - + "{\"name\":\"email\",\"type\":\"string\"}" + "]" + "}"; - final Schema schema = new Schema.Parser().parse(schemaString); - - // Write the Parquet file - try { - writeParquetFile(tempFilePath, schema, name1, name2); - } catch (IOException e) { - throw new ConnectException("Error writing parquet file"); - } - } - - private static Path getTmpFilePath(final String name1, final String name2) throws IOException { - final String tmpFile = "users.parquet"; - final Path parquetFileDir = Files.createTempDirectory("parquet_tests"); - final String parquetFilePath = parquetFileDir.toAbsolutePath() + "/" + tmpFile; - - writeParquetFile(parquetFilePath, name1, name2); - return Paths.get(parquetFilePath); - } - - private static void writeParquetFile(final String outputPath, final Schema schema, final String name1, - final String name2) throws IOException { - - // Create sample records - final GenericData.Record user1 = new GenericData.Record(schema); - user1.put("name", name1); - user1.put("age", 30); - user1.put("email", name1 + "@test"); - - final GenericData.Record user2 = new GenericData.Record(schema); - user2.put("name", name2); - user2.put("age", 25); - user2.put("email", name2 + "@test"); - - // Create a Parquet writer - final org.apache.hadoop.fs.Path path = new org.apache.hadoop.fs.Path(outputPath); // NOPMD - try (ParquetWriter<GenericData.Record> writer = AvroParquetWriter.<GenericData.Record>builder(path) - .withSchema(schema) - .withCompressionCodec(CompressionCodecName.SNAPPY) // You can choose GZIP, LZO, etc. 
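The inline Parquet file generation removed here is replaced further down by `ContentUtils`, which writes through parquet's `LocalOutputFile`, while `ParquetWriter` reads through `LocalInputFile`. A minimal, self-contained round trip with those two classes might look like the following sketch; it assumes parquet-avro, parquet-hadoop and Avro on the classpath (as wired up in the build changes above), and every name in it is illustrative rather than part of the connector.

```java
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.io.LocalInputFile;
import org.apache.parquet.io.LocalOutputFile;

public final class ParquetRoundTripSketch {
    public static void main(final String[] args) throws Exception {
        final Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"User\",\"fields\":[{\"name\":\"name\",\"type\":\"string\"}]}");
        final Path file = Files.createTempDirectory("parquet_sketch").resolve("users.parquet");

        // Write a single Avro record through the OutputFile-based builder (no org.apache.hadoop.fs.Path).
        final GenericData.Record user = new GenericData.Record(schema);
        user.put("name", "testuser1");
        try (ParquetWriter<GenericRecord> writer = AvroParquetWriter
                .<GenericRecord>builder(new LocalOutputFile(file))
                .withSchema(schema)
                .build()) {
            writer.write(user);
        }

        // Read it back through the matching InputFile-based reader builder.
        try (ParquetReader<GenericRecord> reader = AvroParquetReader
                .<GenericRecord>builder(new LocalInputFile(file))
                .build()) {
            for (GenericRecord record = reader.read(); record != null; record = reader.read()) {
                System.out.println(record);
            }
        }
    }
}
```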
- .withRowGroupSize(100 * 1024) // Customize row group size - .withPageSize(1024 * 1024) // Customize page size - .build()) { - // Write records to the Parquet file - writer.write(user1); - writer.write(user2); - } - } - } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java index 8c2db5f20..be132eb8a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java @@ -22,8 +22,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.nio.channels.Channels; -import java.nio.channels.SeekableByteChannel; import java.nio.file.Files; import java.nio.file.Path; import java.time.Instant; @@ -40,12 +38,15 @@ import org.apache.avro.generic.GenericRecord; import org.apache.commons.compress.utils.IOUtils; import org.apache.parquet.avro.AvroParquetReader; -import org.apache.parquet.io.DelegatingSeekableInputStream; import org.apache.parquet.io.InputFile; -import org.apache.parquet.io.SeekableInputStream; +import org.apache.parquet.io.LocalInputFile; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class ParquetWriter implements OutputWriter { + private static final Logger LOGGER = LoggerFactory.getLogger(ParquetWriter.class); + @Override public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); @@ -76,49 +77,25 @@ public static List<GenericRecord> getRecords(final InputStream inputStream, fina } try (OutputStream outputStream = Files.newOutputStream(parquetFile.toPath())) { - // write to a local file IOUtils.copy(inputStream, outputStream); - - try (SeekableByteChannel seekableByteChannel = Files.newByteChannel(parquetFile.toPath()); - var parquetReader = AvroParquetReader.<GenericRecord>builder(new InputFile() { - @Override - public long getLength() throws IOException { - return seekableByteChannel.size(); - } - - @Override - public SeekableInputStream newStream() { - return new DelegatingSeekableInputStream(Channels.newInputStream(seekableByteChannel)) { - @Override - public long getPos() throws IOException { - return seekableByteChannel.position(); - } - - @Override - public void seek(final long value) throws IOException { - seekableByteChannel.position(value); - } - }; - } - - }).withCompatibility(false).build()) { - var record = parquetReader.read(); + final InputFile inputFile = new LocalInputFile(parquetFile.toPath()); + try (var parquetReader = AvroParquetReader.<GenericRecord>builder(inputFile).build()) { + GenericRecord record; + record = parquetReader.read(); while (record != null) { records.add(record); record = parquetReader.read(); } - } catch (IOException e) { - LOGGER.error("Error in reading s3 object stream " + e.getMessage()); - } finally { - deleteTmpFile(parquetFile.toPath()); } - } catch (IOException e) { - LOGGER.error("Error in reading s3 object stream for topic " + topic + " with error : " + e.getMessage()); + } catch (IOException | RuntimeException e) { // NOPMD + LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + } finally { + deleteTmpFile(parquetFile.toPath()); } return records; } - private static void deleteTmpFile(final Path parquetFile) { + static void deleteTmpFile(final Path 
parquetFile) { if (Files.exists(parquetFile)) { try { Files.delete(parquetFile); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java new file mode 100644 index 000000000..8fc7f5536 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java @@ -0,0 +1,116 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.output; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import org.apache.kafka.clients.consumer.ConsumerRecord; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; + +import com.amazonaws.util.IOUtils; +import org.apache.avro.generic.GenericRecord; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +final class ParquetWriterTest { + private ParquetWriter parquetWriter; + private S3SourceConfig s3SourceConfig; + private OffsetManager offsetManager; + private List<ConsumerRecord<byte[], byte[]>> consumerRecordList; + private Map<Map<String, Object>, Long> currentOffsets; + private Map<String, Object> partitionMap; + + @BeforeEach + public void setUp() { + parquetWriter = new ParquetWriter(); + s3SourceConfig = mock(S3SourceConfig.class); + offsetManager = mock(OffsetManager.class); + consumerRecordList = new ArrayList<>(); + currentOffsets = mock(Map.class); + partitionMap = mock(Map.class); + } + + @Test + void testHandleValueDataWithZeroBytes() { + final byte[] mockParquetData = new byte[0]; + final InputStream inputStream = new ByteArrayInputStream(mockParquetData); + + final String topic = "test-topic"; + final int topicPartition = 0; + final long startOffset = 100L; + + parquetWriter.handleValueData(Optional.empty(), inputStream, topic, consumerRecordList, s3SourceConfig, + topicPartition, startOffset, offsetManager, currentOffsets, partitionMap); + + assertThat(consumerRecordList).isEmpty(); + } + + @Test + void testGetRecordsWithValidData() throws Exception { + final byte[] mockParquetData = generateMockParquetData(); + final InputStream inputStream = new ByteArrayInputStream(mockParquetData); + + final String topic = "test-topic"; + final int topicPartition = 0; + + final List<GenericRecord> records = ParquetWriter.getRecords(inputStream, topic, topicPartition); + + assertThat(records).isNotEmpty(); + assertThat(records).extracting(record -> 
record.get("name").toString()).contains("name1").contains("name2"); + } + + @Test + void testGetRecordsWithInvalidData() { + final byte[] invalidData = "invalid data".getBytes(StandardCharsets.UTF_8); + final InputStream inputStream = new ByteArrayInputStream(invalidData); + + final String topic = "test-topic"; + final int topicPartition = 0; + + final List<GenericRecord> records = ParquetWriter.getRecords(inputStream, topic, topicPartition); + assertThat(records).isEmpty(); + } + + @Test + void testTemporaryFileDeletion() throws Exception { + final Path tempFile = Files.createTempFile("test-file", ".parquet"); + assertThat(Files.exists(tempFile)).isTrue(); + + ParquetWriter.deleteTmpFile(tempFile); + assertThat(Files.exists(tempFile)).isFalse(); + } + + private byte[] generateMockParquetData() throws IOException { + final Path path = ContentUtils.getTmpFilePath("name1", "name2"); + return IOUtils.toByteArray(Files.newInputStream(path)); + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java new file mode 100644 index 000000000..328f7fbf7 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java @@ -0,0 +1,88 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source.testutils; + +import java.io.IOException; +import java.net.ConnectException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.parquet.avro.AvroParquetWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.io.LocalOutputFile; +import org.apache.parquet.io.OutputFile; + +public final class ContentUtils { + private ContentUtils() { + } + public static Path getTmpFilePath(final String name1, final String name2) throws IOException { + final String tmpFile = "users.parquet"; + final Path parquetFileDir = Files.createTempDirectory("parquet_tests"); + final String parquetFilePath = parquetFileDir.toAbsolutePath() + "/" + tmpFile; + + writeParquetFile(parquetFilePath, name1, name2); + return Paths.get(parquetFilePath); + } + + public static void writeParquetFile(final String tempFilePath, final String name1, final String name2) + throws IOException { + // Define the Avro schema + final String schemaString = "{" + "\"type\":\"record\"," + "\"name\":\"User\"," + "\"fields\":[" + + "{\"name\":\"name\",\"type\":\"string\"}," + "{\"name\":\"age\",\"type\":\"int\"}," + + "{\"name\":\"email\",\"type\":\"string\"}" + "]" + "}"; + final Schema schema = new Schema.Parser().parse(schemaString); + + // Write the Parquet file + try { + writeParquetFile(tempFilePath, schema, name1, name2); + } catch (IOException e) { + throw new ConnectException("Error writing parquet file"); + } + } + + private static void writeParquetFile(final String outputPath, final Schema schema, final String name1, + final String name2) throws IOException { + + // Create sample records + final GenericData.Record user1 = new GenericData.Record(schema); + user1.put("name", name1); + user1.put("age", 30); + user1.put("email", name1 + "@test"); + + final GenericData.Record user2 = new GenericData.Record(schema); + user2.put("name", name2); + user2.put("age", 25); + user2.put("email", name2 + "@test"); + + // Create a Parquet writer + final OutputFile outputFile = new LocalOutputFile(Paths.get(outputPath)); + try (ParquetWriter<GenericData.Record> writer = AvroParquetWriter.<GenericData.Record>builder(outputFile) + .withSchema(schema) + .withCompressionCodec(CompressionCodecName.SNAPPY) + .withRowGroupSize(100L * 1024L) + .withPageSize(1024 * 1024) + .build()) { + // Write records to the Parquet file + writer.write(user1); + writer.write(user2); + } + } +} diff --git a/s3-source-connector/src/test/resources/sample1.parquet b/s3-source-connector/src/test/resources/sample1.parquet deleted file mode 100644 index 9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8<h@2kbEU3>gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvM<l1Z}{$Di<VTzp?b`Qdqyxk?)j2Oi#(KY`Qa&cmloMp}=KHAdw19tv6bEUr+Y z?uy<CC(hGcd;%x0E@0JXTo9$PEFiUH&BD07zt?Rpd*3O_79mk$5LNFn%jdO6D0fhH z(b26BmN#C_>D|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWV<FEJY4bF-c(wnkj7 zyZBeZ?RJNX$v&t5H5nQGaklz=dX;NI*B-0+pPMhgVB56NPvZ8`B}>EJpR{CJ{Fy0- zE8ux@_5^8x!<w<v%fCL;c&PJs?+?DM8BZH>?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(I<xrA!nIL)-H#sBm2b<v 
zyjvzg$!p$w!}-F@uPeSV#+c22IV)Y8y+ql+f3=-R_;f${)b5vuC%*hU>rH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43<N8&XSbmN1%Zq~hB6nbq-g9n>QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<<tA1bk1fe$<H&s4s+_Y&%zj~{|nXL}@f9ITO++Dxs?4FP* z-VAJ?csr+9-~P<yF5>)~?``+iT<fQ2-$`Z2U-&$y`AaJAZH7ytl`C~W<;4FMTI*@` z{^2?o6Tjwc5u2Hxk6ygVyyaTp(Pf=a+T?#Z>xh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{<k`F~;CHFxnWo|XEEYfXdH7SHZ~G<&wOu+{!& z6_1zHsTFsYHvbfgwKOc8(ZxRDR{uYZSc&}Fybq!4rYJp_a60mZ;`vj*+KY;QUpRK_ z`mUxWSDuQ#Dtz{HpYN-Rl2ZF6{qri8u4K5Hma$q>eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? From 148ae6413f88def62ac3f319d6716970cbec6315 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Wed, 9 Oct 2024 23:51:35 +0200 Subject: [PATCH 37/90] Skip failed objects --- .../kafka/connect/s3/source/S3SourceTask.java | 68 +++++++++++++------ .../s3/source/utils/AivenS3SourceRecord.java | 10 ++- .../connect/s3/source/utils/FileReader.java | 9 ++- .../s3/source/utils/RecordProcessor.java | 25 +++++-- .../s3/source/utils/SourceRecordIterator.java | 7 +- .../connect/s3/source/S3SourceTaskTest.java | 4 +- .../s3/source/utils/FileReaderTest.java | 2 +- .../s3/source/utils/RecordProcessorTest.java | 17 ++--- 8 files changed, 100 insertions(+), 42 deletions(-) diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 8eb75ff8c..de3ae2599 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -21,13 +21,16 @@ import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.source.SourceTask; import org.apache.kafka.connect.storage.Converter; @@ -51,6 +54,7 @@ * S3SourceTask is a Kafka Connect SourceTask implementation that reads from source-s3 buckets and generates Kafka * Connect records. 
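The reworked `poll()` below separates retryable S3 failures (back off, rebuild the record iterator, try again) from `DataException`s (log and move on) and from everything else (close resources and rethrow). A stripped-down model of that control flow, with stand-in exception types instead of the real `AmazonS3Exception` and `DataException`, might look like:

```java
public final class PollErrorHandlingSketch {

    private static final long ERROR_BACKOFF_MS = 1000L;

    public static void main(final String[] args) throws InterruptedException {
        int attempt = 0;
        while (true) {
            attempt++;
            try {
                System.out.println("polled " + fetchBatch(attempt) + " records");
                return;
            } catch (final RetryableStorageException e) {
                // Mirrors the retryable branch: back off, then rebuild state and retry.
                System.out.println("retryable failure, backing off: " + e.getMessage());
                Thread.sleep(ERROR_BACKOFF_MS);
            } catch (final BadRecordException e) {
                // Mirrors the data-error branch: log and give up on this batch without failing the task.
                System.out.println("skipping undecodable data: " + e.getMessage());
                return;
            }
        }
    }

    private static int fetchBatch(final int attempt) {
        if (attempt < 3) {
            throw new RetryableStorageException("throttled by storage backend");
        }
        return 42;
    }

    static final class RetryableStorageException extends RuntimeException {
        RetryableStorageException(final String message) {
            super(message);
        }
    }

    static final class BadRecordException extends RuntimeException {
        BadRecordException(final String message) {
            super(message);
        }
    }
}
```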
*/ +@SuppressWarnings("PMD.TooManyMethods") public class S3SourceTask extends SourceTask { private static final Logger LOGGER = LoggerFactory.getLogger(S3SourceTask.class); @@ -79,6 +83,9 @@ public class S3SourceTask extends SourceTask { private final AtomicBoolean connectorStopped = new AtomicBoolean(); private final S3ClientFactory s3ClientFactory = new S3ClientFactory(); + private final Object pollLock = new Object(); + private final Set<String> failedObjectKeys = new HashSet<>(); + @SuppressWarnings("PMD.UnnecessaryConstructor") public S3SourceTask() { super(); @@ -122,25 +129,49 @@ private void initializeS3Client() { private void prepareReaderFromOffsetStorageReader() { final OffsetManager offsetManager = new OffsetManager(context, s3SourceConfig); sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, s3Client, this.s3Bucket, offsetManager, - this.outputWriter); + this.outputWriter, failedObjectKeys); } @Override public List<SourceRecord> poll() throws InterruptedException { - final List<SourceRecord> results = new ArrayList<>(s3SourceConfig.getInt(MAX_POLL_RECORDS)); + synchronized (pollLock) { + final List<SourceRecord> results = new ArrayList<>(s3SourceConfig.getInt(MAX_POLL_RECORDS)); - if (connectorStopped.get()) { + if (connectorStopped.get()) { + return results; + } + + while (!connectorStopped.get()) { + try { + return extractSourceRecords(results); + } catch (AmazonS3Exception | DataException exception) { + if (handleException(exception)) { + return null; // NOPMD + } + } catch (final Throwable t) { // NOPMD + // This task has failed, so close any resources (may be reopened if needed) before throwing + closeResources(); + throw t; + } + } return results; } + } - while (!connectorStopped.get()) { - try { - return extractSourceRecords(results); - } catch (AmazonS3Exception e) { - handleS3Exception(e); + private boolean handleException(final RuntimeException exception) throws InterruptedException { + if (exception instanceof AmazonS3Exception) { + if (((AmazonS3Exception) exception).isRetryable()) { + LOGGER.warn("Retryable error while polling. Will sleep and try again.", exception); + Thread.sleep(ERROR_BACKOFF); + prepareReaderFromOffsetStorageReader(); + } else { + return true; } } - return results; + if (exception instanceof DataException) { + LOGGER.warn("DataException. Will NOT try again.", exception); + } + return false; } private List<SourceRecord> extractSourceRecords(final List<SourceRecord> results) throws InterruptedException { @@ -149,7 +180,7 @@ private List<SourceRecord> extractSourceRecords(final List<SourceRecord> results return results; } return RecordProcessor.processRecords(sourceRecordIterator, results, s3SourceConfig, keyConverter, - valueConverter, connectorStopped, this.outputWriter); + valueConverter, connectorStopped, this.outputWriter, failedObjectKeys); } private void waitForObjects() throws InterruptedException { @@ -160,20 +191,17 @@ private void waitForObjects() throws InterruptedException { } } - private void handleS3Exception(final AmazonS3Exception amazonS3Exception) throws InterruptedException { - if (amazonS3Exception.isRetryable()) { - LOGGER.warn("Retryable error while polling. 
Will sleep and try again.", amazonS3Exception); - Thread.sleep(ERROR_BACKOFF); - prepareReaderFromOffsetStorageReader(); - } else { - throw amazonS3Exception; - } - } - @Override public void stop() { this.taskInitialized = false; this.connectorStopped.set(true); + synchronized (pollLock) { + closeResources(); + } + } + + private void closeResources() { + s3Client.shutdown(); } // below for visibility in tests diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java index 00e924e9f..ae8baf297 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java @@ -33,8 +33,11 @@ public class AivenS3SourceRecord { private final byte[] recordKey; private final byte[] recordValue; + private final String objectKey; + public AivenS3SourceRecord(final Map<String, Object> partitionMap, final Map<String, Object> offsetMap, - final String toTopic, final int topicPartition, final byte[] recordKey, final byte[] recordValue) { + final String toTopic, final int topicPartition, final byte[] recordKey, final byte[] recordValue, + final String objectKey) { this.partitionMap = new HashMap<>(partitionMap); this.offsetMap = new HashMap<>(offsetMap); @@ -42,6 +45,7 @@ public AivenS3SourceRecord(final Map<String, Object> partitionMap, final Map<Str this.topicPartition = topicPartition; this.recordKey = Arrays.copyOf(recordKey, recordKey.length); this.recordValue = Arrays.copyOf(recordValue, recordValue.length); + this.objectKey = objectKey; } public Map<String, Object> getPartitionMap() { @@ -68,6 +72,10 @@ public byte[] value() { return recordValue.clone(); } + public String getObjectKey() { + return objectKey; + } + public SourceRecord getSourceRecord(final String topic, final Optional<SchemaAndValue> keyData, final SchemaAndValue schemaAndValue) { return new SourceRecord(getPartitionMap(), getOffsetMap(), topic, partition(), diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index 0269f084a..618c68a82 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -20,7 +20,9 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; @@ -36,9 +38,13 @@ public class FileReader { private final S3SourceConfig s3SourceConfig; private final String bucketName; - public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName) { + private final Set<String> failedObjectKeys; + + public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName, + final Set<String> failedObjectKeys) { this.s3SourceConfig = s3SourceConfig; this.bucketName = bucketName; + this.failedObjectKeys = new HashSet<>(failedObjectKeys); } @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") @@ -60,6 +66,7 @@ List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOExc final List<S3ObjectSummary> filteredSummaries = objectListing.getObjectSummaries() 
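The extra filter added to the listing below skips any object whose key previously failed conversion, so a single bad file cannot wedge the task in a retry loop. A plain-Java sketch of that bookkeeping (no AWS SDK; the key names and the exception type are stand-ins, and the real connector records the key at conversion time before rethrowing):

```java
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public final class SkipFailedObjectsSketch {
    public static void main(final String[] args) {
        final Set<String> failedObjectKeys = new HashSet<>();
        final List<String> listedKeys = List.of("topic-0-0001.parquet", "topic-0-corrupt.parquet");

        // First pass: conversion of the corrupt object fails, so its key is remembered.
        for (final String key : listedKeys) {
            try {
                convert(key);
            } catch (final IllegalStateException e) { // stand-in for Kafka Connect's DataException
                failedObjectKeys.add(key);
            }
        }

        // Next listing pass: keys that failed before are filtered out, so they are not retried forever.
        final List<String> toProcess = listedKeys.stream()
                .filter(key -> !failedObjectKeys.contains(key))
                .collect(Collectors.toList());
        System.out.println(toProcess); // [topic-0-0001.parquet]
    }

    private static void convert(final String key) {
        if (key.contains("corrupt")) {
            throw new IllegalStateException("cannot deserialize " + key);
        }
    }
}
```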
.stream() .filter(objectSummary -> objectSummary.getSize() > 0) + .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey())) .collect(Collectors.toList()); allSummaries.addAll(filteredSummaries); // Add the filtered summaries to the main list diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index c311dd952..ccaab7f07 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -22,24 +22,31 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.storage.Converter; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.output.OutputWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + public final class RecordProcessor { + private static final Logger LOGGER = LoggerFactory.getLogger(RecordProcessor.class); + private RecordProcessor() { } public static List<SourceRecord> processRecords(final Iterator<List<AivenS3SourceRecord>> sourceRecordIterator, final List<SourceRecord> results, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, - final AtomicBoolean connectorStopped, final OutputWriter outputWriter) { + final AtomicBoolean connectorStopped, final OutputWriter outputWriter, final Set<String> failedObjectKeys) { final Map<String, String> conversionConfig = new HashMap<>(); final int maxPollRecords = s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS); @@ -47,7 +54,7 @@ public static List<SourceRecord> processRecords(final Iterator<List<AivenS3Sourc for (int i = 0; sourceRecordIterator.hasNext() && i < maxPollRecords && !connectorStopped.get(); i++) { final List<AivenS3SourceRecord> recordList = sourceRecordIterator.next(); final List<SourceRecord> sourceRecords = createSourceRecords(recordList, s3SourceConfig, keyConverter, - valueConverter, conversionConfig, outputWriter); + valueConverter, conversionConfig, outputWriter, failedObjectKeys); results.addAll(sourceRecords); } @@ -57,7 +64,8 @@ public static List<SourceRecord> processRecords(final Iterator<List<AivenS3Sourc @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRecord> aivenS3SourceRecordList, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, - final Map<String, String> conversionConfig, final OutputWriter outputWriter) { + final Map<String, String> conversionConfig, final OutputWriter outputWriter, + final Set<String> failedObjectKeys) { final List<SourceRecord> sourceRecordList = new ArrayList<>(); for (final AivenS3SourceRecord aivenS3SourceRecord : aivenS3SourceRecordList) { @@ -67,9 +75,14 @@ static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRecord> ai outputWriter.configureValueConverter(conversionConfig, s3SourceConfig); valueConverter.configure(conversionConfig, false); - final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, 
aivenS3SourceRecord.value()); - - sourceRecordList.add(aivenS3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue)); + try { + final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); + sourceRecordList.add(aivenS3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue)); + } catch (DataException e) { + LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + failedObjectKeys.add(aivenS3SourceRecord.getObjectKey()); + throw e; + } } return sourceRecordList; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index c734cd845..071c9f165 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -31,6 +31,7 @@ import java.util.Map; import java.util.NoSuchElementException; import java.util.Optional; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -70,13 +71,13 @@ public final class SourceRecordIterator implements Iterator<List<AivenS3SourceRe private final OutputWriter outputWriter; public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, - final OffsetManager offsetManager, final OutputWriter outputWriter) { + final OffsetManager offsetManager, final OutputWriter outputWriter, final Set<String> failedObjectKeys) { this.s3SourceConfig = s3SourceConfig; this.offsetManager = offsetManager; this.s3Client = s3Client; this.bucketName = bucketName; this.outputWriter = outputWriter; - final FileReader fileReader = new FileReader(s3SourceConfig, bucketName); + final FileReader fileReader = new FileReader(s3SourceConfig, bucketName, failedObjectKeys); try { final List<S3ObjectSummary> chunks = fileReader.fetchObjectSummaries(s3Client); nextFileIterator = chunks.iterator(); @@ -192,7 +193,7 @@ public List<AivenS3SourceRecord> next() { offsetMap.put(OFFSET_KEY, currentRecord.offset()); aivenS3SourceRecord = new AivenS3SourceRecord(partitionMap, offsetMap, currentRecord.topic(), - currentRecord.partition(), currentRecord.key(), currentRecord.value()); + currentRecord.partition(), currentRecord.key(), currentRecord.value(), currentObjectKey); aivenS3SourceRecordList.add(aivenS3SourceRecord); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index 5d6b42efa..eb207dc75 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -173,10 +173,10 @@ void testStop() { private static List<AivenS3SourceRecord> getAivenS3SourceRecords() { final List<AivenS3SourceRecord> aivenS3SourceRecordList = new ArrayList<>(); final AivenS3SourceRecord aivenS3SourceRecord1 = new AivenS3SourceRecord(new HashMap<>(), new HashMap<>(), - "testtopic", 0, new byte[0], new byte[0]); + "testtopic", 0, new byte[0], new byte[0], ""); aivenS3SourceRecordList.add(aivenS3SourceRecord1); final AivenS3SourceRecord aivenS3SourceRecord2 = new AivenS3SourceRecord(new HashMap<>(), new HashMap<>(), - "testtopic", 1, new byte[0], new byte[0]); + "testtopic", 1, new byte[0], new 
byte[0], ""); aivenS3SourceRecordList.add(aivenS3SourceRecord2); return aivenS3SourceRecordList; } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java index 360dfc5a6..e8d56d8c2 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java @@ -56,7 +56,7 @@ public void setUp() { properties = new HashMap<>(); setBasicProperties(); final S3SourceConfig s3SourceConfig = new S3SourceConfig(properties); - fileReader = new FileReader(s3SourceConfig, TEST_BUCKET); + fileReader = new FileReader(s3SourceConfig, TEST_BUCKET, Collections.emptySet()); s3Client = mock(AmazonS3.class); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java index 1831fc610..9e598f3af 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -26,6 +26,7 @@ import static org.mockito.Mockito.when; import static org.mockito.internal.verification.VerificationModeFactory.times; +import java.net.ConnectException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; @@ -69,7 +70,7 @@ void setUp() { } @Test - void testProcessRecordsNoRecords() { + void testProcessRecordsNoRecords() throws ConnectException { when(s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS)).thenReturn(5); when(sourceRecordIterator.hasNext()).thenReturn(false); @@ -81,14 +82,14 @@ void testProcessRecordsNoRecords() { Optional.of(keyConverter), valueConverter, connectorStopped, - outputWriter + outputWriter, Collections.emptySet() ); assertTrue(processedRecords.isEmpty(), "Processed records should be empty when there are no records."); } @Test - void testProcessRecordsWithRecords() { + void testProcessRecordsWithRecords() throws ConnectException { when(s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS)).thenReturn(5); when(sourceRecordIterator.hasNext()).thenReturn(true, false); // One iteration with records @@ -104,7 +105,7 @@ void testProcessRecordsWithRecords() { Optional.of(keyConverter), valueConverter, connectorStopped, - outputWriter + outputWriter, Collections.emptySet() ); assertThat(results.size()).isEqualTo(1); @@ -112,7 +113,7 @@ void testProcessRecordsWithRecords() { } @Test - void testProcessRecordsConnectorStopped() { + void testProcessRecordsConnectorStopped() throws ConnectException { when(s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS)).thenReturn(5); connectorStopped.set(true); // Simulate connector stopped @@ -124,7 +125,7 @@ void testProcessRecordsConnectorStopped() { Optional.of(keyConverter), valueConverter, connectorStopped, - outputWriter + outputWriter, Collections.emptySet() ); assertTrue(processedRecords.isEmpty(), "Processed records should be empty when connector is stopped."); @@ -132,7 +133,7 @@ void testProcessRecordsConnectorStopped() { } @Test - void testCreateSourceRecords() { + void testCreateSourceRecords() throws ConnectException { final AivenS3SourceRecord mockRecord = mock(AivenS3SourceRecord.class); when(mockRecord.getToTopic()).thenReturn("test-topic"); 
when(mockRecord.key()).thenReturn("mock-key".getBytes(StandardCharsets.UTF_8)); @@ -144,7 +145,7 @@ void testCreateSourceRecords() { final List<AivenS3SourceRecord> recordList = Collections.singletonList(mockRecord); final List<SourceRecord> sourceRecords = RecordProcessor.createSourceRecords(recordList, s3SourceConfig, - Optional.of(keyConverter), valueConverter, new HashMap<>(), outputWriter); + Optional.of(keyConverter), valueConverter, new HashMap<>(), outputWriter, Collections.emptySet()); assertThat(sourceRecords.size()).isEqualTo(1); } From db7187f0a91dc02364e2885096af4126b7c3f7bc Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Thu, 10 Oct 2024 12:20:32 +0200 Subject: [PATCH 38/90] Refactor writer classes --- .../connect/s3/source/IntegrationBase.java | 2 +- .../connect/s3/source/output/AvroWriter.java | 28 +++--- .../s3/source/output/ByteArrayWriter.java | 20 ++-- .../connect/s3/source/output/JsonWriter.java | 36 +++---- .../connect/s3/source/output/OutputUtils.java | 37 ------- .../s3/source/output/OutputWriter.java | 12 +-- .../s3/source/output/ParquetWriter.java | 26 +++-- .../s3/source/utils/SourceRecordIterator.java | 36 ++++++- .../s3/source/output/AvroWriterTest.java | 19 +--- .../s3/source/output/JsonWriterTest.java | 99 +++++++++++++++++++ .../s3/source/output/ParquetWriterTest.java | 33 ++----- 11 files changed, 192 insertions(+), 156 deletions(-) create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonWriterTest.java diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index 0edb23772..d89d85473 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -173,7 +173,7 @@ static List<GenericRecord> consumeAvroMessages(final String topic, final int exp while (recordsList.size() < expectedMessageCount) { final ConsumerRecords<String, GenericRecord> records = consumer.poll(500L); for (final ConsumerRecord<String, GenericRecord> record : records) { - recordsList.add(record.value()); // Add the GenericRecord to the list + recordsList.add(record.value()); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java index d20548bda..cfce865ca 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java @@ -21,14 +21,11 @@ import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Map; -import java.util.Optional; - -import org.apache.kafka.clients.consumer.ConsumerRecord; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import com.amazonaws.util.IOUtils; import org.apache.avro.file.DataFileReader; @@ -36,7 +33,6 @@ import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumReader; -import org.apache.avro.io.DecoderFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; 
@@ -50,21 +46,19 @@ public void configureValueConverter(final Map<String, String> config, final S3So } @Override - @SuppressWarnings("PMD.ExcessiveParameterList") - public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, - final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, - final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, - final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets, - final Map<String, Object> partitionMap) { + public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition) { final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); - DecoderFactory.get().binaryDecoder(inputStream, null); - final List<GenericRecord> records = readAvroRecords(inputStream, datumReader); - OutputUtils.buildConsumerRecordList(optionalKeyBytes, topic, consumerRecordList, s3SourceConfig, topicPartition, - startOffset, offsetManager, currentOffsets, records, partitionMap); + return readAvroRecords(inputStream, datumReader); + } + + @Override + public byte[] getValueBytes(final Object record, final String topic, final S3SourceConfig s3SourceConfig) { + return OutputUtils.serializeAvroRecordToBytes(Collections.singletonList((GenericRecord) record), topic, + s3SourceConfig); } - List<GenericRecord> readAvroRecords(final InputStream content, final DatumReader<GenericRecord> datumReader) { - final List<GenericRecord> records = new ArrayList<>(); + List<Object> readAvroRecords(final InputStream content, final DatumReader<GenericRecord> datumReader) { + final List<Object> records = new ArrayList<>(); try (SeekableByteArrayInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { reader.forEach(records::add); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java index 0120a3771..e1052ac8c 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java @@ -20,12 +20,8 @@ import java.io.InputStream; import java.util.List; import java.util.Map; -import java.util.Optional; - -import org.apache.kafka.clients.consumer.ConsumerRecord; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import com.amazonaws.util.IOUtils; import org.slf4j.Logger; @@ -40,17 +36,17 @@ public void configureValueConverter(final Map<String, String> config, final S3So } @Override - @SuppressWarnings("PMD.ExcessiveParameterList") - public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, - final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, - final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, - final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets, - final Map<String, Object> partitionMap) { + public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition) { + return List.of(inputStream); + } + + @Override + public byte[] getValueBytes(final Object record, final String topic, final 
S3SourceConfig s3SourceConfig) { try { - consumerRecordList.add(OutputUtils.getConsumerRecord(optionalKeyBytes, IOUtils.toByteArray(inputStream), - topic, topicPartition, offsetManager, currentOffsets, startOffset, partitionMap)); + return IOUtils.toByteArray((InputStream) record); } catch (IOException e) { LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + return new byte[0]; } } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java index cbaa7518c..8d4b91482 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java @@ -20,15 +20,13 @@ import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; import java.util.List; import java.util.Map; -import java.util.Optional; - -import org.apache.kafka.clients.consumer.ConsumerRecord; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.utils.OffsetManager; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; @@ -42,27 +40,25 @@ public void configureValueConverter(final Map<String, String> config, final S3So } @Override - @SuppressWarnings("PMD.ExcessiveParameterList") - public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, - final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, - final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, - final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets, - final Map<String, Object> partitionMap) { - final byte[] valueBytes = serializeJsonData(inputStream); - if (valueBytes.length > 0) { - consumerRecordList.add(OutputUtils.getConsumerRecord(optionalKeyBytes, valueBytes, topic, topicPartition, - offsetManager, currentOffsets, startOffset, partitionMap)); - } - } - - private byte[] serializeJsonData(final InputStream inputStream) { + public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition) { + final List<Object> jsonNodeList = new ArrayList<>(); final JsonNode jsonNode; try { jsonNode = objectMapper.readTree(inputStream); - return objectMapper.writeValueAsBytes(jsonNode); + jsonNodeList.add(jsonNode); } catch (IOException e) { LOGGER.error("Error in reading s3 object stream " + e.getMessage()); } - return new byte[0]; + return jsonNodeList; + } + + @Override + public byte[] getValueBytes(final Object record, final String topic, final S3SourceConfig s3SourceConfig) { + try { + return objectMapper.writeValueAsBytes(record); + } catch (JsonProcessingException e) { + LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + return new byte[0]; + } } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java index c185935dc..a075a5c76 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java @@ -25,12 +25,8 @@ import java.util.Collections; import java.util.List; 
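Taken together, these writer changes reduce the contract to two steps: `getRecords()` pulls individual records out of an S3 object stream, and `getValueBytes()` serializes one record for Kafka, with the offset bookkeeping moving into `SourceRecordIterator`. A minimal stand-in for that contract (the interface below is a simplified local copy, not the connector's `OutputWriter`) could look like:

```java
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public final class OutputWriterContractSketch {

    // Simplified local copy of the refactored contract: extract records first, serialize each one later.
    interface Writer {
        List<Object> getRecords(InputStream inputStream, String topic, int topicPartition);

        byte[] getValueBytes(Object record, String topic);
    }

    // A line-per-record implementation, standing in for the Avro/JSON/Parquet/byte-array writers.
    static final class LineWriter implements Writer {
        @Override
        public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition) {
            final List<Object> records = new ArrayList<>();
            new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8)).lines()
                    .forEach(records::add);
            return records;
        }

        @Override
        public byte[] getValueBytes(final Object record, final String topic) {
            return ((String) record).getBytes(StandardCharsets.UTF_8);
        }
    }

    public static void main(final String[] args) {
        final Writer writer = new LineWriter();
        final InputStream s3ObjectStream = new ByteArrayInputStream("a\nb\nc".getBytes(StandardCharsets.UTF_8));
        for (final Object record : writer.getRecords(s3ObjectStream, "test-topic", 0)) {
            // Each extracted record becomes one Kafka value; offsets are assigned by the calling iterator.
            System.out.println(new String(writer.getValueBytes(record, "test-topic"), StandardCharsets.UTF_8));
        }
    }
}
```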
import java.util.Map; -import java.util.Optional; - -import org.apache.kafka.clients.consumer.ConsumerRecord; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import io.confluent.kafka.serializers.KafkaAvroSerializer; import org.apache.avro.generic.GenericRecord; @@ -63,37 +59,4 @@ static byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, } return new byte[0]; } - - @SuppressWarnings("PMD.ExcessiveParameterList") - static void buildConsumerRecordList(final Optional<byte[]> optionalKeyBytes, final String topic, - final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, final S3SourceConfig s3SourceConfig, - final int topicPartition, final long startOffset, final OffsetManager offsetManager, - final Map<Map<String, Object>, Long> currentOffsets, final List<GenericRecord> records, - final Map<String, Object> partitionMap) { - for (final GenericRecord record : records) { - final byte[] valueBytes = OutputUtils.serializeAvroRecordToBytes(Collections.singletonList(record), topic, - s3SourceConfig); - if (valueBytes.length > 0) { - consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, topic, topicPartition, - offsetManager, currentOffsets, startOffset, partitionMap)); - } - } - } - - static ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> key, final byte[] value, - final String topic, final int topicPartition, final OffsetManager offsetManager, - final Map<Map<String, Object>, Long> currentOffsets, final long startOffset, - final Map<String, Object> partitionMap) { - - long currentOffset; - - if (offsetManager.getOffsets().containsKey(partitionMap)) { - currentOffset = offsetManager.incrementAndUpdateOffsetMap(partitionMap); - } else { - currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); - currentOffsets.put(partitionMap, currentOffset + 1); - } - - return new ConsumerRecord<>(topic, topicPartition, currentOffset, key.orElse(null), value); - } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java index 68c20d537..af648564e 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java @@ -19,12 +19,8 @@ import java.io.InputStream; import java.util.List; import java.util.Map; -import java.util.Optional; - -import org.apache.kafka.clients.consumer.ConsumerRecord; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,9 +31,7 @@ public interface OutputWriter { void configureValueConverter(Map<String, String> config, S3SourceConfig s3SourceConfig); - @SuppressWarnings("PMD.ExcessiveParameterList") - void handleValueData(Optional<byte[]> optionalKeyBytes, InputStream inputStream, String topic, - List<ConsumerRecord<byte[], byte[]>> consumerRecordList, S3SourceConfig s3SourceConfig, int topicPartition, - long startOffset, OffsetManager offsetManager, Map<Map<String, Object>, Long> currentOffsets, - Map<String, Object> partitionMap); + List<Object> getRecords(InputStream inputStream, String topic, int topicPartition); + + byte[] getValueBytes(Object record, String topic, S3SourceConfig s3SourceConfig); } diff --git 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java index be132eb8a..fd5ab11b0 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java @@ -26,14 +26,11 @@ import java.nio.file.Path; import java.time.Instant; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Map; -import java.util.Optional; - -import org.apache.kafka.clients.consumer.ConsumerRecord; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import org.apache.avro.generic.GenericRecord; import org.apache.commons.compress.utils.IOUtils; @@ -53,22 +50,21 @@ public void configureValueConverter(final Map<String, String> config, final S3So } @Override - @SuppressWarnings("PMD.ExcessiveParameterList") - public void handleValueData(final Optional<byte[]> optionalKeyBytes, final InputStream inputStream, - final String topic, final List<ConsumerRecord<byte[], byte[]>> consumerRecordList, - final S3SourceConfig s3SourceConfig, final int topicPartition, final long startOffset, - final OffsetManager offsetManager, final Map<Map<String, Object>, Long> currentOffsets, - final Map<String, Object> partitionMap) { - final List<GenericRecord> records = getRecords(inputStream, topic, topicPartition); - OutputUtils.buildConsumerRecordList(optionalKeyBytes, topic, consumerRecordList, s3SourceConfig, topicPartition, - startOffset, offsetManager, currentOffsets, records, partitionMap); + public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition) { + return getParquetRecords(inputStream, topic, topicPartition); + } + + @Override + public byte[] getValueBytes(final Object record, final String topic, final S3SourceConfig s3SourceConfig) { + return OutputUtils.serializeAvroRecordToBytes(Collections.singletonList((GenericRecord) record), topic, + s3SourceConfig); } - public static List<GenericRecord> getRecords(final InputStream inputStream, final String topic, + private List<Object> getParquetRecords(final InputStream inputStream, final String topic, final int topicPartition) { final String timestamp = String.valueOf(Instant.now().toEpochMilli()); File parquetFile; - final var records = new ArrayList<GenericRecord>(); + final List<Object> records = new ArrayList<>(); try { parquetFile = File.createTempFile(topic + "_" + topicPartition + "_" + timestamp, ".parquet"); } catch (IOException e) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 071c9f165..73db8d4fa 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -29,7 +29,6 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.NoSuchElementException; import java.util.Optional; import java.util.Set; import java.util.regex.Matcher; @@ -44,12 +43,15 @@ import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.S3Object; import 
com.amazonaws.services.s3.model.S3ObjectSummary; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Iterator that processes S3 files and creates Kafka source records. Supports different output formats (Avro, JSON, * Parquet). */ public final class SourceRecordIterator implements Iterator<List<AivenS3SourceRecord>> { + private static final Logger LOGGER = LoggerFactory.getLogger(SourceRecordIterator.class); public static final String PATTERN_TOPIC_KEY = "topicName"; public static final String PATTERN_PARTITION_KEY = "partitionId"; public static final String OFFSET_KEY = "offset"; @@ -138,12 +140,32 @@ private List<ConsumerRecord<byte[], byte[]>> readNext() { final Optional<byte[]> optionalKeyBytes = Optional.ofNullable(currentObjectKey) .map(k -> k.getBytes(StandardCharsets.UTF_8)); final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = new ArrayList<>(); - outputWriter.handleValueData(optionalKeyBytes, valueInputStream, topic, consumerRecordList, - s3SourceConfig, topicPartition, startOffset, offsetManager, currentOffsets, partitionMap); + for (final Object record : outputWriter.getRecords(valueInputStream, topic, topicPartition)) { + final byte[] valueBytes = outputWriter.getValueBytes(record, topic, s3SourceConfig); + consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, topic, topicPartition, + offsetManager, currentOffsets, startOffset, partitionMap)); + } return consumerRecordList; } + private ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> key, final byte[] value, + final String topic, final int topicPartition, final OffsetManager offsetManager, + final Map<Map<String, Object>, Long> currentOffsets, final long startOffset, + final Map<String, Object> partitionMap) { + + long currentOffset; + + if (offsetManager.getOffsets().containsKey(partitionMap)) { + currentOffset = offsetManager.incrementAndUpdateOffsetMap(partitionMap); + } else { + currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); + currentOffsets.put(partitionMap, currentOffset + 1); + } + + return new ConsumerRecord<>(topic, topicPartition, currentOffset, key.orElse(null), value); + } + @Override public boolean hasNext() { return !nextRecord.isEmpty(); @@ -152,7 +174,9 @@ public boolean hasNext() { @Override public List<ConsumerRecord<byte[], byte[]>> next() { if (nextRecord.isEmpty()) { - throw new NoSuchElementException(); + LOGGER.error("May be error in reading s3 object " + currentObjectKey); + return Collections.emptyList(); + // throw new NoSuchElementException(); } final List<ConsumerRecord<byte[], byte[]>> currentRecord = nextRecord; nextRecord = Collections.emptyList(); @@ -175,7 +199,9 @@ public List<AivenS3SourceRecord> next() { final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = recordIterator.next(); if (consumerRecordList.isEmpty()) { - throw new NoSuchElementException(); + LOGGER.error("May be error in reading s3 object " + currentObjectKey); + return Collections.emptyList(); + // throw new NoSuchElementException(); } final List<AivenS3SourceRecord> aivenS3SourceRecordList = new ArrayList<>(); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java index ab077cbf4..9e1c6e958 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java +++ 
b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java @@ -18,7 +18,6 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.when; import java.io.ByteArrayInputStream; @@ -26,13 +25,9 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; -import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Optional; - -import org.apache.kafka.clients.consumer.ConsumerRecord; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; @@ -77,17 +72,13 @@ void testConfigureValueConverter() { } @Test - void testHandleValueData() throws Exception { + void testReadAvroRecordsInvalidData() { final InputStream inputStream = new ByteArrayInputStream("mock-avro-data".getBytes(StandardCharsets.UTF_8)); - final Optional<byte[]> optionalKeyBytes = Optional.of("key".getBytes(StandardCharsets.UTF_8)); - final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = new ArrayList<>(); - final Map<Map<String, Object>, Long> currentOffsets = new HashMap<>(); - final Map<String, Object> partitionMap = new HashMap<>(); - avroWriter.handleValueData(optionalKeyBytes, inputStream, "test-topic", consumerRecordList, s3SourceConfig, 0, - 0L, offsetManager, currentOffsets, partitionMap); + final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); + final List<Object> records = avroWriter.readAvroRecords(inputStream, datumReader); - assertTrue(consumerRecordList.isEmpty(), "Consumer record list should be populated by the OutputUtils method."); + assertThat(records.size()).isEqualTo(0); } @Test @@ -96,7 +87,7 @@ void testReadAvroRecords() throws Exception { final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); - final List<GenericRecord> records = avroWriter.readAvroRecords(inputStream, datumReader); + final List<Object> records = avroWriter.readAvroRecords(inputStream, datumReader); assertThat(records.size()).isEqualTo(2); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonWriterTest.java new file mode 100644 index 000000000..e33a2da1e --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonWriterTest.java @@ -0,0 +1,99 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source.output; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMAS_ENABLE; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.mock; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +final class JsonWriterTest { + + JsonWriter jsonWriter; + + @Mock + OffsetManager offsetManager; + + @BeforeEach + void setUp() { + MockitoAnnotations.openMocks(this); + jsonWriter = new JsonWriter(); + } + + @Test + void testConfigureValueConverter() { + final Map<String, String> config = new HashMap<>(); + final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); + + jsonWriter.configureValueConverter(config, s3SourceConfig); + assertEquals("false", config.get(SCHEMAS_ENABLE), "SCHEMAS_ENABLE should be set to false"); + } + + @Test + void testHandleValueDataWithValidJson() { + final InputStream validJsonInputStream = new ByteArrayInputStream( + "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); + + final List<Object> jsonNodes = jsonWriter.getRecords(validJsonInputStream, "testtopic", 1); + + assertThat(jsonNodes.size()).isEqualTo(1); + } + + @Test + void testHandleValueDataWithInvalidJson() { + final InputStream invalidJsonInputStream = new ByteArrayInputStream( + "invalid-json".getBytes(StandardCharsets.UTF_8)); + + final List<Object> jsonNodes = jsonWriter.getRecords(invalidJsonInputStream, "testtopic", 1); + + assertThat(jsonNodes.size()).isEqualTo(0); + } + + @Test + void testSerializeJsonDataValid() throws IOException { + final InputStream validJsonInputStream = new ByteArrayInputStream( + "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); + final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); + final List<Object> jsonNodes = jsonWriter.getRecords(validJsonInputStream, "testtopic", 1); + + final byte[] serializedData = jsonWriter.getValueBytes(jsonNodes.get(0), "testtopic", s3SourceConfig); + + final ObjectMapper objectMapper = new ObjectMapper(); + + final JsonNode expectedData = objectMapper.readTree(serializedData); + + assertThat(expectedData.get("key").asText()).isEqualTo("value"); + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java index 8fc7f5536..88209a693 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java @@ -17,7 +17,6 @@ package io.aiven.kafka.connect.s3.source.output; import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.Mockito.mock; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -25,16 +24,9 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import 
java.nio.file.Path; -import java.util.ArrayList; import java.util.List; -import java.util.Map; -import java.util.Optional; -import org.apache.kafka.clients.consumer.ConsumerRecord; - -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; -import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import com.amazonaws.util.IOUtils; import org.apache.avro.generic.GenericRecord; @@ -43,20 +35,10 @@ final class ParquetWriterTest { private ParquetWriter parquetWriter; - private S3SourceConfig s3SourceConfig; - private OffsetManager offsetManager; - private List<ConsumerRecord<byte[], byte[]>> consumerRecordList; - private Map<Map<String, Object>, Long> currentOffsets; - private Map<String, Object> partitionMap; @BeforeEach public void setUp() { parquetWriter = new ParquetWriter(); - s3SourceConfig = mock(S3SourceConfig.class); - offsetManager = mock(OffsetManager.class); - consumerRecordList = new ArrayList<>(); - currentOffsets = mock(Map.class); - partitionMap = mock(Map.class); } @Test @@ -66,12 +48,9 @@ void testHandleValueDataWithZeroBytes() { final String topic = "test-topic"; final int topicPartition = 0; - final long startOffset = 100L; - - parquetWriter.handleValueData(Optional.empty(), inputStream, topic, consumerRecordList, s3SourceConfig, - topicPartition, startOffset, offsetManager, currentOffsets, partitionMap); + final List<Object> recs = parquetWriter.getRecords(inputStream, topic, topicPartition); - assertThat(consumerRecordList).isEmpty(); + assertThat(recs).isEmpty(); } @Test @@ -82,10 +61,12 @@ void testGetRecordsWithValidData() throws Exception { final String topic = "test-topic"; final int topicPartition = 0; - final List<GenericRecord> records = ParquetWriter.getRecords(inputStream, topic, topicPartition); + final List<Object> records = parquetWriter.getRecords(inputStream, topic, topicPartition); assertThat(records).isNotEmpty(); - assertThat(records).extracting(record -> record.get("name").toString()).contains("name1").contains("name2"); + assertThat(records).extracting(record -> ((GenericRecord) record).get("name").toString()) + .contains("name1") + .contains("name2"); } @Test @@ -96,7 +77,7 @@ void testGetRecordsWithInvalidData() { final String topic = "test-topic"; final int topicPartition = 0; - final List<GenericRecord> records = ParquetWriter.getRecords(inputStream, topic, topicPartition); + final List<Object> records = parquetWriter.getRecords(inputStream, topic, topicPartition); assertThat(records).isEmpty(); } From e668c5fc4da44dd34cc8ba97d6aff9bd3e1b8436 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Fri, 11 Oct 2024 17:03:52 +0200 Subject: [PATCH 39/90] fix offset storage --- .../connect/s3/source/IntegrationTest.java | 59 ++++++++++++++++++- .../kafka/connect/s3/source/S3SourceTask.java | 10 +++- .../s3/source/config/S3SourceConfig.java | 4 +- .../s3/source/utils/AivenS3SourceRecord.java | 6 +- .../connect/s3/source/utils/ConnectUtils.java | 3 +- .../connect/s3/source/utils/FileReader.java | 36 ++++++++++- .../s3/source/utils/OffsetManager.java | 14 ++++- .../s3/source/utils/RecordProcessor.java | 10 +++- .../s3/source/utils/SourceRecordIterator.java | 15 +++-- .../s3/source/utils/FileReaderTest.java | 11 +++- .../s3/source/utils/RecordProcessorTest.java | 13 ++-- 11 files changed, 150 insertions(+), 31 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java 
b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index d58eb10cc..2ae4809d7 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -215,7 +215,51 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); // Poll Avro messages from the Kafka topic and deserialize them - final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 2, KAFKA_CONTAINER, + final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 4, KAFKA_CONTAINER, + SCHEMA_REGISTRY.getSchemaRegistryUrl()); // Ensure this method deserializes Avro + + // Verify that the correct data is read from the S3 bucket and pushed to Kafka + assertThat(records).extracting(record -> record.get("message").toString()) + .contains("Hello, Kafka Connect S3 Source! object 1") + .contains("Hello, Kafka Connect S3 Source! object 2"); + assertThat(records).extracting(record -> record.get("id").toString()).contains("1").contains("2"); + } + + @Test + void avroTestRandomFiles(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { + final var topicName = IntegrationBase.topicName(testInfo); + final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.AVRO.getValue()); + connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); + connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); + connectorConfig.put("value.converter.schema.registry.url", SCHEMA_REGISTRY.getSchemaRegistryUrl()); + connectorConfig.put(VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); + + connectRunner.createConnector(connectorConfig); + + // Define Avro schema + final String schemaJson = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" + + " \"fields\": [\n" + " {\"name\": \"message\", \"type\": \"string\"},\n" + + " {\"name\": \"id\", \"type\": \"int\"}\n" + " ]\n" + "}"; + final Schema.Parser parser = new Schema.Parser(); + final Schema schema = parser.parse(schemaJson); + + final ByteArrayOutputStream outputStream1 = getAvroRecord(schema, 1); + final ByteArrayOutputStream outputStream2 = getAvroRecord(schema, 2); + + writeToS3GeneratedKey(topicName, outputStream1.toByteArray()); + writeToS3(topicName, outputStream2.toByteArray(), "00000"); + writeToS3GeneratedKey(topicName, outputStream2.toByteArray()); + writeToS3GeneratedKey(topicName, outputStream2.toByteArray()); + + final List<String> objects = testBucketAccessor.listObjects(); + assertThat(objects.size()).isEqualTo(4); + + // Verify that the connector is correctly set up + assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); + + // Poll Avro messages from the Kafka topic and deserialize them + final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 4, KAFKA_CONTAINER, SCHEMA_REGISTRY.getSchemaRegistryUrl()); // Ensure this method deserializes Avro // Verify that the correct data is read from the S3 bucket and pushed to Kafka @@ -307,6 +351,19 @@ private static void writeToS3(final String topicName, final byte[] testDataBytes } } + private static void writeToS3GeneratedKey(final String topicName, final byte[] 
testDataBytes) throws IOException { + final String filePrefix = topicName + System.currentTimeMillis(); + final String fileSuffix = ".txt"; + + final Path testFilePath = File.createTempFile(filePrefix, fileSuffix).toPath(); + try { + Files.write(testFilePath, testDataBytes); + saveToS3(TEST_BUCKET_NAME, "", filePrefix + fileSuffix, testFilePath.toFile()); + } finally { + Files.delete(testFilePath); + } + } + private Map<String, String> getConfig(final Map<String, String> config, final String topics) { config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); config.put(AWS_ACCESS_KEY_ID_CONFIG, S3_ACCESS_KEY_ID); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index de3ae2599..34d099104 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -61,7 +61,9 @@ public class S3SourceTask extends SourceTask { public static final String BUCKET = "bucket"; public static final String TOPIC = "topic"; - public static final String PARTITION = "partition"; + + public static final String OBJECT_KEY = "object_key"; + public static final String PARTITION = "topicPartition"; private static final long S_3_POLL_INTERVAL_MS = 10_000L; private static final long ERROR_BACKOFF = 1000L; @@ -86,6 +88,8 @@ public class S3SourceTask extends SourceTask { private final Object pollLock = new Object(); private final Set<String> failedObjectKeys = new HashSet<>(); + private OffsetManager offsetManager; + @SuppressWarnings("PMD.UnnecessaryConstructor") public S3SourceTask() { super(); @@ -104,6 +108,7 @@ public void start(final Map<String, String> props) { initializeS3Client(); this.s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); this.outputWriter = OutputWriterFactory.getWriter(s3SourceConfig); + offsetManager = new OffsetManager(context, s3SourceConfig); prepareReaderFromOffsetStorageReader(); this.taskInitialized = true; } @@ -127,7 +132,6 @@ private void initializeS3Client() { } private void prepareReaderFromOffsetStorageReader() { - final OffsetManager offsetManager = new OffsetManager(context, s3SourceConfig); sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, s3Client, this.s3Bucket, offsetManager, this.outputWriter, failedObjectKeys); } @@ -180,7 +184,7 @@ private List<SourceRecord> extractSourceRecords(final List<SourceRecord> results return results; } return RecordProcessor.processRecords(sourceRecordIterator, results, s3SourceConfig, keyConverter, - valueConverter, connectorStopped, this.outputWriter, failedObjectKeys); + valueConverter, connectorStopped, this.outputWriter, failedObjectKeys, offsetManager); } private void waitForObjects() throws InterruptedException { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 4d1e8d853..e30f23d8f 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -149,8 +149,8 @@ private static void addSchemaRegistryGroup(final ConfigDef configDef) { GROUP_OTHER, srCounter++, // NOPMD ConfigDef.Width.NONE, OUTPUT_FORMAT_KEY); - 
configDef.define(VALUE_SERIALIZER, ConfigDef.Type.CLASS, "io.confluent.kafka.serializers.KafkaAvroSerializer", - ConfigDef.Importance.MEDIUM, "Value serializer", GROUP_OTHER, srCounter++, // NOPMD + configDef.define(VALUE_SERIALIZER, ConfigDef.Type.CLASS, null, ConfigDef.Importance.MEDIUM, "Value serializer", + GROUP_OTHER, srCounter++, // NOPMD // UnusedAssignment ConfigDef.Width.NONE, VALUE_SERIALIZER); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java index ae8baf297..13b325506 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java @@ -29,14 +29,14 @@ public class AivenS3SourceRecord { private final Map<String, Object> partitionMap; private final Map<String, Object> offsetMap; private final String toTopic; - private final int topicPartition; + private final Integer topicPartition; private final byte[] recordKey; private final byte[] recordValue; private final String objectKey; public AivenS3SourceRecord(final Map<String, Object> partitionMap, final Map<String, Object> offsetMap, - final String toTopic, final int topicPartition, final byte[] recordKey, final byte[] recordValue, + final String toTopic, final Integer topicPartition, final byte[] recordKey, final byte[] recordValue, final String objectKey) { this.partitionMap = new HashMap<>(partitionMap); this.offsetMap = new HashMap<>(offsetMap); @@ -60,7 +60,7 @@ public String getToTopic() { return toTopic; } - public int partition() { + public Integer partition() { return topicPartition; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java index 6c7d1e3d3..9420834da 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java @@ -28,11 +28,12 @@ final public class ConnectUtils { private ConnectUtils() { // hidden } - static Map<String, Object> getPartitionMap(final String topicName, final int defaultPartitionId, + static Map<String, Object> getPartitionMap(final String topicName, final Integer defaultPartitionId, final String bucketName) { final Map<String, Object> partitionMap = new HashMap<>(); partitionMap.put(BUCKET, bucketName); partitionMap.put(TOPIC, topicName); + partitionMap.put(PARTITION, defaultPartitionId); return partitionMap; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index 618c68a82..903de65d2 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -16,12 +16,15 @@ package io.aiven.kafka.connect.s3.source.utils; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import 
java.util.List; +import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -31,20 +34,26 @@ import com.amazonaws.services.s3.model.ListObjectsV2Request; import com.amazonaws.services.s3.model.ListObjectsV2Result; import com.amazonaws.services.s3.model.S3ObjectSummary; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class FileReader { + private static final Logger LOGGER = LoggerFactory.getLogger(FileReader.class); public static final int PAGE_SIZE_FACTOR = 2; private final S3SourceConfig s3SourceConfig; private final String bucketName; + private final OffsetManager offsetManager; + private final Set<String> failedObjectKeys; - public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName, - final Set<String> failedObjectKeys) { + public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName, final Set<String> failedObjectKeys, + final OffsetManager offsetManager) { this.s3SourceConfig = s3SourceConfig; this.bucketName = bucketName; this.failedObjectKeys = new HashSet<>(failedObjectKeys); + this.offsetManager = offsetManager; } @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") @@ -69,7 +78,28 @@ List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOExc .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey())) .collect(Collectors.toList()); - allSummaries.addAll(filteredSummaries); // Add the filtered summaries to the main list + final Map<Map<String, Object>, Map<String, Object>> processedOffsets = offsetManager.getOffsets(); + LOGGER.info(processedOffsets + " processedOffsets"); + + final List<S3ObjectSummary> filteredSummariesNewList = filteredSummaries.stream() + .filter(s3ObjectSummary -> { + for (final Map.Entry<Map<String, Object>, Map<String, Object>> mapMapEntry : processedOffsets + .entrySet()) { + if (mapMapEntry.getKey().get(BUCKET).equals(bucketName) + // && mapMapEntry.getKey().get(OBJECT_KEY).equals(s3ObjectSummary.getKey()) + && s3ObjectSummary.getKey().equals(mapMapEntry.getValue().get(OBJECT_KEY))) { + return false; + } + } + return true; + }) + .collect(Collectors.toList()); + + LOGGER.info(" **** filteredSummariesNewList **** " + filteredSummariesNewList); + + allSummaries.addAll(filteredSummariesNewList); // Add the filtered summaries to the main list + + allSummaries.forEach(objSummary -> LOGGER.info(" ******* FR key ******** " + objSummary.getKey())); // Check if there are more objects to fetch continuationToken = objectListing.getNextContinuationToken(); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index 92a72593a..147903bf9 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -33,7 +33,12 @@ import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + public class OffsetManager { + + private static final Logger LOGGER = LoggerFactory.getLogger(OffsetManager.class); private final Map<Map<String, Object>, Map<String, Object>> offsets; public OffsetManager(final SourceTaskContext context, final S3SourceConfig s3SourceConfig) { @@ -46,10 +51,12 @@ public OffsetManager(final SourceTaskContext context, final S3SourceConfig s3Sou final Map<Map<String, Object>, Map<String, Object>> 
offsetMap = context.offsetStorageReader() .offsets(partitionKeys); + LOGGER.info(" ********** offsetMap ***** " + offsetMap); this.offsets = offsetMap.entrySet() .stream() .filter(e -> e.getValue() != null) .collect(toMap(entry -> new HashMap<>(entry.getKey()), entry -> new HashMap<>(entry.getValue()))); + LOGGER.info(" ********** offsets ***** " + offsets); } public Map<Map<String, Object>, Map<String, Object>> getOffsets() { @@ -58,16 +65,21 @@ public Map<Map<String, Object>, Map<String, Object>> getOffsets() { public long incrementAndUpdateOffsetMap(final Map<String, Object> partitionMap) { if (offsets.containsKey(partitionMap)) { - final Map<String, Object> offsetValue = offsets.get(partitionMap); + final Map<String, Object> offsetValue = new HashMap<>(offsets.get(partitionMap)); if (offsetValue.containsKey(OFFSET_KEY)) { final long newOffsetVal = (long) offsetValue.get(OFFSET_KEY) + 1L; offsetValue.put(OFFSET_KEY, newOffsetVal); + offsets.put(partitionMap, offsetValue); return newOffsetVal; } } return 0L; } + void updateCurrentOffsets(final Map<String, Object> partitionMap, final Map<String, Object> offsetValueMap) { + offsets.put(partitionMap, offsetValueMap); + } + private static Set<Integer> parsePartitions(final S3SourceConfig s3SourceConfig) { final String partitionString = s3SourceConfig.getString(S3SourceConfig.TARGET_TOPIC_PARTITIONS); return Arrays.stream(partitionString.split(",")).map(Integer::parseInt).collect(Collectors.toSet()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index ccaab7f07..b590b3c16 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -46,7 +46,8 @@ private RecordProcessor() { public static List<SourceRecord> processRecords(final Iterator<List<AivenS3SourceRecord>> sourceRecordIterator, final List<SourceRecord> results, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, - final AtomicBoolean connectorStopped, final OutputWriter outputWriter, final Set<String> failedObjectKeys) { + final AtomicBoolean connectorStopped, final OutputWriter outputWriter, final Set<String> failedObjectKeys, + final OffsetManager offsetManager) { final Map<String, String> conversionConfig = new HashMap<>(); final int maxPollRecords = s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS); @@ -54,7 +55,7 @@ public static List<SourceRecord> processRecords(final Iterator<List<AivenS3Sourc for (int i = 0; sourceRecordIterator.hasNext() && i < maxPollRecords && !connectorStopped.get(); i++) { final List<AivenS3SourceRecord> recordList = sourceRecordIterator.next(); final List<SourceRecord> sourceRecords = createSourceRecords(recordList, s3SourceConfig, keyConverter, - valueConverter, conversionConfig, outputWriter, failedObjectKeys); + valueConverter, conversionConfig, outputWriter, failedObjectKeys, offsetManager); results.addAll(sourceRecords); } @@ -65,10 +66,11 @@ public static List<SourceRecord> processRecords(final Iterator<List<AivenS3Sourc static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRecord> aivenS3SourceRecordList, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, final Map<String, String> conversionConfig, final 
OutputWriter outputWriter, - final Set<String> failedObjectKeys) { + final Set<String> failedObjectKeys, final OffsetManager offsetManager) { final List<SourceRecord> sourceRecordList = new ArrayList<>(); for (final AivenS3SourceRecord aivenS3SourceRecord : aivenS3SourceRecordList) { + LOGGER.info(" ******* CSR key ******** " + aivenS3SourceRecord.getObjectKey()); final String topic = aivenS3SourceRecord.getToTopic(); final Optional<SchemaAndValue> keyData = keyConverter .map(c -> c.toConnectData(topic, aivenS3SourceRecord.key())); @@ -77,6 +79,8 @@ static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRecord> ai valueConverter.configure(conversionConfig, false); try { final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); + offsetManager.updateCurrentOffsets(aivenS3SourceRecord.getPartitionMap(), + aivenS3SourceRecord.getOffsetMap()); sourceRecordList.add(aivenS3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue)); } catch (DataException e) { LOGGER.error("Error in reading s3 object stream " + e.getMessage()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 73db8d4fa..535b4f8a1 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -16,9 +16,7 @@ package io.aiven.kafka.connect.s3.source.utils; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.PARTITION; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.TOPIC; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; import java.io.IOException; import java.io.InputStream; @@ -79,7 +77,7 @@ public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 this.s3Client = s3Client; this.bucketName = bucketName; this.outputWriter = outputWriter; - final FileReader fileReader = new FileReader(s3SourceConfig, bucketName, failedObjectKeys); + final FileReader fileReader = new FileReader(s3SourceConfig, bucketName, failedObjectKeys, offsetManager); try { final List<S3ObjectSummary> chunks = fileReader.fetchObjectSummaries(s3Client); nextFileIterator = chunks.iterator(); @@ -157,8 +155,10 @@ private ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> long currentOffset; if (offsetManager.getOffsets().containsKey(partitionMap)) { + LOGGER.info("***** offsetManager.getOffsets() ***** " + offsetManager.getOffsets()); currentOffset = offsetManager.incrementAndUpdateOffsetMap(partitionMap); } else { + LOGGER.info("Into else block ..."); currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); currentOffsets.put(partitionMap, currentOffset + 1); } @@ -209,14 +209,13 @@ public List<AivenS3SourceRecord> next() { Map<String, Object> offsetMap; Map<String, Object> partitionMap; for (final ConsumerRecord<byte[], byte[]> currentRecord : consumerRecordList) { - partitionMap = new HashMap<>(); - partitionMap.put(BUCKET, bucketName); - partitionMap.put(TOPIC, currentRecord.topic()); - partitionMap.put(PARTITION, currentRecord.partition()); + + partitionMap = ConnectUtils.getPartitionMap(currentRecord.topic(), currentRecord.partition(), bucketName); // Create the offset map offsetMap = new HashMap<>(); 
offsetMap.put(OFFSET_KEY, currentRecord.offset()); + offsetMap.put(OBJECT_KEY, currentObjectKey); aivenS3SourceRecord = new AivenS3SourceRecord(partitionMap, offsetMap, currentRecord.topic(), currentRecord.partition(), currentRecord.key(), currentRecord.value(), currentObjectKey); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java index e8d56d8c2..e74605e8a 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java @@ -47,6 +47,9 @@ class FileReaderTest { @Mock private AmazonS3 s3Client; + @Mock + private OffsetManager offsetManager; + private FileReader fileReader; private Map<String, String> properties; @@ -56,7 +59,8 @@ public void setUp() { properties = new HashMap<>(); setBasicProperties(); final S3SourceConfig s3SourceConfig = new S3SourceConfig(properties); - fileReader = new FileReader(s3SourceConfig, TEST_BUCKET, Collections.emptySet()); + offsetManager = mock(OffsetManager.class); + fileReader = new FileReader(s3SourceConfig, TEST_BUCKET, Collections.emptySet(), offsetManager); s3Client = mock(AmazonS3.class); } @@ -64,6 +68,8 @@ public void setUp() { void testFetchObjectSummariesWithNoObjects() throws IOException { final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result(Collections.emptyList(), null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); + when(offsetManager.getOffsets()).thenReturn(new HashMap<>()); + final List<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); assertThat(summaries.size()).isEqualTo(0); } @@ -74,6 +80,7 @@ void testFetchObjectSummariesWithOneNonZeroByteObject() throws IOException { final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result( Collections.singletonList(objectSummary), null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); + when(offsetManager.getOffsets()).thenReturn(new HashMap<>()); final List<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); @@ -88,6 +95,7 @@ void testFetchObjectSummariesWithZeroByteObject() throws IOException { final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result( List.of(zeroByteObject, nonZeroByteObject), null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); + when(offsetManager.getOffsets()).thenReturn(new HashMap<>()); final List<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); @@ -106,6 +114,7 @@ void testFetchObjectSummariesWithPagination() throws IOException { final ListObjectsV2Result secondResult = createListObjectsV2Result(secondBatch, null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstResult).thenReturn(secondResult); + when(offsetManager.getOffsets()).thenReturn(new HashMap<>()); final List<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java index 9e598f3af..443e05932 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java +++ 
b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -58,6 +58,8 @@ class RecordProcessorTest { private OutputWriter outputWriter; @Mock private Converter keyConverter; + @Mock + private OffsetManager offsetManager; private AtomicBoolean connectorStopped; private Iterator<List<AivenS3SourceRecord>> sourceRecordIterator; @@ -70,7 +72,7 @@ void setUp() { } @Test - void testProcessRecordsNoRecords() throws ConnectException { + void testProcessRecordsNoRecords() { when(s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS)).thenReturn(5); when(sourceRecordIterator.hasNext()).thenReturn(false); @@ -82,7 +84,7 @@ void testProcessRecordsNoRecords() throws ConnectException { Optional.of(keyConverter), valueConverter, connectorStopped, - outputWriter, Collections.emptySet() + outputWriter, Collections.emptySet(), offsetManager ); assertTrue(processedRecords.isEmpty(), "Processed records should be empty when there are no records."); @@ -105,7 +107,7 @@ void testProcessRecordsWithRecords() throws ConnectException { Optional.of(keyConverter), valueConverter, connectorStopped, - outputWriter, Collections.emptySet() + outputWriter, Collections.emptySet(), offsetManager ); assertThat(results.size()).isEqualTo(1); @@ -125,7 +127,7 @@ void testProcessRecordsConnectorStopped() throws ConnectException { Optional.of(keyConverter), valueConverter, connectorStopped, - outputWriter, Collections.emptySet() + outputWriter, Collections.emptySet(), offsetManager ); assertTrue(processedRecords.isEmpty(), "Processed records should be empty when connector is stopped."); @@ -145,7 +147,8 @@ void testCreateSourceRecords() throws ConnectException { final List<AivenS3SourceRecord> recordList = Collections.singletonList(mockRecord); final List<SourceRecord> sourceRecords = RecordProcessor.createSourceRecords(recordList, s3SourceConfig, - Optional.of(keyConverter), valueConverter, new HashMap<>(), outputWriter, Collections.emptySet()); + Optional.of(keyConverter), valueConverter, new HashMap<>(), outputWriter, Collections.emptySet(), + offsetManager); assertThat(sourceRecords.size()).isEqualTo(1); } From 6953c75abbc1ae382349078e2ee81fc14419d4e8 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Sat, 12 Oct 2024 22:46:43 +0200 Subject: [PATCH 40/90] refactor offset calls --- .../connect/s3/source/IntegrationTest.java | 76 ++++--------------- .../s3/source/utils/OffsetManager.java | 24 ++++-- .../s3/source/utils/RecordProcessor.java | 2 - .../s3/source/utils/SourceRecordIterator.java | 31 +++----- .../s3/source/utils/OffsetManagerTest.java | 12 --- 5 files changed, 41 insertions(+), 104 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 2ae4809d7..130d8dfe6 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -220,52 +220,10 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc // Verify that the correct data is read from the S3 bucket and pushed to Kafka assertThat(records).extracting(record -> record.get("message").toString()) - .contains("Hello, Kafka Connect S3 Source! object 1") - .contains("Hello, Kafka Connect S3 Source! 
object 2"); - assertThat(records).extracting(record -> record.get("id").toString()).contains("1").contains("2"); - } - - @Test - void avroTestRandomFiles(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { - final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.AVRO.getValue()); - connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); - connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); - connectorConfig.put("value.converter.schema.registry.url", SCHEMA_REGISTRY.getSchemaRegistryUrl()); - connectorConfig.put(VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); - - connectRunner.createConnector(connectorConfig); - - // Define Avro schema - final String schemaJson = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" - + " \"fields\": [\n" + " {\"name\": \"message\", \"type\": \"string\"},\n" - + " {\"name\": \"id\", \"type\": \"int\"}\n" + " ]\n" + "}"; - final Schema.Parser parser = new Schema.Parser(); - final Schema schema = parser.parse(schemaJson); - - final ByteArrayOutputStream outputStream1 = getAvroRecord(schema, 1); - final ByteArrayOutputStream outputStream2 = getAvroRecord(schema, 2); - - writeToS3GeneratedKey(topicName, outputStream1.toByteArray()); - writeToS3(topicName, outputStream2.toByteArray(), "00000"); - writeToS3GeneratedKey(topicName, outputStream2.toByteArray()); - writeToS3GeneratedKey(topicName, outputStream2.toByteArray()); - - final List<String> objects = testBucketAccessor.listObjects(); - assertThat(objects.size()).isEqualTo(4); - - // Verify that the connector is correctly set up - assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); - - // Poll Avro messages from the Kafka topic and deserialize them - final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 4, KAFKA_CONTAINER, - SCHEMA_REGISTRY.getSchemaRegistryUrl()); // Ensure this method deserializes Avro - - // Verify that the correct data is read from the S3 bucket and pushed to Kafka - assertThat(records).extracting(record -> record.get("message").toString()) - .contains("Hello, Kafka Connect S3 Source! object 1") - .contains("Hello, Kafka Connect S3 Source! object 2"); + .contains("Hello, Kafka Connect S3 Source! object 11") + .contains("Hello, Kafka Connect S3 Source! object 21") + .contains("Hello, Kafka Connect S3 Source! object 12") + .contains("Hello, Kafka Connect S3 Source! object 22"); assertThat(records).extracting(record -> record.get("id").toString()).contains("1").contains("2"); } @@ -321,16 +279,21 @@ void jsonTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc private static ByteArrayOutputStream getAvroRecord(final Schema schema, final int messageId) throws IOException { // Create Avro records - final GenericRecord avroRecord = new GenericData.Record(schema); - avroRecord.put("message", "Hello, Kafka Connect S3 Source! object " + messageId); - avroRecord.put("id", messageId); + final GenericRecord avroRecord1 = new GenericData.Record(schema); + avroRecord1.put("message", "Hello, Kafka Connect S3 Source! object 1" + messageId); + avroRecord1.put("id", messageId); + + final GenericRecord avroRecord2 = new GenericData.Record(schema); + avroRecord2.put("message", "Hello, Kafka Connect S3 Source! 
object 2" + messageId); + avroRecord2.put("id", messageId); // Serialize Avro records to byte arrays final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema); try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) { dataFileWriter.create(schema, outputStream); - dataFileWriter.append(avroRecord); + dataFileWriter.append(avroRecord1); + dataFileWriter.append(avroRecord2); dataFileWriter.flush(); } outputStream.close(); @@ -351,19 +314,6 @@ private static void writeToS3(final String topicName, final byte[] testDataBytes } } - private static void writeToS3GeneratedKey(final String topicName, final byte[] testDataBytes) throws IOException { - final String filePrefix = topicName + System.currentTimeMillis(); - final String fileSuffix = ".txt"; - - final Path testFilePath = File.createTempFile(filePrefix, fileSuffix).toPath(); - try { - Files.write(testFilePath, testDataBytes); - saveToS3(TEST_BUCKET_NAME, "", filePrefix + fileSuffix, testFilePath.toFile()); - } finally { - Files.delete(testFilePath); - } - } - private Map<String, String> getConfig(final Map<String, String> config, final String topics) { config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); config.put(AWS_ACCESS_KEY_ID_CONFIG, S3_ACCESS_KEY_ID); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index 147903bf9..abfac71b2 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -16,10 +16,10 @@ package io.aiven.kafka.connect.s3.source.utils; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; import static io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator.OFFSET_KEY; import static java.util.stream.Collectors.toMap; -import java.net.ConnectException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -76,6 +76,21 @@ public long incrementAndUpdateOffsetMap(final Map<String, Object> partitionMap) return 0L; } + public void createNewOffsetMap(final Map<String, Object> partitionMap, final String objectKey, + final long offsetId) { + final Map<String, Object> offsetMap = getOffsetValueMap(objectKey, offsetId); + offsets.put(partitionMap, offsetMap); + } + + public Map<String, Object> getOffsetValueMap(final String currentObjectKey, final long offsetId) { + // Create the offset map + final Map<String, Object> offsetMap = new HashMap<>(); + offsetMap.put(OFFSET_KEY, offsetId); + offsetMap.put(OBJECT_KEY, currentObjectKey); + + return offsetMap; + } + void updateCurrentOffsets(final Map<String, Object> partitionMap, final Map<String, Object> offsetValueMap) { offsets.put(partitionMap, offsetValueMap); } @@ -90,13 +105,6 @@ private static Set<String> parseTopics(final S3SourceConfig s3SourceConfig) { return Arrays.stream(topicString.split(",")).collect(Collectors.toSet()); } - String getFirstConfiguredTopic(final S3SourceConfig s3SourceConfig) throws ConnectException { - final String topicString = s3SourceConfig.getString(S3SourceConfig.TARGET_TOPICS); - return Arrays.stream(topicString.split(",")) - .findFirst() - .orElseThrow(() -> new ConnectException("Topic could not be derived")); - } - private static List<Map<String, 
Object>> buildPartitionKeys(final String bucket, final Set<Integer> partitions, final Set<String> topics) { final List<Map<String, Object>> partitionKeys = new ArrayList<>(); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index b590b3c16..579ef6376 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -79,8 +79,6 @@ static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRecord> ai valueConverter.configure(conversionConfig, false); try { final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); - offsetManager.updateCurrentOffsets(aivenS3SourceRecord.getPartitionMap(), - aivenS3SourceRecord.getOffsetMap()); sourceRecordList.add(aivenS3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue)); } catch (DataException e) { LOGGER.error("Error in reading s3 object stream " + e.getMessage()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 535b4f8a1..3b770ffd6 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -16,14 +16,12 @@ package io.aiven.kafka.connect.s3.source.utils; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; - import java.io.IOException; import java.io.InputStream; +import java.net.ConnectException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; -import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -104,18 +102,20 @@ private void nextS3Object() { private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentFile() throws IOException { try (S3Object s3Object = s3Client.getObject(bucketName, currentObjectKey); InputStream inputStream = s3Object.getObjectContent()) { - String topicName; - int defaultPartitionId = 0; - final long defaultStartOffsetId = 0L; final Matcher fileMatcher = FILE_DEFAULT_PATTERN.matcher(currentObjectKey); + String topicName; + int defaultPartitionId; + if (fileMatcher.find()) { topicName = fileMatcher.group(PATTERN_TOPIC_KEY); defaultPartitionId = Integer.parseInt(fileMatcher.group(PATTERN_PARTITION_KEY)); } else { - topicName = offsetManager.getFirstConfiguredTopic(s3SourceConfig); + throw new ConnectException("File naming doesn't match to any topic. 
" + currentObjectKey); } + final long defaultStartOffsetId = 0L; + final String finalTopic = topicName; final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(topicName, defaultPartitionId, bucketName); @@ -130,9 +130,7 @@ private Iterator<List<ConsumerRecord<byte[], byte[]>>> getObjectIterator(final I final String topic, final int topicPartition, final long startOffset, final OutputWriter outputWriter, final Map<String, Object> partitionMap) { return new Iterator<>() { - private Map<Map<String, Object>, Long> currentOffsets = new HashMap<>(); private List<ConsumerRecord<byte[], byte[]>> nextRecord = readNext(); - private List<ConsumerRecord<byte[], byte[]>> readNext() { final Optional<byte[]> optionalKeyBytes = Optional.ofNullable(currentObjectKey) @@ -142,15 +140,14 @@ private List<ConsumerRecord<byte[], byte[]>> readNext() { for (final Object record : outputWriter.getRecords(valueInputStream, topic, topicPartition)) { final byte[] valueBytes = outputWriter.getValueBytes(record, topic, s3SourceConfig); consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, topic, topicPartition, - offsetManager, currentOffsets, startOffset, partitionMap)); + offsetManager, startOffset, partitionMap)); } return consumerRecordList; } private ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> key, final byte[] value, final String topic, final int topicPartition, final OffsetManager offsetManager, - final Map<Map<String, Object>, Long> currentOffsets, final long startOffset, - final Map<String, Object> partitionMap) { + final long startOffset, final Map<String, Object> partitionMap) { long currentOffset; @@ -159,8 +156,8 @@ private ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> currentOffset = offsetManager.incrementAndUpdateOffsetMap(partitionMap); } else { LOGGER.info("Into else block ..."); - currentOffset = currentOffsets.getOrDefault(partitionMap, startOffset); - currentOffsets.put(partitionMap, currentOffset + 1); + currentOffset = startOffset + 1L; + offsetManager.createNewOffsetMap(partitionMap, currentObjectKey, currentOffset); } return new ConsumerRecord<>(topic, topicPartition, currentOffset, key.orElse(null), value); @@ -209,13 +206,9 @@ public List<AivenS3SourceRecord> next() { Map<String, Object> offsetMap; Map<String, Object> partitionMap; for (final ConsumerRecord<byte[], byte[]> currentRecord : consumerRecordList) { - partitionMap = ConnectUtils.getPartitionMap(currentRecord.topic(), currentRecord.partition(), bucketName); - // Create the offset map - offsetMap = new HashMap<>(); - offsetMap.put(OFFSET_KEY, currentRecord.offset()); - offsetMap.put(OBJECT_KEY, currentObjectKey); + offsetMap = offsetManager.getOffsetValueMap(currentObjectKey, currentRecord.offset()); aivenS3SourceRecord = new AivenS3SourceRecord(partitionMap, offsetMap, currentRecord.topic(), currentRecord.partition(), currentRecord.key(), currentRecord.value(), currentObjectKey); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java index 0ff30eb71..5edd45a49 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java @@ -123,18 +123,6 @@ void testIncrementAndUpdateOffsetMapNonExistingOffset() { assertThat(newOffset).isEqualTo(0L); } - @Test - void 
testGetFirstConfiguredTopic() throws Exception { - sourceTaskContext = mock(SourceTaskContext.class); - final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); - when(sourceTaskContext.offsetStorageReader()).thenReturn(offsetStorageReader); - - offsetManager = new OffsetManager(sourceTaskContext, s3SourceConfig); - - final String firstTopic = offsetManager.getFirstConfiguredTopic(s3SourceConfig); - assertThat(firstTopic).isEqualTo("topic1"); - } - private void setBasicProperties() { properties.put(S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET); properties.put(TARGET_TOPIC_PARTITIONS, "0,1"); From 1f6056f54d6931b4f51fa9b7d1e6dca0c3ef78e5 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Sun, 13 Oct 2024 21:52:52 +0200 Subject: [PATCH 41/90] Fix offsets for unique object keys --- .../connect/s3/source/IntegrationTest.java | 63 ++++++++++++------- .../connect/s3/source/utils/FileReader.java | 33 ++-------- .../s3/source/utils/OffsetManager.java | 28 +++++---- .../s3/source/utils/RecordProcessor.java | 5 +- .../s3/source/utils/SourceRecordIterator.java | 34 +++++++--- .../s3/source/utils/FileReaderTest.java | 2 +- .../s3/source/utils/OffsetManagerTest.java | 32 +++++++--- 7 files changed, 118 insertions(+), 79 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 130d8dfe6..b48ff7629 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -176,7 +176,7 @@ void multiPartUploadBytesTest(final TestInfo testInfo) throws ExecutionException connectRunner.createConnector(connectorConfig); final String partition = "00001"; - final String key = topicName + "-" + partition + ".txt"; + final String key = topicName + "-" + partition + "-" + System.currentTimeMillis() + ".txt"; multipartUpload(TEST_BUCKET_NAME, key); // Poll messages from the Kafka topic and verify the consumed data final List<String> records = IntegrationBase.consumeMessages(topicName, 1, KAFKA_CONTAINER); @@ -202,28 +202,30 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc final Schema.Parser parser = new Schema.Parser(); final Schema schema = parser.parse(schemaJson); - final ByteArrayOutputStream outputStream1 = getAvroRecord(schema, 1); - final ByteArrayOutputStream outputStream2 = getAvroRecord(schema, 2); + final ByteArrayOutputStream outputStream1 = getAvroRecord(schema, 1, 10); + final ByteArrayOutputStream outputStream2 = getAvroRecord(schema, 2, 10); writeToS3(topicName, outputStream1.toByteArray(), "00001"); + writeToS3(topicName, outputStream2.toByteArray(), "00001"); + + writeToS3(topicName, outputStream1.toByteArray(), "00002"); + writeToS3(topicName, outputStream2.toByteArray(), "00002"); writeToS3(topicName, outputStream2.toByteArray(), "00002"); final List<String> objects = testBucketAccessor.listObjects(); - assertThat(objects.size()).isEqualTo(2); + assertThat(objects.size()).isEqualTo(5); // Verify that the connector is correctly set up assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); // Poll Avro messages from the Kafka topic and deserialize them - final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 4, KAFKA_CONTAINER, + final 
List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 10, KAFKA_CONTAINER, SCHEMA_REGISTRY.getSchemaRegistryUrl()); // Ensure this method deserializes Avro // Verify that the correct data is read from the S3 bucket and pushed to Kafka assertThat(records).extracting(record -> record.get("message").toString()) - .contains("Hello, Kafka Connect S3 Source! object 11") - .contains("Hello, Kafka Connect S3 Source! object 21") - .contains("Hello, Kafka Connect S3 Source! object 12") - .contains("Hello, Kafka Connect S3 Source! object 22"); + .contains("Hello, Kafka Connect S3 Source! object 1") + .contains("Hello, Kafka Connect S3 Source! object 2"); assertThat(records).extracting(record -> record.get("id").toString()).contains("1").contains("2"); } @@ -235,9 +237,10 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); connectorConfig.put("value.converter.schema.registry.url", SCHEMA_REGISTRY.getSchemaRegistryUrl()); + connectorConfig.put(VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); final String partition = "00000"; - final String fileName = topicName + "-" + partition + ".txt"; + final String fileName = topicName + "-" + partition + "-" + System.currentTimeMillis() + ".txt"; final String name1 = "testuser1"; final String name2 = "testuser2"; @@ -277,23 +280,25 @@ void jsonTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc assertThat(records).extracting(record -> record.get("payload").get("id").asText()).contains("1"); } - private static ByteArrayOutputStream getAvroRecord(final Schema schema, final int messageId) throws IOException { + @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") + private static ByteArrayOutputStream getAvroRecord(final Schema schema, final int messageId, final int noOfAvroRecs) + throws IOException { // Create Avro records - final GenericRecord avroRecord1 = new GenericData.Record(schema); - avroRecord1.put("message", "Hello, Kafka Connect S3 Source! object 1" + messageId); - avroRecord1.put("id", messageId); - - final GenericRecord avroRecord2 = new GenericData.Record(schema); - avroRecord2.put("message", "Hello, Kafka Connect S3 Source! object 2" + messageId); - avroRecord2.put("id", messageId); + GenericRecord avroRecord; // Serialize Avro records to byte arrays final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema); try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) { dataFileWriter.create(schema, outputStream); - dataFileWriter.append(avroRecord1); - dataFileWriter.append(avroRecord2); + for (int i = 0; i < noOfAvroRecs; i++) { + avroRecord = new GenericData.Record(schema); + avroRecord.put("message", "Hello, Kafka Connect S3 Source! 
object " + i); + avroRecord.put("id", messageId); + + dataFileWriter.append(avroRecord); + } + dataFileWriter.flush(); } outputStream.close(); @@ -302,7 +307,8 @@ private static ByteArrayOutputStream getAvroRecord(final Schema schema, final in private static void writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) throws IOException { - final String filePrefix = topicName + "-" + partitionId; + final String filePrefix = topicName + "-" + partitionId + "-" + System.currentTimeMillis(); + // final String filePrefix = topicName + "-" + partitionId + "-" + "1234567891010"; final String fileSuffix = ".txt"; final Path testFilePath = File.createTempFile(filePrefix, fileSuffix).toPath(); @@ -354,4 +360,19 @@ public void multipartUpload(final String bucketName, final String key) { LOGGER.error(e.getMessage()); } } + + // private Map<Map<String, Object>, Map<String, Object>> getTmpData() { + // final Map<Map<String, Object>, Map<String, Object>> tmpOffsets = new HashMap<>(); + // final Map<String, Object> partitionKeyMap = new HashMap<>(); + // partitionKeyMap.put("topic", "avroTest"); + // partitionKeyMap.put("bucket", "test-bucket0"); + // partitionKeyMap.put("topicPartition", 1); + // + // final Map<String, Object> offsetValMap = new HashMap<>(); + // offsetValMap.put(OBJECT_KEY + ":" + "avroTest-00001-1234567891010.txt", 4L); + // + // tmpOffsets.put(partitionKeyMap, offsetValMap); + // + // return tmpOffsets; + // } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index 903de65d2..e6c186aee 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -16,15 +16,12 @@ package io.aiven.kafka.connect.s3.source.utils; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; -import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -44,16 +41,13 @@ public class FileReader { private final S3SourceConfig s3SourceConfig; private final String bucketName; - private final OffsetManager offsetManager; - private final Set<String> failedObjectKeys; - public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName, final Set<String> failedObjectKeys, - final OffsetManager offsetManager) { + public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName, + final Set<String> failedObjectKeys) { this.s3SourceConfig = s3SourceConfig; this.bucketName = bucketName; this.failedObjectKeys = new HashSet<>(failedObjectKeys); - this.offsetManager = offsetManager; } @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") @@ -78,28 +72,9 @@ List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOExc .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey())) .collect(Collectors.toList()); - final Map<Map<String, Object>, Map<String, Object>> processedOffsets = offsetManager.getOffsets(); - LOGGER.info(processedOffsets + " processedOffsets"); - - final List<S3ObjectSummary> filteredSummariesNewList = filteredSummaries.stream() - 
.filter(s3ObjectSummary -> { - for (final Map.Entry<Map<String, Object>, Map<String, Object>> mapMapEntry : processedOffsets - .entrySet()) { - if (mapMapEntry.getKey().get(BUCKET).equals(bucketName) - // && mapMapEntry.getKey().get(OBJECT_KEY).equals(s3ObjectSummary.getKey()) - && s3ObjectSummary.getKey().equals(mapMapEntry.getValue().get(OBJECT_KEY))) { - return false; - } - } - return true; - }) - .collect(Collectors.toList()); - - LOGGER.info(" **** filteredSummariesNewList **** " + filteredSummariesNewList); - - allSummaries.addAll(filteredSummariesNewList); // Add the filtered summaries to the main list + allSummaries.addAll(filteredSummaries); // Add the filtered summaries to the main list - allSummaries.forEach(objSummary -> LOGGER.info(" ******* FR key ******** " + objSummary.getKey())); + allSummaries.forEach(objSummary -> LOGGER.info("Objects to be processed {} ", objSummary.getKey())); // Check if there are more objects to fetch continuationToken = objectListing.getNextContinuationToken(); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index abfac71b2..bd62f381c 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -17,7 +17,6 @@ package io.aiven.kafka.connect.s3.source.utils; import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; -import static io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator.OFFSET_KEY; import static java.util.stream.Collectors.toMap; import java.util.ArrayList; @@ -51,29 +50,38 @@ public OffsetManager(final SourceTaskContext context, final S3SourceConfig s3Sou final Map<Map<String, Object>, Map<String, Object>> offsetMap = context.offsetStorageReader() .offsets(partitionKeys); - LOGGER.info(" ********** offsetMap ***** " + offsetMap); + LOGGER.info(" ********** offsetMap ***** {}", offsetMap); this.offsets = offsetMap.entrySet() .stream() .filter(e -> e.getValue() != null) .collect(toMap(entry -> new HashMap<>(entry.getKey()), entry -> new HashMap<>(entry.getValue()))); - LOGGER.info(" ********** offsets ***** " + offsets); + LOGGER.info(" ********** offsets ***** {}", offsets); } public Map<Map<String, Object>, Map<String, Object>> getOffsets() { return Collections.unmodifiableMap(offsets); } - public long incrementAndUpdateOffsetMap(final Map<String, Object> partitionMap) { + public long incrementAndUpdateOffsetMap(final Map<String, Object> partitionMap, final String currentObjectKey, + final long startOffset) { if (offsets.containsKey(partitionMap)) { final Map<String, Object> offsetValue = new HashMap<>(offsets.get(partitionMap)); - if (offsetValue.containsKey(OFFSET_KEY)) { - final long newOffsetVal = (long) offsetValue.get(OFFSET_KEY) + 1L; - offsetValue.put(OFFSET_KEY, newOffsetVal); + if (offsetValue.containsKey(getObjectMapKey(currentObjectKey))) { + final long newOffsetVal = (long) offsetValue.get(getObjectMapKey(currentObjectKey)) + 1L; + offsetValue.put(getObjectMapKey(currentObjectKey), newOffsetVal); offsets.put(partitionMap, offsetValue); return newOffsetVal; + } else { + offsetValue.put(getObjectMapKey(currentObjectKey), startOffset); + offsets.put(partitionMap, offsetValue); + return startOffset; } } - return 0L; + return startOffset; + } + + public String getObjectMapKey(final String currentObjectKey) { + return 
OBJECT_KEY + ":" + currentObjectKey; } public void createNewOffsetMap(final Map<String, Object> partitionMap, final String objectKey, @@ -83,10 +91,8 @@ public void createNewOffsetMap(final Map<String, Object> partitionMap, final Str } public Map<String, Object> getOffsetValueMap(final String currentObjectKey, final long offsetId) { - // Create the offset map final Map<String, Object> offsetMap = new HashMap<>(); - offsetMap.put(OFFSET_KEY, offsetId); - offsetMap.put(OBJECT_KEY, currentObjectKey); + offsetMap.put(getObjectMapKey(currentObjectKey), offsetId); return offsetMap; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 579ef6376..f9a162925 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -59,6 +59,7 @@ public static List<SourceRecord> processRecords(final Iterator<List<AivenS3Sourc results.addAll(sourceRecords); } + LOGGER.info("Number of records sent {}", results.size()); return results; } @@ -70,7 +71,7 @@ static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRecord> ai final List<SourceRecord> sourceRecordList = new ArrayList<>(); for (final AivenS3SourceRecord aivenS3SourceRecord : aivenS3SourceRecordList) { - LOGGER.info(" ******* CSR key ******** " + aivenS3SourceRecord.getObjectKey()); + LOGGER.info(" ******* CSR key ******** {}", aivenS3SourceRecord.getObjectKey()); final String topic = aivenS3SourceRecord.getToTopic(); final Optional<SchemaAndValue> keyData = keyConverter .map(c -> c.toConnectData(topic, aivenS3SourceRecord.key())); @@ -79,6 +80,8 @@ static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRecord> ai valueConverter.configure(conversionConfig, false); try { final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); + offsetManager.updateCurrentOffsets(aivenS3SourceRecord.getPartitionMap(), + aivenS3SourceRecord.getOffsetMap()); sourceRecordList.add(aivenS3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue)); } catch (DataException e) { LOGGER.error("Error in reading s3 object stream " + e.getMessage()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 3b770ffd6..9473cfc26 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -52,9 +52,8 @@ public final class SourceRecordIterator implements Iterator<List<AivenS3SourceRe public static final String PATTERN_PARTITION_KEY = "partitionId"; public static final String OFFSET_KEY = "offset"; - public static final Pattern FILE_DEFAULT_PATTERN = Pattern - .compile("(?<topicName>[^/]+?)-" + "(?<partitionId>\\d{5})" + "\\.(?<fileExtension>[^.]+)$"); // ex : - // topic-00001.txt + public static final Pattern FILE_DEFAULT_PATTERN = Pattern.compile( + "(?<topicName>[^/]+?)-" + "(?<partitionId>\\d{5})-" + "(?<timestamp>\\d+)" + "\\.(?<fileExtension>[^.]+)$"); // topic-00001.txt private String currentObjectKey; private Iterator<S3ObjectSummary> nextFileIterator; @@ -75,7 +74,7 @@ public 
SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 this.s3Client = s3Client; this.bucketName = bucketName; this.outputWriter = outputWriter; - final FileReader fileReader = new FileReader(s3SourceConfig, bucketName, failedObjectKeys, offsetManager); + final FileReader fileReader = new FileReader(s3SourceConfig, bucketName, failedObjectKeys); try { final List<S3ObjectSummary> chunks = fileReader.fetchObjectSummaries(s3Client); nextFileIterator = chunks.iterator(); @@ -114,7 +113,7 @@ private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentF throw new ConnectException("File naming doesn't match to any topic. " + currentObjectKey); } - final long defaultStartOffsetId = 0L; + final long defaultStartOffsetId = 1L; final String finalTopic = topicName; final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(topicName, defaultPartitionId, @@ -137,10 +136,28 @@ private List<ConsumerRecord<byte[], byte[]>> readNext() { .map(k -> k.getBytes(StandardCharsets.UTF_8)); final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = new ArrayList<>(); + int numOfProcessedRecs = 1; + boolean checkOffsetMap = true; for (final Object record : outputWriter.getRecords(valueInputStream, topic, topicPartition)) { + + if (offsetManager.getOffsets().containsKey(partitionMap) && checkOffsetMap) { + final Map<String, Object> offsetVal = offsetManager.getOffsets().get(partitionMap); + if (offsetVal.containsKey(offsetManager.getObjectMapKey(currentObjectKey))) { + final long offsetValue = (long) offsetVal + .get(offsetManager.getObjectMapKey(currentObjectKey)); + if (numOfProcessedRecs <= offsetValue) { + numOfProcessedRecs++; + continue; + } + } + } + + checkOffsetMap = false; + final byte[] valueBytes = outputWriter.getValueBytes(record, topic, s3SourceConfig); consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, topic, topicPartition, offsetManager, startOffset, partitionMap)); + numOfProcessedRecs++; } return consumerRecordList; } @@ -152,11 +169,12 @@ private ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> long currentOffset; if (offsetManager.getOffsets().containsKey(partitionMap)) { - LOGGER.info("***** offsetManager.getOffsets() ***** " + offsetManager.getOffsets()); - currentOffset = offsetManager.incrementAndUpdateOffsetMap(partitionMap); + LOGGER.info("***** offsetManager.getOffsets() ***** {}", offsetManager.getOffsets()); + currentOffset = offsetManager.incrementAndUpdateOffsetMap(partitionMap, currentObjectKey, + startOffset); } else { LOGGER.info("Into else block ..."); - currentOffset = startOffset + 1L; + currentOffset = startOffset; offsetManager.createNewOffsetMap(partitionMap, currentObjectKey, currentOffset); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java index e74605e8a..91a33b723 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java @@ -60,7 +60,7 @@ public void setUp() { setBasicProperties(); final S3SourceConfig s3SourceConfig = new S3SourceConfig(properties); offsetManager = mock(OffsetManager.class); - fileReader = new FileReader(s3SourceConfig, TEST_BUCKET, Collections.emptySet(), offsetManager); + fileReader = new FileReader(s3SourceConfig, TEST_BUCKET, Collections.emptySet()); s3Client = 
mock(AmazonS3.class); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java index 5edd45a49..acd876f8f 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java @@ -16,6 +16,7 @@ package io.aiven.kafka.connect.s3.source.utils; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; import static io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator.OFFSET_KEY; @@ -87,22 +88,31 @@ void testIncrementAndUpdateOffsetMapExistingOffset() { final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); when(sourceTaskContext.offsetStorageReader()).thenReturn(offsetStorageReader); + // Mock partition and offset values + final String objectKey = "testObject"; + final String offsetObjectKey = OBJECT_KEY + ":" + objectKey; + final Map<String, Object> partitionKey = new HashMap<>(); partitionKey.put("topic", "topic1"); partitionKey.put("partition", 0); + partitionKey.put("bucket", "bucket"); final Map<String, Object> offsetValue = new HashMap<>(); - offsetValue.put(OFFSET_KEY, 1L); + offsetValue.put(offsetObjectKey, 1L); // Existing offset value final Map<Map<String, Object>, Map<String, Object>> offsets = new HashMap<>(); offsets.put(partitionKey, offsetValue); - when(offsetStorageReader.offsets(any())).thenReturn(offsets); + when(offsetStorageReader.offsets(any())).thenReturn(offsets); // Mock offset retrieval + // Initialize offset manager offsetManager = new OffsetManager(sourceTaskContext, s3SourceConfig); - final long newOffset = offsetManager.incrementAndUpdateOffsetMap(partitionKey); - assertThat(newOffset).isEqualTo(2L); - assertThat(offsetManager.getOffsets().get(partitionKey).get(OFFSET_KEY)).isEqualTo(2L); + // Invoke method and assert new offset value + final long newOffset = offsetManager.incrementAndUpdateOffsetMap(partitionKey, objectKey, 2L); + + assertThat(newOffset).isEqualTo(2L); // Expect incremented offset + assertThat(offsetManager.getOffsets().get(partitionKey).get(offsetObjectKey)).isEqualTo(2L); // Verify updated + // offset in map } @Test @@ -111,16 +121,22 @@ void testIncrementAndUpdateOffsetMapNonExistingOffset() { final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); when(sourceTaskContext.offsetStorageReader()).thenReturn(offsetStorageReader); + // Mock partition without any existing offset final Map<String, Object> partitionKey = new HashMap<>(); partitionKey.put("topic", "topic1"); partitionKey.put("partition", 0); - when(offsetStorageReader.offsets(any())).thenReturn(Collections.emptyMap()); + when(offsetStorageReader.offsets(any())).thenReturn(Collections.emptyMap()); // No existing offset + // Initialize offset manager offsetManager = new OffsetManager(sourceTaskContext, s3SourceConfig); - final long newOffset = offsetManager.incrementAndUpdateOffsetMap(partitionKey); - assertThat(newOffset).isEqualTo(0L); + // Invoke method and assert new offset value + final long startOffset = 5L; + final long newOffset = offsetManager.incrementAndUpdateOffsetMap(partitionKey, "", startOffset); + + // Expect the startOffset to be 
returned when no existing offset is found + assertThat(newOffset).isEqualTo(startOffset); } private void setBasicProperties() { From 4b23dcc6b1c13f472d0a7962c4759a29b0ffb92f Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 14 Oct 2024 10:12:02 +0200 Subject: [PATCH 42/90] fix object key in map --- .../aiven/kafka/connect/s3/source/utils/ConnectUtils.java | 1 - .../kafka/connect/s3/source/utils/OffsetManager.java | 3 ++- .../connect/s3/source/utils/SourceRecordIterator.java | 8 +++----- .../kafka/connect/s3/source/utils/OffsetManagerTest.java | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java index 9420834da..f401c4e1f 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java @@ -33,7 +33,6 @@ static Map<String, Object> getPartitionMap(final String topicName, final Integer final Map<String, Object> partitionMap = new HashMap<>(); partitionMap.put(BUCKET, bucketName); partitionMap.put(TOPIC, topicName); - partitionMap.put(PARTITION, defaultPartitionId); return partitionMap; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index bd62f381c..714334782 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -38,6 +38,7 @@ public class OffsetManager { private static final Logger LOGGER = LoggerFactory.getLogger(OffsetManager.class); + public static final String SEPARATOR = "_"; private final Map<Map<String, Object>, Map<String, Object>> offsets; public OffsetManager(final SourceTaskContext context, final S3SourceConfig s3SourceConfig) { @@ -81,7 +82,7 @@ public long incrementAndUpdateOffsetMap(final Map<String, Object> partitionMap, } public String getObjectMapKey(final String currentObjectKey) { - return OBJECT_KEY + ":" + currentObjectKey; + return OBJECT_KEY + SEPARATOR + currentObjectKey; } public void createNewOffsetMap(final Map<String, Object> partitionMap, final String objectKey, diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 9473cfc26..b6192289f 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -139,7 +139,7 @@ private List<ConsumerRecord<byte[], byte[]>> readNext() { int numOfProcessedRecs = 1; boolean checkOffsetMap = true; for (final Object record : outputWriter.getRecords(valueInputStream, topic, topicPartition)) { - + final byte[] valueBytes = outputWriter.getValueBytes(record, topic, s3SourceConfig); if (offsetManager.getOffsets().containsKey(partitionMap) && checkOffsetMap) { final Map<String, Object> offsetVal = offsetManager.getOffsets().get(partitionMap); if (offsetVal.containsKey(offsetManager.getObjectMapKey(currentObjectKey))) { @@ -153,8 +153,6 @@ private 
List<ConsumerRecord<byte[], byte[]>> readNext() { } checkOffsetMap = false; - - final byte[] valueBytes = outputWriter.getValueBytes(record, topic, s3SourceConfig); consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, topic, topicPartition, offsetManager, startOffset, partitionMap)); numOfProcessedRecs++; @@ -189,7 +187,7 @@ public boolean hasNext() { @Override public List<ConsumerRecord<byte[], byte[]>> next() { if (nextRecord.isEmpty()) { - LOGGER.error("May be error in reading s3 object " + currentObjectKey); + // LOGGER.error("May be error in reading s3 object " + currentObjectKey); return Collections.emptyList(); // throw new NoSuchElementException(); } @@ -214,7 +212,7 @@ public List<AivenS3SourceRecord> next() { final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = recordIterator.next(); if (consumerRecordList.isEmpty()) { - LOGGER.error("May be error in reading s3 object " + currentObjectKey); + // LOGGER.error("May be error in reading s3 object " + currentObjectKey); return Collections.emptyList(); // throw new NoSuchElementException(); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java index acd876f8f..99ded7905 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java @@ -90,7 +90,7 @@ void testIncrementAndUpdateOffsetMapExistingOffset() { // Mock partition and offset values final String objectKey = "testObject"; - final String offsetObjectKey = OBJECT_KEY + ":" + objectKey; + final String offsetObjectKey = OBJECT_KEY + "_" + objectKey; final Map<String, Object> partitionKey = new HashMap<>(); partitionKey.put("topic", "topic1"); From 8e618eed3a639a2df7c3a04e9ce920371a9dd0f2 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 14 Oct 2024 12:04:00 +0200 Subject: [PATCH 43/90] fix for invalid object names --- .../s3/source/utils/SourceRecordIterator.java | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index b6192289f..74c9c9ab9 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.io.InputStream; -import java.net.ConnectException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; @@ -38,6 +37,7 @@ import com.amazonaws.AmazonClientException; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.S3Object; +import com.amazonaws.services.s3.model.S3ObjectInputStream; import com.amazonaws.services.s3.model.S3ObjectSummary; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -100,7 +100,7 @@ private void nextS3Object() { private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentFile() throws IOException { try (S3Object s3Object = s3Client.getObject(bucketName, currentObjectKey); - InputStream inputStream = s3Object.getObjectContent()) { + S3ObjectInputStream 
inputStream = s3Object.getObjectContent()) { final Matcher fileMatcher = FILE_DEFAULT_PATTERN.matcher(currentObjectKey); String topicName; @@ -110,7 +110,10 @@ private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentF topicName = fileMatcher.group(PATTERN_TOPIC_KEY); defaultPartitionId = Integer.parseInt(fileMatcher.group(PATTERN_PARTITION_KEY)); } else { - throw new ConnectException("File naming doesn't match to any topic. " + currentObjectKey); + LOGGER.error("File naming doesn't match to any topic. " + currentObjectKey); + inputStream.abort(); + s3Object.close(); + return Collections.emptyIterator(); } final long defaultStartOffsetId = 1L; @@ -210,6 +213,11 @@ public List<AivenS3SourceRecord> next() { nextS3Object(); } + if (!recordIterator.hasNext()) { + // If there are still no records, return an empty list + return Collections.emptyList(); // or new ArrayList<>() for mutable list + } + final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = recordIterator.next(); if (consumerRecordList.isEmpty()) { // LOGGER.error("May be error in reading s3 object " + currentObjectKey); From 626e5ee5dde14fa821c2a79821716bdbe09f6a83 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 14 Oct 2024 14:28:20 +0200 Subject: [PATCH 44/90] update pattern for uniqueness --- .../connect/s3/source/IntegrationTest.java | 18 +++++----- .../s3/source/utils/SourceRecordIterator.java | 4 +-- .../s3/source/output/ParquetWriterTest.java | 2 +- .../s3/source/testutils/ContentUtils.java | 33 +++++++++---------- 4 files changed, 28 insertions(+), 29 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index b48ff7629..884365f60 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -202,8 +202,8 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc final Schema.Parser parser = new Schema.Parser(); final Schema schema = parser.parse(schemaJson); - final ByteArrayOutputStream outputStream1 = getAvroRecord(schema, 1, 10); - final ByteArrayOutputStream outputStream2 = getAvroRecord(schema, 2, 10); + final ByteArrayOutputStream outputStream1 = getAvroRecord(schema, 1, 100); + final ByteArrayOutputStream outputStream2 = getAvroRecord(schema, 2, 100); writeToS3(topicName, outputStream1.toByteArray(), "00001"); writeToS3(topicName, outputStream2.toByteArray(), "00001"); @@ -219,7 +219,7 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); // Poll Avro messages from the Kafka topic and deserialize them - final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 10, KAFKA_CONTAINER, + final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 500, KAFKA_CONTAINER, SCHEMA_REGISTRY.getSchemaRegistryUrl()); // Ensure this method deserializes Avro // Verify that the correct data is read from the S3 bucket and pushed to Kafka @@ -241,11 +241,10 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted final String partition = "00000"; final String fileName = topicName + "-" + partition + "-" + System.currentTimeMillis() + ".txt"; - 
final String name1 = "testuser1"; - final String name2 = "testuser2"; + final String name = "testuser"; connectRunner.createConnector(connectorConfig); - final Path path = ContentUtils.getTmpFilePath(name1, name2); + final Path path = ContentUtils.getTmpFilePath(name); try { s3Client.putObject(TEST_BUCKET_NAME, fileName, Files.newInputStream(path), null); @@ -255,10 +254,11 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted Files.delete(path); } - final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 2, KAFKA_CONTAINER, + final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 100, KAFKA_CONTAINER, SCHEMA_REGISTRY.getSchemaRegistryUrl()); - assertThat(2).isEqualTo(records.size()); - assertThat(records).extracting(record -> record.get("name").toString()).contains(name1).contains(name2); + assertThat(records).extracting(record -> record.get("name").toString()) + .contains(name + "1") + .contains(name + "2"); } @Test diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 74c9c9ab9..30c9e3d0f 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -52,8 +52,8 @@ public final class SourceRecordIterator implements Iterator<List<AivenS3SourceRe public static final String PATTERN_PARTITION_KEY = "partitionId"; public static final String OFFSET_KEY = "offset"; - public static final Pattern FILE_DEFAULT_PATTERN = Pattern.compile( - "(?<topicName>[^/]+?)-" + "(?<partitionId>\\d{5})-" + "(?<timestamp>\\d+)" + "\\.(?<fileExtension>[^.]+)$"); // topic-00001.txt + public static final Pattern FILE_DEFAULT_PATTERN = Pattern.compile("(?<topicName>[^/]+?)-" + + "(?<partitionId>\\d{5})-" + "(?<uniqueId>[a-zA-Z0-9]+)" + "\\.(?<fileExtension>[^.]+)$"); // topic-00001.txt private String currentObjectKey; private Iterator<S3ObjectSummary> nextFileIterator; diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java index 88209a693..dd4619332 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java @@ -91,7 +91,7 @@ void testTemporaryFileDeletion() throws Exception { } private byte[] generateMockParquetData() throws IOException { - final Path path = ContentUtils.getTmpFilePath("name1", "name2"); + final Path path = ContentUtils.getTmpFilePath("name"); return IOUtils.toByteArray(Files.newInputStream(path)); } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java index 328f7fbf7..99671781f 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java @@ -33,17 +33,16 @@ public final class ContentUtils { private ContentUtils() { } - public static Path getTmpFilePath(final String name1, final String name2) throws 
IOException { + public static Path getTmpFilePath(final String name1) throws IOException { final String tmpFile = "users.parquet"; final Path parquetFileDir = Files.createTempDirectory("parquet_tests"); final String parquetFilePath = parquetFileDir.toAbsolutePath() + "/" + tmpFile; - writeParquetFile(parquetFilePath, name1, name2); + writeParquetFile(parquetFilePath, name1); return Paths.get(parquetFilePath); } - public static void writeParquetFile(final String tempFilePath, final String name1, final String name2) - throws IOException { + public static void writeParquetFile(final String tempFilePath, final String name1) throws IOException { // Define the Avro schema final String schemaString = "{" + "\"type\":\"record\"," + "\"name\":\"User\"," + "\"fields\":[" + "{\"name\":\"name\",\"type\":\"string\"}," + "{\"name\":\"age\",\"type\":\"int\"}," @@ -52,25 +51,18 @@ public static void writeParquetFile(final String tempFilePath, final String name // Write the Parquet file try { - writeParquetFile(tempFilePath, schema, name1, name2); + writeParquetFile(tempFilePath, schema, name1, 100); } catch (IOException e) { throw new ConnectException("Error writing parquet file"); } } + @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") private static void writeParquetFile(final String outputPath, final Schema schema, final String name1, - final String name2) throws IOException { + final int numOfRecords) throws IOException { // Create sample records - final GenericData.Record user1 = new GenericData.Record(schema); - user1.put("name", name1); - user1.put("age", 30); - user1.put("email", name1 + "@test"); - - final GenericData.Record user2 = new GenericData.Record(schema); - user2.put("name", name2); - user2.put("age", 25); - user2.put("email", name2 + "@test"); + GenericData.Record user; // Create a Parquet writer final OutputFile outputFile = new LocalOutputFile(Paths.get(outputPath)); @@ -81,8 +73,15 @@ private static void writeParquetFile(final String outputPath, final Schema schem .withPageSize(1024 * 1024) .build()) { // Write records to the Parquet file - writer.write(user1); - writer.write(user2); + for (int i = 0; i < numOfRecords; i++) { + user = new GenericData.Record(schema); + user.put("name", name1 + i); + user.put("age", 30); + user.put("email", name1 + "@test"); + + writer.write(user); + } + } } } From 816a39393daf35426556d24982faf8b686f2bd0b Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Mon, 14 Oct 2024 18:53:30 +0200 Subject: [PATCH 45/90] fix offset maps on final source recs --- .../connect/s3/source/IntegrationTest.java | 25 ++++++++++++------ .../s3/source/config/S3SourceConfig.java | 5 ++++ .../connect/s3/source/output/JsonWriter.java | 26 +++++++++++++++---- .../s3/source/utils/AivenS3SourceRecord.java | 6 ++++- .../s3/source/utils/OffsetManager.java | 8 +++++- .../s3/source/utils/RecordProcessor.java | 1 + 6 files changed, 56 insertions(+), 15 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 884365f60..2c06d0151 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -25,6 +25,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static 
io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_CONVERTER_SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; import static org.assertj.core.api.Assertions.assertThat; @@ -152,18 +153,20 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx final String testData2 = "Hello, Kafka Connect S3 Source! object 2"; // write 2 objects to s3 + writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000"); + writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000"); writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00001"); - writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00002"); + writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00001"); writeToS3(topicName, new byte[0], "00003"); // this should be ignored. final List<String> objects = testBucketAccessor.listObjects(); - assertThat(objects.size()).isEqualTo(3); + assertThat(objects.size()).isEqualTo(5); // Verify that the connector is correctly set up assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); // Poll messages from the Kafka topic and verify the consumed data - final List<String> records = IntegrationBase.consumeMessages(topicName, 2, KAFKA_CONTAINER); + final List<String> records = IntegrationBase.consumeMessages(topicName, 4, KAFKA_CONTAINER); // Verify that the correct data is read from the S3 bucket and pushed to Kafka assertThat(records).contains(testData1).contains(testData2); @@ -190,7 +193,7 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.AVRO.getValue()); connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); - connectorConfig.put("value.converter.schema.registry.url", SCHEMA_REGISTRY.getSchemaRegistryUrl()); + connectorConfig.put(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); connectorConfig.put(VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); connectRunner.createConnector(connectorConfig); @@ -269,12 +272,18 @@ void jsonTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc connectorConfig.put("value.converter", "org.apache.kafka.connect.json.JsonConverter"); connectRunner.createConnector(connectorConfig); - final String testMessage = "This is a test"; - final String jsonContent = "{\"message\": \"" + testMessage + "\", \"id\":\"1\"}"; - writeToS3(topicName, jsonContent.getBytes(StandardCharsets.UTF_8), "00001"); + final String testMessage = "This is a test "; + final StringBuilder jsonBuilder = new StringBuilder(); + for (int i = 0; i < 500; i++) { + final String jsonContent = "{\"message\": \"" + testMessage + "\", \"id\":\"" + i + "\"}"; + jsonBuilder.append(jsonContent).append("\n"); // NOPMD + } + final byte[] jsonBytes = jsonBuilder.toString().getBytes(StandardCharsets.UTF_8); + + writeToS3(topicName, jsonBytes, "00001"); // Poll Json messages from the Kafka topic and deserialize them - final List<JsonNode> records = IntegrationBase.consumeJsonMessages(topicName, 1, KAFKA_CONTAINER); + final List<JsonNode> records = IntegrationBase.consumeJsonMessages(topicName, 500, KAFKA_CONTAINER); 
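
As an aside for readers of the test above: the payload it uploads is newline-delimited JSON, one object per line, which is why a single S3 object yields 500 consumed messages. A minimal standalone sketch (class and variable names are illustrative only, not part of the connector code) of building such a payload and re-parsing it line by line with Jackson, mirroring the approach the reworked JsonWriter below takes:

```java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class NdjsonSketch {
    public static void main(final String[] args) throws IOException {
        final ObjectMapper mapper = new ObjectMapper();

        // Build a newline-delimited payload, one JSON object per line, like the test writes to S3.
        final StringBuilder payload = new StringBuilder();
        for (int i = 0; i < 3; i++) {
            payload.append("{\"message\": \"This is a test \", \"id\":\"").append(i).append("\"}\n");
        }

        // Re-parse it line by line; each non-empty line becomes one record.
        final List<JsonNode> records = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new StringReader(payload.toString()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.trim().isEmpty()) {
                    records.add(mapper.readTree(line.trim()));
                }
            }
        }
        System.out.println(records.size() + " records parsed"); // prints: 3 records parsed
    }
}
```

Each parsed line becomes its own Kafka record, which is why the test writes 500 lines and then expects 500 messages back.
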
assertThat(records).extracting(record -> record.get("payload").get("message").asText()).contains(testMessage); assertThat(records).extracting(record -> record.get("payload").get("id").asText()).contains("1"); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index e30f23d8f..eae52da31 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -73,6 +73,8 @@ final public class S3SourceConfig extends AbstractConfig { public static final int AWS_S3_RETRY_BACKOFF_DELAY_MS_DEFAULT = 100; public static final int AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_DEFAULT = 20_000; public static final String SCHEMA_REGISTRY_URL = "schema.registry.url"; + + public static final String VALUE_CONVERTER_SCHEMA_REGISTRY_URL = "value.converter.schema.registry.url"; public static final String VALUE_SERIALIZER = "value.serializer"; public static final String AWS_ACCESS_KEY_ID_CONFIG = "aws.access.key.id"; public static final String AWS_SECRET_ACCESS_KEY_CONFIG = "aws.secret.access.key"; @@ -144,6 +146,9 @@ private static void addSchemaRegistryGroup(final ConfigDef configDef) { configDef.define(SCHEMA_REGISTRY_URL, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", GROUP_OTHER, srCounter++, ConfigDef.Width.NONE, SCHEMA_REGISTRY_URL); + configDef.define(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, ConfigDef.Type.STRING, null, + new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", GROUP_OTHER, + srCounter++, ConfigDef.Width.NONE, VALUE_CONVERTER_SCHEMA_REGISTRY_URL); configDef.define(OUTPUT_FORMAT_KEY, ConfigDef.Type.STRING, OutputFormat.BYTES.getValue(), new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "Output format avro/json/parquet/bytes", GROUP_OTHER, srCounter++, // NOPMD diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java index 8d4b91482..73b0c42f5 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java @@ -18,8 +18,11 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMAS_ENABLE; +import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -42,12 +45,25 @@ public void configureValueConverter(final Map<String, String> config, final S3So @Override public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition) { final List<Object> jsonNodeList = new ArrayList<>(); - final JsonNode jsonNode; - try { - jsonNode = objectMapper.readTree(inputStream); - jsonNodeList.add(jsonNode); + JsonNode jsonNode; + try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { + String line = reader.readLine(); + while (line != null) { + line = line.trim(); + if (!line.isEmpty()) { + try { + // Parse each line as a separate JSON object + jsonNode = 
objectMapper.readTree(line.trim()); // Parse the current line into a JsonNode + jsonNodeList.add(jsonNode); // Add parsed JSON object to the list + } catch (IOException e) { + LOGGER.error("Error parsing JSON record from S3 input stream: " + e.getMessage()); + } + } + + line = reader.readLine(); + } } catch (IOException e) { - LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + LOGGER.error("Error reading S3 object stream: " + e.getMessage()); } return jsonNodeList; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java index 13b325506..1857dc4be 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java @@ -27,7 +27,7 @@ public class AivenS3SourceRecord { private final Map<String, Object> partitionMap; - private final Map<String, Object> offsetMap; + private Map<String, Object> offsetMap; private final String toTopic; private final Integer topicPartition; private final byte[] recordKey; @@ -76,6 +76,10 @@ public String getObjectKey() { return objectKey; } + public void setOffsetMap(final Map<String, Object> offsetMap) { + this.offsetMap = new HashMap<>(offsetMap); + } + public SourceRecord getSourceRecord(final String topic, final Optional<SchemaAndValue> keyData, final SchemaAndValue schemaAndValue) { return new SourceRecord(getPartitionMap(), getOffsetMap(), topic, partition(), diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index 714334782..7716b5d85 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -99,7 +99,13 @@ public Map<String, Object> getOffsetValueMap(final String currentObjectKey, fina } void updateCurrentOffsets(final Map<String, Object> partitionMap, final Map<String, Object> offsetValueMap) { - offsets.put(partitionMap, offsetValueMap); + if (offsets.containsKey(partitionMap)) { + final Map<String, Object> offsetMap = new HashMap<>(offsets.get(partitionMap)); + offsetMap.putAll(offsetValueMap); + offsets.put(partitionMap, offsetMap); + } else { + offsets.put(partitionMap, offsetValueMap); + } } private static Set<Integer> parsePartitions(final S3SourceConfig s3SourceConfig) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index f9a162925..9669d77bd 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -82,6 +82,7 @@ static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRecord> ai final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); offsetManager.updateCurrentOffsets(aivenS3SourceRecord.getPartitionMap(), aivenS3SourceRecord.getOffsetMap()); + aivenS3SourceRecord.setOffsetMap(offsetManager.getOffsets().get(aivenS3SourceRecord.getPartitionMap())); 
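
As an aside on the updateCurrentOffsets change a few hunks above: the per-partition offset map now accumulates one entry per S3 object instead of being replaced wholesale. A standalone sketch of that merge semantics (the "object_key_" prefix here is made up for illustration; the real key is built from OBJECT_KEY and the separator):

```java
import java.util.HashMap;
import java.util.Map;

public class OffsetMergeSketch {
    public static void main(final String[] args) {
        // Existing offsets for one partition: one entry per already-seen S3 object.
        final Map<String, Object> existing = new HashMap<>();
        existing.put("object_key_topic-00001-111.txt", 7L);

        // Offset produced while processing a different object under the same partition.
        final Map<String, Object> update = new HashMap<>();
        update.put("object_key_topic-00001-222.txt", 3L);

        // Merge rather than replace, so progress on the first object is not lost.
        final Map<String, Object> merged = new HashMap<>(existing);
        merged.putAll(update);

        System.out.println(merged.keySet()); // both object keys are retained
    }
}
```
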
sourceRecordList.add(aivenS3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue)); } catch (DataException e) { LOGGER.error("Error in reading s3 object stream " + e.getMessage()); From b398804403ae98150789ed11d5e79ee4752ce975 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Tue, 15 Oct 2024 21:05:13 +0200 Subject: [PATCH 46/90] Fix iterator --- .../kafka/connect/s3/source/S3SourceTask.java | 8 +- .../s3/source/utils/RecordProcessor.java | 55 ++++----- .../s3/source/utils/SourceRecordIterator.java | 78 +++++------- .../connect/s3/source/S3SourceTaskTest.java | 21 ++-- .../s3/source/output/AvroWriterTest.java | 3 - .../s3/source/utils/RecordProcessorTest.java | 14 +-- .../utils/SourceRecordIteratorTest.java | 112 ++++++++++++++++++ 7 files changed, 189 insertions(+), 102 deletions(-) create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 34d099104..99cdaf953 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -40,6 +40,7 @@ import io.aiven.kafka.connect.s3.source.output.OutputWriter; import io.aiven.kafka.connect.s3.source.output.OutputWriterFactory; import io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord; +import io.aiven.kafka.connect.s3.source.utils.FileReader; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import io.aiven.kafka.connect.s3.source.utils.RecordProcessor; import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; @@ -54,7 +55,7 @@ * S3SourceTask is a Kafka Connect SourceTask implementation that reads from source-s3 buckets and generates Kafka * Connect records. 
*/ -@SuppressWarnings("PMD.TooManyMethods") +@SuppressWarnings({ "PMD.TooManyMethods", "PMD.ExcessiveImports" }) public class S3SourceTask extends SourceTask { private static final Logger LOGGER = LoggerFactory.getLogger(S3SourceTask.class); @@ -71,7 +72,7 @@ public class S3SourceTask extends SourceTask { private S3SourceConfig s3SourceConfig; private AmazonS3 s3Client; - private Iterator<List<AivenS3SourceRecord>> sourceRecordIterator; + private Iterator<AivenS3SourceRecord> sourceRecordIterator; private Optional<Converter> keyConverter; private Converter valueConverter; @@ -132,8 +133,9 @@ private void initializeS3Client() { } private void prepareReaderFromOffsetStorageReader() { + final FileReader fileReader = new FileReader(s3SourceConfig, this.s3Bucket, failedObjectKeys); sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, s3Client, this.s3Bucket, offsetManager, - this.outputWriter, failedObjectKeys); + this.outputWriter, fileReader); } @Override diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 9669d77bd..f523b9e86 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -16,7 +16,6 @@ package io.aiven.kafka.connect.s3.source.utils; -import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -43,7 +42,8 @@ public final class RecordProcessor { private RecordProcessor() { } - public static List<SourceRecord> processRecords(final Iterator<List<AivenS3SourceRecord>> sourceRecordIterator, + + public static List<SourceRecord> processRecords(final Iterator<AivenS3SourceRecord> sourceRecordIterator, final List<SourceRecord> results, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, final AtomicBoolean connectorStopped, final OutputWriter outputWriter, final Set<String> failedObjectKeys, @@ -53,44 +53,39 @@ public static List<SourceRecord> processRecords(final Iterator<List<AivenS3Sourc final int maxPollRecords = s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS); for (int i = 0; sourceRecordIterator.hasNext() && i < maxPollRecords && !connectorStopped.get(); i++) { - final List<AivenS3SourceRecord> recordList = sourceRecordIterator.next(); - final List<SourceRecord> sourceRecords = createSourceRecords(recordList, s3SourceConfig, keyConverter, - valueConverter, conversionConfig, outputWriter, failedObjectKeys, offsetManager); - results.addAll(sourceRecords); + final AivenS3SourceRecord aivenS3SourceRecord = sourceRecordIterator.next(); + if (aivenS3SourceRecord != null) { + final SourceRecord sourceRecord = createSourceRecord(aivenS3SourceRecord, s3SourceConfig, keyConverter, + valueConverter, conversionConfig, outputWriter, failedObjectKeys, offsetManager); + results.add(sourceRecord); + } } LOGGER.info("Number of records sent {}", results.size()); return results; } - @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") - static List<SourceRecord> createSourceRecords(final List<AivenS3SourceRecord> aivenS3SourceRecordList, + static SourceRecord createSourceRecord(final AivenS3SourceRecord aivenS3SourceRecord, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, final Map<String, String> conversionConfig, 
final OutputWriter outputWriter, final Set<String> failedObjectKeys, final OffsetManager offsetManager) { - final List<SourceRecord> sourceRecordList = new ArrayList<>(); - for (final AivenS3SourceRecord aivenS3SourceRecord : aivenS3SourceRecordList) { - LOGGER.info(" ******* CSR key ******** {}", aivenS3SourceRecord.getObjectKey()); - final String topic = aivenS3SourceRecord.getToTopic(); - final Optional<SchemaAndValue> keyData = keyConverter - .map(c -> c.toConnectData(topic, aivenS3SourceRecord.key())); - - outputWriter.configureValueConverter(conversionConfig, s3SourceConfig); - valueConverter.configure(conversionConfig, false); - try { - final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); - offsetManager.updateCurrentOffsets(aivenS3SourceRecord.getPartitionMap(), - aivenS3SourceRecord.getOffsetMap()); - aivenS3SourceRecord.setOffsetMap(offsetManager.getOffsets().get(aivenS3SourceRecord.getPartitionMap())); - sourceRecordList.add(aivenS3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue)); - } catch (DataException e) { - LOGGER.error("Error in reading s3 object stream " + e.getMessage()); - failedObjectKeys.add(aivenS3SourceRecord.getObjectKey()); - throw e; - } + final String topic = aivenS3SourceRecord.getToTopic(); + final Optional<SchemaAndValue> keyData = keyConverter + .map(c -> c.toConnectData(topic, aivenS3SourceRecord.key())); + + outputWriter.configureValueConverter(conversionConfig, s3SourceConfig); + valueConverter.configure(conversionConfig, false); + try { + final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); + offsetManager.updateCurrentOffsets(aivenS3SourceRecord.getPartitionMap(), + aivenS3SourceRecord.getOffsetMap()); + aivenS3SourceRecord.setOffsetMap(offsetManager.getOffsets().get(aivenS3SourceRecord.getPartitionMap())); + return aivenS3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue); + } catch (DataException e) { + LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + failedObjectKeys.add(aivenS3SourceRecord.getObjectKey()); + throw e; } - - return sourceRecordList; } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 30c9e3d0f..6c9b2c3a9 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -16,6 +16,8 @@ package io.aiven.kafka.connect.s3.source.utils; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; + import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; @@ -25,7 +27,6 @@ import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -46,7 +47,7 @@ * Iterator that processes S3 files and creates Kafka source records. Supports different output formats (Avro, JSON, * Parquet). 
*/ -public final class SourceRecordIterator implements Iterator<List<AivenS3SourceRecord>> { +public final class SourceRecordIterator implements Iterator<AivenS3SourceRecord> { private static final Logger LOGGER = LoggerFactory.getLogger(SourceRecordIterator.class); public static final String PATTERN_TOPIC_KEY = "topicName"; public static final String PATTERN_PARTITION_KEY = "partitionId"; @@ -57,7 +58,7 @@ public final class SourceRecordIterator implements Iterator<List<AivenS3SourceRe private String currentObjectKey; private Iterator<S3ObjectSummary> nextFileIterator; - private Iterator<List<ConsumerRecord<byte[], byte[]>>> recordIterator = Collections.emptyIterator(); + private Iterator<ConsumerRecord<byte[], byte[]>> recordIterator = Collections.emptyIterator(); private final OffsetManager offsetManager; @@ -67,14 +68,17 @@ public final class SourceRecordIterator implements Iterator<List<AivenS3SourceRe private final OutputWriter outputWriter; + private final FileReader fileReader; // NOPMD + public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, - final OffsetManager offsetManager, final OutputWriter outputWriter, final Set<String> failedObjectKeys) { + final OffsetManager offsetManager, final OutputWriter outputWriter, final FileReader fileReader) { this.s3SourceConfig = s3SourceConfig; this.offsetManager = offsetManager; this.s3Client = s3Client; this.bucketName = bucketName; this.outputWriter = outputWriter; - final FileReader fileReader = new FileReader(s3SourceConfig, bucketName, failedObjectKeys); + this.fileReader = fileReader; + // final FileReader fileReader = new FileReader(s3SourceConfig, bucketName, failedObjectKeys); try { final List<S3ObjectSummary> chunks = fileReader.fetchObjectSummaries(s3Client); nextFileIterator = chunks.iterator(); @@ -98,7 +102,7 @@ private void nextS3Object() { } } - private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentFile() throws IOException { + private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() throws IOException { try (S3Object s3Object = s3Client.getObject(bucketName, currentObjectKey); S3ObjectInputStream inputStream = s3Object.getObjectContent()) { @@ -128,11 +132,12 @@ private Iterator<List<ConsumerRecord<byte[], byte[]>>> createIteratorForCurrentF } @SuppressWarnings("PMD.CognitiveComplexity") - private Iterator<List<ConsumerRecord<byte[], byte[]>>> getObjectIterator(final InputStream valueInputStream, + private Iterator<ConsumerRecord<byte[], byte[]>> getObjectIterator(final InputStream valueInputStream, final String topic, final int topicPartition, final long startOffset, final OutputWriter outputWriter, final Map<String, Object> partitionMap) { return new Iterator<>() { - private List<ConsumerRecord<byte[], byte[]>> nextRecord = readNext(); + private final Iterator<ConsumerRecord<byte[], byte[]>> internalIterator = readNext().iterator(); + private List<ConsumerRecord<byte[], byte[]>> readNext() { final Optional<byte[]> optionalKeyBytes = Optional.ofNullable(currentObjectKey) @@ -142,7 +147,7 @@ private List<ConsumerRecord<byte[], byte[]>> readNext() { int numOfProcessedRecs = 1; boolean checkOffsetMap = true; for (final Object record : outputWriter.getRecords(valueInputStream, topic, topicPartition)) { - final byte[] valueBytes = outputWriter.getValueBytes(record, topic, s3SourceConfig); + if (offsetManager.getOffsets().containsKey(partitionMap) && checkOffsetMap) { final Map<String, Object> offsetVal = 
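
As an aside, the skip logic that follows resumes a partially processed object by discarding the first N records, where N is the offset stored under that object's key. In isolation the idea looks like this (standalone sketch with simplified names, not the connector's actual code):

```java
import java.util.Iterator;
import java.util.List;

public class ResumeSketch {
    // Skip records that were already committed for this object, then return the remainder.
    static <T> Iterator<T> resumeFrom(final List<T> recordsInObject, final long alreadyProcessed) {
        final Iterator<T> iterator = recordsInObject.iterator();
        for (long skipped = 0; skipped < alreadyProcessed && iterator.hasNext(); skipped++) {
            iterator.next(); // emitted in an earlier poll; do not send again
        }
        return iterator;
    }

    public static void main(final String[] args) {
        // With a stored offset of 2, only the third and fourth records are emitted again.
        resumeFrom(List.of("rec1", "rec2", "rec3", "rec4"), 2L).forEachRemaining(System.out::println);
    }
}
```

The real loop additionally caps each batch at MAX_POLL_RECORDS, as the continuation of this hunk shows.
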
offsetManager.getOffsets().get(partitionMap); if (offsetVal.containsKey(offsetManager.getObjectMapKey(currentObjectKey))) { @@ -154,10 +159,14 @@ private List<ConsumerRecord<byte[], byte[]>> readNext() { } } } - + final byte[] valueBytes = outputWriter.getValueBytes(record, topic, s3SourceConfig); checkOffsetMap = false; consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, topic, topicPartition, offsetManager, startOffset, partitionMap)); + if (consumerRecordList.size() >= s3SourceConfig.getInt(MAX_POLL_RECORDS)) { + break; + } + numOfProcessedRecs++; } return consumerRecordList; @@ -184,19 +193,12 @@ private ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> @Override public boolean hasNext() { - return !nextRecord.isEmpty(); + return internalIterator.hasNext(); } @Override - public List<ConsumerRecord<byte[], byte[]>> next() { - if (nextRecord.isEmpty()) { - // LOGGER.error("May be error in reading s3 object " + currentObjectKey); - return Collections.emptyList(); - // throw new NoSuchElementException(); - } - final List<ConsumerRecord<byte[], byte[]>> currentRecord = nextRecord; - nextRecord = Collections.emptyList(); - return currentRecord; + public ConsumerRecord<byte[], byte[]> next() { + return internalIterator.next(); } }; } @@ -207,40 +209,24 @@ public boolean hasNext() { } @Override - @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") - public List<AivenS3SourceRecord> next() { + public AivenS3SourceRecord next() { if (!recordIterator.hasNext()) { nextS3Object(); } if (!recordIterator.hasNext()) { - // If there are still no records, return an empty list - return Collections.emptyList(); // or new ArrayList<>() for mutable list - } - - final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = recordIterator.next(); - if (consumerRecordList.isEmpty()) { - // LOGGER.error("May be error in reading s3 object " + currentObjectKey); - return Collections.emptyList(); - // throw new NoSuchElementException(); + // If there are still no records, return null or throw an exception + return null; // Or throw new NoSuchElementException(); } - final List<AivenS3SourceRecord> aivenS3SourceRecordList = new ArrayList<>(); - AivenS3SourceRecord aivenS3SourceRecord; - Map<String, Object> offsetMap; - Map<String, Object> partitionMap; - for (final ConsumerRecord<byte[], byte[]> currentRecord : consumerRecordList) { - partitionMap = ConnectUtils.getPartitionMap(currentRecord.topic(), currentRecord.partition(), bucketName); - - offsetMap = offsetManager.getOffsetValueMap(currentObjectKey, currentRecord.offset()); - - aivenS3SourceRecord = new AivenS3SourceRecord(partitionMap, offsetMap, currentRecord.topic(), - currentRecord.partition(), currentRecord.key(), currentRecord.value(), currentObjectKey); - - aivenS3SourceRecordList.add(aivenS3SourceRecord); - } + final ConsumerRecord<byte[], byte[]> consumerRecord = recordIterator.next(); + final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(consumerRecord.topic(), + consumerRecord.partition(), bucketName); + final Map<String, Object> offsetMap = offsetManager.getOffsetValueMap(currentObjectKey, + consumerRecord.offset()); - return Collections.unmodifiableList(aivenS3SourceRecordList); + return new AivenS3SourceRecord(partitionMap, offsetMap, consumerRecord.topic(), consumerRecord.partition(), + consumerRecord.key(), consumerRecord.value(), currentObjectKey); } @Override diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java 
b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index eb207dc75..d0a626a76 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -23,7 +23,6 @@ import static org.mockito.Mockito.when; import java.lang.reflect.Field; -import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -152,11 +151,11 @@ void testPoll() throws Exception { setPrivateField(s3SourceTask, "sourceRecordIterator", mockSourceRecordIterator); when(mockSourceRecordIterator.hasNext()).thenReturn(true).thenReturn(true).thenReturn(false); - final List<AivenS3SourceRecord> aivenS3SourceRecordList = getAivenS3SourceRecords(); + final AivenS3SourceRecord aivenS3SourceRecordList = getAivenS3SourceRecord(); when(mockSourceRecordIterator.next()).thenReturn(aivenS3SourceRecordList); final List<SourceRecord> sourceRecordList = s3SourceTask.poll(); - assertThat(sourceRecordList).hasSize(2); + assertThat(sourceRecordList).isNotEmpty(); } @Test @@ -170,15 +169,13 @@ void testStop() { assertThat(s3SourceTask.getConnectorStopped()).isTrue(); } - private static List<AivenS3SourceRecord> getAivenS3SourceRecords() { - final List<AivenS3SourceRecord> aivenS3SourceRecordList = new ArrayList<>(); - final AivenS3SourceRecord aivenS3SourceRecord1 = new AivenS3SourceRecord(new HashMap<>(), new HashMap<>(), - "testtopic", 0, new byte[0], new byte[0], ""); - aivenS3SourceRecordList.add(aivenS3SourceRecord1); - final AivenS3SourceRecord aivenS3SourceRecord2 = new AivenS3SourceRecord(new HashMap<>(), new HashMap<>(), - "testtopic", 1, new byte[0], new byte[0], ""); - aivenS3SourceRecordList.add(aivenS3SourceRecord2); - return aivenS3SourceRecordList; + private static AivenS3SourceRecord getAivenS3SourceRecord() { + // final List<AivenS3SourceRecord> aivenS3SourceRecordList = new ArrayList<>(); + // aivenS3SourceRecordList.add(aivenS3SourceRecord1); + // final AivenS3SourceRecord aivenS3SourceRecord2 = new AivenS3SourceRecord(new HashMap<>(), new HashMap<>(), + // "testtopic", 1, new byte[0], new byte[0], ""); + // aivenS3SourceRecordList.add(aivenS3SourceRecord2); + return new AivenS3SourceRecord(new HashMap<>(), new HashMap<>(), "testtopic", 0, new byte[0], new byte[0], ""); } @SuppressWarnings("PMD.AvoidAccessibilityAlteration") diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java index 9e1c6e958..c902ffeea 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java @@ -30,7 +30,6 @@ import java.util.Map; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import org.apache.avro.Schema; import org.apache.avro.file.DataFileWriter; @@ -49,8 +48,6 @@ final class AvroWriterTest { @Mock private S3SourceConfig s3SourceConfig; - @Mock - private OffsetManager offsetManager; private AvroWriter avroWriter; private Map<String, String> config; diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java index 
443e05932..be2e273be 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -62,7 +62,7 @@ class RecordProcessorTest { private OffsetManager offsetManager; private AtomicBoolean connectorStopped; - private Iterator<List<AivenS3SourceRecord>> sourceRecordIterator; + private Iterator<AivenS3SourceRecord> sourceRecordIterator; @BeforeEach void setUp() { @@ -96,8 +96,7 @@ void testProcessRecordsWithRecords() throws ConnectException { when(sourceRecordIterator.hasNext()).thenReturn(true, false); // One iteration with records final AivenS3SourceRecord mockRecord = mock(AivenS3SourceRecord.class); - final List<AivenS3SourceRecord> recordList = Collections.singletonList(mockRecord); - when(sourceRecordIterator.next()).thenReturn(recordList); + when(sourceRecordIterator.next()).thenReturn(mockRecord); final List<SourceRecord> results = new ArrayList<>(); RecordProcessor.processRecords( @@ -115,7 +114,7 @@ void testProcessRecordsWithRecords() throws ConnectException { } @Test - void testProcessRecordsConnectorStopped() throws ConnectException { + void testProcessRecordsConnectorStopped() { when(s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS)).thenReturn(5); connectorStopped.set(true); // Simulate connector stopped @@ -135,7 +134,7 @@ void testProcessRecordsConnectorStopped() throws ConnectException { } @Test - void testCreateSourceRecords() throws ConnectException { + void testCreateSourceRecords() { final AivenS3SourceRecord mockRecord = mock(AivenS3SourceRecord.class); when(mockRecord.getToTopic()).thenReturn("test-topic"); when(mockRecord.key()).thenReturn("mock-key".getBytes(StandardCharsets.UTF_8)); @@ -145,11 +144,10 @@ void testCreateSourceRecords() throws ConnectException { .thenReturn(new SchemaAndValue(null, "mock-value-converted")); when(mockRecord.getSourceRecord(anyString(), any(), any())).thenReturn(mock(SourceRecord.class)); - final List<AivenS3SourceRecord> recordList = Collections.singletonList(mockRecord); - final List<SourceRecord> sourceRecords = RecordProcessor.createSourceRecords(recordList, s3SourceConfig, + final SourceRecord sourceRecords = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, Optional.of(keyConverter), valueConverter, new HashMap<>(), outputWriter, Collections.emptySet(), offsetManager); - assertThat(sourceRecords.size()).isEqualTo(1); + assertThat(sourceRecords).isNotNull(); } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java new file mode 100644 index 000000000..4a4fe9c7b --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -0,0 +1,112 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source.utils; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.anyInt; +import static org.mockito.Mockito.anyString; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.List; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.output.OutputWriter; + +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.ListObjectsV2Result; +import com.amazonaws.services.s3.model.S3Object; +import com.amazonaws.services.s3.model.S3ObjectInputStream; +import com.amazonaws.services.s3.model.S3ObjectSummary; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +final class SourceRecordIteratorTest { + + private AmazonS3 mockS3Client; + private S3SourceConfig mockConfig; + private OffsetManager mockOffsetManager; + private OutputWriter mockOutputWriter; + + private FileReader mockFileReader; + + @BeforeEach + public void setUp() { + mockS3Client = mock(AmazonS3.class); + mockConfig = mock(S3SourceConfig.class); + mockOffsetManager = mock(OffsetManager.class); + mockOutputWriter = mock(OutputWriter.class); + mockFileReader = mock(FileReader.class); + } + + @Test + void testIteratorProcessesS3Objects() throws Exception { + final S3ObjectSummary mockSummary = new S3ObjectSummary(); + mockSummary.setKey("topic-00001-abc123.txt"); + + // Mock list of S3 object summaries + final List<S3ObjectSummary> mockObjectSummaries = Collections.singletonList(mockSummary); + final ListObjectsV2Result result = mockListObjectsResult(mockObjectSummaries); + when(mockS3Client.listObjectsV2(anyString())).thenReturn(result); + + // Mock S3Object and InputStream + try (S3Object mockS3Object = mock(S3Object.class); + S3ObjectInputStream mockInputStream = new S3ObjectInputStream(new ByteArrayInputStream(new byte[] {}), + null);) { + when(mockS3Client.getObject(anyString(), anyString())).thenReturn(mockS3Object); + when(mockS3Object.getObjectContent()).thenReturn(mockInputStream); + + when(mockOutputWriter.getRecords(any(), anyString(), anyInt())) + .thenReturn(Collections.singletonList(new Object())); + + final String outStr = "this is a test"; + when(mockOutputWriter.getValueBytes(any(), anyString(), any())) + .thenReturn(outStr.getBytes(StandardCharsets.UTF_8)); + + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + + when(mockFileReader.fetchObjectSummaries(any())).thenReturn(Collections.emptyList()); + SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockS3Client, "test-bucket", + mockOffsetManager, mockOutputWriter, mockFileReader); + + assertFalse(iterator.hasNext()); + assertNull(iterator.next()); + + when(mockFileReader.fetchObjectSummaries(any())).thenReturn(mockObjectSummaries); + + iterator = new SourceRecordIterator(mockConfig, mockS3Client, "test-bucket", mockOffsetManager, + mockOutputWriter, mockFileReader); + + assertTrue(iterator.hasNext()); + assertNotNull(iterator.next()); + } + + } + + private ListObjectsV2Result mockListObjectsResult(final List<S3ObjectSummary> summaries) { + final 
ListObjectsV2Result result = mock(ListObjectsV2Result.class); + when(result.getObjectSummaries()).thenReturn(summaries); + return result; + } +} From fee503ee84356e2aceb1b2a013d2f36e2e0c6999 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Tue, 15 Oct 2024 21:53:01 +0200 Subject: [PATCH 47/90] Introduce max message bytes config for bytes format --- .../s3/source/config/S3SourceConfig.java | 7 ++++ .../connect/s3/source/output/AvroWriter.java | 3 +- .../s3/source/output/ByteArrayWriter.java | 36 ++++++++++++++----- .../connect/s3/source/output/JsonWriter.java | 3 +- .../s3/source/output/OutputWriter.java | 2 +- .../s3/source/output/ParquetWriter.java | 3 +- .../s3/source/utils/SourceRecordIterator.java | 3 +- .../s3/source/output/JsonWriterTest.java | 9 ++--- .../s3/source/output/ParquetWriterTest.java | 11 ++++-- .../utils/SourceRecordIteratorTest.java | 2 +- 10 files changed, 57 insertions(+), 22 deletions(-) diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index eae52da31..c138fd38a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -86,6 +86,8 @@ final public class S3SourceConfig extends AbstractConfig { public static final String TARGET_TOPICS = "topics"; public static final String FETCH_PAGE_SIZE = "aws.s3.fetch.page.size"; public static final String MAX_POLL_RECORDS = "max.poll.records"; + + public static final String MAX_MESSAGE_BYTES_SIZE = "max.message.bytes"; public static final String KEY_CONVERTER = "key.converter"; public static final String VALUE_CONVERTER = "value.converter"; public static final int S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT = 3; @@ -179,6 +181,11 @@ private static void addOtherConfig(final S3SourceConfigDef configDef) { "Value converter", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD // UnusedAssignment ConfigDef.Width.NONE, VALUE_CONVERTER); + configDef.define(MAX_MESSAGE_BYTES_SIZE, ConfigDef.Type.INT, 1_048_588, ConfigDef.Importance.MEDIUM, + "The largest record batch size allowed by Kafka config max.message.bytes", GROUP_OTHER, + awsOtherGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, MAX_MESSAGE_BYTES_SIZE); } private static void addAwsStsConfigGroup(final ConfigDef configDef) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java index cfce865ca..798f444c4 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java @@ -46,7 +46,8 @@ public void configureValueConverter(final Map<String, String> config, final S3So } @Override - public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition) { + public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition, + final S3SourceConfig s3SourceConfig) { final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); return readAvroRecords(inputStream, datumReader); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java 
b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java index e1052ac8c..038bec3e0 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java @@ -16,14 +16,16 @@ package io.aiven.kafka.connect.s3.source.output; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_MESSAGE_BYTES_SIZE; + import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; import java.util.List; import java.util.Map; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import com.amazonaws.util.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,18 +37,34 @@ public void configureValueConverter(final Map<String, String> config, final S3So } + @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") @Override - public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition) { - return List.of(inputStream); - } + public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition, + final S3SourceConfig s3SourceConfig) { - @Override - public byte[] getValueBytes(final Object record, final String topic, final S3SourceConfig s3SourceConfig) { + final int maxMessageBytesSize = s3SourceConfig.getInt(MAX_MESSAGE_BYTES_SIZE); + final byte[] buffer = new byte[maxMessageBytesSize]; + int bytesRead; + + final List<Object> chunks = new ArrayList<>(); try { - return IOUtils.toByteArray((InputStream) record); + bytesRead = inputStream.read(buffer); + while (bytesRead != -1) { + // Create a byte array with the exact number of bytes read + final byte[] chunk = new byte[bytesRead]; + System.arraycopy(buffer, 0, chunk, 0, bytesRead); + chunks.add(chunk); + bytesRead = inputStream.read(buffer); + } } catch (IOException e) { - LOGGER.error("Error in reading s3 object stream " + e.getMessage()); - return new byte[0]; + LOGGER.error("Error reading from input stream: " + e.getMessage(), e); } + + return chunks; + } + + @Override + public byte[] getValueBytes(final Object record, final String topic, final S3SourceConfig s3SourceConfig) { + return (byte[]) record; } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java index 73b0c42f5..ad95f88a9 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java @@ -43,7 +43,8 @@ public void configureValueConverter(final Map<String, String> config, final S3So } @Override - public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition) { + public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition, + final S3SourceConfig s3SourceConfig) { final List<Object> jsonNodeList = new ArrayList<>(); JsonNode jsonNode; try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java index af648564e..14616f807 100644 --- 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java @@ -31,7 +31,7 @@ public interface OutputWriter { void configureValueConverter(Map<String, String> config, S3SourceConfig s3SourceConfig); - List<Object> getRecords(InputStream inputStream, String topic, int topicPartition); + List<Object> getRecords(InputStream inputStream, String topic, int topicPartition, S3SourceConfig s3SourceConfig); byte[] getValueBytes(Object record, String topic, S3SourceConfig s3SourceConfig); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java index fd5ab11b0..ca5fec032 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java @@ -50,7 +50,8 @@ public void configureValueConverter(final Map<String, String> config, final S3So } @Override - public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition) { + public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition, + final S3SourceConfig s3SourceConfig) { return getParquetRecords(inputStream, topic, topicPartition); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 6c9b2c3a9..a7f91d606 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -146,7 +146,8 @@ private List<ConsumerRecord<byte[], byte[]>> readNext() { int numOfProcessedRecs = 1; boolean checkOffsetMap = true; - for (final Object record : outputWriter.getRecords(valueInputStream, topic, topicPartition)) { + for (final Object record : outputWriter.getRecords(valueInputStream, topic, topicPartition, + s3SourceConfig)) { if (offsetManager.getOffsets().containsKey(partitionMap) && checkOffsetMap) { final Map<String, Object> offsetVal = offsetManager.getOffsets().get(partitionMap); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonWriterTest.java index e33a2da1e..2c4bbc52f 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonWriterTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonWriterTest.java @@ -65,8 +65,8 @@ void testConfigureValueConverter() { void testHandleValueDataWithValidJson() { final InputStream validJsonInputStream = new ByteArrayInputStream( "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); - - final List<Object> jsonNodes = jsonWriter.getRecords(validJsonInputStream, "testtopic", 1); + final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); + final List<Object> jsonNodes = jsonWriter.getRecords(validJsonInputStream, "testtopic", 1, s3SourceConfig); assertThat(jsonNodes.size()).isEqualTo(1); } @@ -75,8 +75,9 @@ void testHandleValueDataWithValidJson() { void testHandleValueDataWithInvalidJson() { final 
InputStream invalidJsonInputStream = new ByteArrayInputStream( "invalid-json".getBytes(StandardCharsets.UTF_8)); + final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); - final List<Object> jsonNodes = jsonWriter.getRecords(invalidJsonInputStream, "testtopic", 1); + final List<Object> jsonNodes = jsonWriter.getRecords(invalidJsonInputStream, "testtopic", 1, s3SourceConfig); assertThat(jsonNodes.size()).isEqualTo(0); } @@ -86,7 +87,7 @@ void testSerializeJsonDataValid() throws IOException { final InputStream validJsonInputStream = new ByteArrayInputStream( "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); - final List<Object> jsonNodes = jsonWriter.getRecords(validJsonInputStream, "testtopic", 1); + final List<Object> jsonNodes = jsonWriter.getRecords(validJsonInputStream, "testtopic", 1, s3SourceConfig); final byte[] serializedData = jsonWriter.getValueBytes(jsonNodes.get(0), "testtopic", s3SourceConfig); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java index dd4619332..a3830a585 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java @@ -17,6 +17,7 @@ package io.aiven.kafka.connect.s3.source.output; import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -26,6 +27,7 @@ import java.nio.file.Path; import java.util.List; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; import com.amazonaws.util.IOUtils; @@ -45,10 +47,11 @@ public void setUp() { void testHandleValueDataWithZeroBytes() { final byte[] mockParquetData = new byte[0]; final InputStream inputStream = new ByteArrayInputStream(mockParquetData); + final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); final String topic = "test-topic"; final int topicPartition = 0; - final List<Object> recs = parquetWriter.getRecords(inputStream, topic, topicPartition); + final List<Object> recs = parquetWriter.getRecords(inputStream, topic, topicPartition, s3SourceConfig); assertThat(recs).isEmpty(); } @@ -57,11 +60,12 @@ void testHandleValueDataWithZeroBytes() { void testGetRecordsWithValidData() throws Exception { final byte[] mockParquetData = generateMockParquetData(); final InputStream inputStream = new ByteArrayInputStream(mockParquetData); + final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); final String topic = "test-topic"; final int topicPartition = 0; - final List<Object> records = parquetWriter.getRecords(inputStream, topic, topicPartition); + final List<Object> records = parquetWriter.getRecords(inputStream, topic, topicPartition, s3SourceConfig); assertThat(records).isNotEmpty(); assertThat(records).extracting(record -> ((GenericRecord) record).get("name").toString()) @@ -73,11 +77,12 @@ void testGetRecordsWithValidData() throws Exception { void testGetRecordsWithInvalidData() { final byte[] invalidData = "invalid data".getBytes(StandardCharsets.UTF_8); final InputStream inputStream = new ByteArrayInputStream(invalidData); + final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); final String topic = "test-topic"; final 
int topicPartition = 0; - final List<Object> records = parquetWriter.getRecords(inputStream, topic, topicPartition); + final List<Object> records = parquetWriter.getRecords(inputStream, topic, topicPartition, s3SourceConfig); assertThat(records).isEmpty(); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index 4a4fe9c7b..da77c8b1a 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -77,7 +77,7 @@ void testIteratorProcessesS3Objects() throws Exception { when(mockS3Client.getObject(anyString(), anyString())).thenReturn(mockS3Object); when(mockS3Object.getObjectContent()).thenReturn(mockInputStream); - when(mockOutputWriter.getRecords(any(), anyString(), anyInt())) + when(mockOutputWriter.getRecords(any(), anyString(), anyInt(), any())) .thenReturn(Collections.singletonList(new Object())); final String outStr = "this is a test"; From 1017d35c19d48f96ee5b4229bae364011fa2b387 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Tue, 15 Oct 2024 22:30:49 +0200 Subject: [PATCH 48/90] Add test for byte array with chunks based on max message bytes --- .../connect/s3/source/IntegrationTest.java | 2 + .../s3/source/output/ByteArrayWriter.java | 1 - .../s3/source/utils/SourceRecordIterator.java | 1 - .../s3/source/output/ByteArrayWriterTest.java | 94 +++++++++++++++++++ 4 files changed, 96 insertions(+), 2 deletions(-) create mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriterTest.java diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 2c06d0151..2705ce25a 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -21,6 +21,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_ENDPOINT_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_MESSAGE_BYTES_SIZE; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT_KEY; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; @@ -146,6 +147,7 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + connectorConfig.put(MAX_MESSAGE_BYTES_SIZE, "2"); connectRunner.createConnector(connectorConfig); connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.BYTES.getValue()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java
b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java index 038bec3e0..fabe2cb03 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java @@ -50,7 +50,6 @@ public List<Object> getRecords(final InputStream inputStream, final String topic try { bytesRead = inputStream.read(buffer); while (bytesRead != -1) { - // Create a byte array with the exact number of bytes read final byte[] chunk = new byte[bytesRead]; System.arraycopy(buffer, 0, chunk, 0, bytesRead); chunks.add(chunk); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index a7f91d606..d5b7bd236 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -78,7 +78,6 @@ public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 this.bucketName = bucketName; this.outputWriter = outputWriter; this.fileReader = fileReader; - // final FileReader fileReader = new FileReader(s3SourceConfig, bucketName, failedObjectKeys); try { final List<S3ObjectSummary> chunks = fileReader.fetchObjectSummaries(s3Client); nextFileIterator = chunks.iterator(); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriterTest.java new file mode 100644 index 000000000..1169cef01 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriterTest.java @@ -0,0 +1,94 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source.output; + +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_MESSAGE_BYTES_SIZE; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.when; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.List; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +class ByteArrayWriterTest { + + private ByteArrayWriter byteArrayWriter; + + @Mock + private S3SourceConfig s3SourceConfig; + + @BeforeEach + void setUp() { + MockitoAnnotations.openMocks(this); + byteArrayWriter = new ByteArrayWriter(); + } + + @Test + void testGetRecordsSingleChunk() { + final byte[] data = { 1, 2, 3, 4, 5 }; + final InputStream inputStream = new ByteArrayInputStream(data); + + when(s3SourceConfig.getInt(MAX_MESSAGE_BYTES_SIZE)).thenReturn(10_000); // Larger than data size + + final List<Object> records = byteArrayWriter.getRecords(inputStream, "test-topic", 0, s3SourceConfig); + + assertEquals(1, records.size()); + assertArrayEquals(data, (byte[]) records.get(0)); + } + + @Test + void testGetRecordsMultipleChunks() { + final byte[] data = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; + final InputStream inputStream = new ByteArrayInputStream(data); + + when(s3SourceConfig.getInt(MAX_MESSAGE_BYTES_SIZE)).thenReturn(5); // Smaller than data size + + final List<Object> records = byteArrayWriter.getRecords(inputStream, "test-topic", 0, s3SourceConfig); + + assertEquals(2, records.size()); + assertArrayEquals(new byte[] { 1, 2, 3, 4, 5 }, (byte[]) records.get(0)); + assertArrayEquals(new byte[] { 6, 7, 8, 9, 10 }, (byte[]) records.get(1)); + } + + @Test + void testGetRecordsEmptyInputStream() throws IOException { + final InputStream inputStream = new ByteArrayInputStream(new byte[] {}); + + when(s3SourceConfig.getInt(MAX_MESSAGE_BYTES_SIZE)).thenReturn(5); + + final List<Object> records = byteArrayWriter.getRecords(inputStream, "test-topic", 0, s3SourceConfig); + + assertEquals(0, records.size()); + } + + @Test + void testGetValueBytes() { + final byte[] record = { 1, 2, 3 }; + final byte[] result = byteArrayWriter.getValueBytes(record, "test-topic", s3SourceConfig); + + assertArrayEquals(record, result); + } +} From af7ff5d2d0344f45fbcc14f330c14de4fb4c96d7 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Wed, 16 Oct 2024 18:05:41 +0200 Subject: [PATCH 49/90] Fixes from review --- .../connect/s3/source/IntegrationTest.java | 32 +++----------- .../kafka/connect/s3/source/S3SourceTask.java | 20 +++++---- .../s3/source/config/S3SourceConfig.java | 22 +++------- .../{AvroWriter.java => AvroTransformer.java} | 10 ++--- ...yWriter.java => ByteArrayTransformer.java} | 8 ++-- .../{OutputFormat.java => InputFormat.java} | 6 +-- .../{JsonWriter.java => JsonTransformer.java} | 12 +++-- ...uetWriter.java => ParquetTransformer.java} | 12 ++--- ...putUtils.java => TransformationUtils.java} | 8 ++-- .../{OutputWriter.java => Transformer.java} | 7 +-- ...erFactory.java => TransformerFactory.java} | 24 +++++----- .../s3/source/utils/AivenS3SourceRecord.java | 10 ++--- .../connect/s3/source/utils/FileReader.java | 28 ++++++++++-- .../s3/source/utils/OffsetManager.java | 14 ++++++ .../s3/source/utils/RecordProcessor.java | 
20 ++++----- .../s3/source/utils/SourceRecordIterator.java | 38 +++++++--------- .../connect/s3/source/S3SourceTaskTest.java | 20 ++++----- .../s3/source/config/S3SourceConfigTest.java | 6 +-- ...iterTest.java => AvroTransformerTest.java} | 17 +++---- ...est.java => ByteArrayTransformerTest.java} | 19 ++++---- ...iterTest.java => JsonTransformerTest.java} | 34 ++++++-------- ...rTest.java => ParquetTransformerTest.java} | 11 +++-- .../s3/source/testutils/BucketAccessor.java | 4 +- .../s3/source/utils/FileReaderTest.java | 44 +++++++++++-------- .../s3/source/utils/OffsetManagerTest.java | 5 +-- .../s3/source/utils/RecordProcessorTest.java | 24 +++++----- .../utils/SourceRecordIteratorTest.java | 19 ++++---- 27 files changed, 240 insertions(+), 234 deletions(-) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/{AvroWriter.java => AvroTransformer.java} (89%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/{ByteArrayWriter.java => ByteArrayTransformer.java} (89%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/{OutputFormat.java => InputFormat.java} (86%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/{JsonWriter.java => JsonTransformer.java} (87%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/{ParquetWriter.java => ParquetTransformer.java} (88%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/{OutputUtils.java => TransformationUtils.java} (93%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/{OutputWriter.java => Transformer.java} (87%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/{OutputWriterFactory.java => TransformerFactory.java} (65%) rename s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/{AvroWriterTest.java => AvroTransformerTest.java} (88%) rename s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/{ByteArrayWriterTest.java => ByteArrayTransformerTest.java} (77%) rename s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/{JsonWriterTest.java => JsonTransformerTest.java} (69%) rename s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/{ParquetWriterTest.java => ParquetTransformerTest.java} (91%) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 2705ce25a..dbc3bfba2 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -21,8 +21,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_ENDPOINT_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_MESSAGE_BYTES_SIZE; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT_KEY; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.INPUT_FORMAT_KEY; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static 
io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; @@ -46,7 +45,7 @@ import org.apache.kafka.clients.admin.AdminClient; -import io.aiven.kafka.connect.s3.source.output.OutputFormat; +import io.aiven.kafka.connect.s3.source.output.InputFormat; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; import io.aiven.kafka.connect.s3.source.testutils.S3OutputStream; @@ -147,9 +146,8 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectorConfig.put(MAX_MESSAGE_BYTES_SIZE, "2"); connectRunner.createConnector(connectorConfig); - connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.BYTES.getValue()); + connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; final String testData2 = "Hello, Kafka Connect S3 Source! object 2"; @@ -192,7 +190,7 @@ void multiPartUploadBytesTest(final TestInfo testInfo) throws ExecutionException void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.AVRO.getValue()); + connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.AVRO.getValue()); connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); connectorConfig.put(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); @@ -238,7 +236,7 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc void parquetTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.PARQUET.getValue()); + connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.PARQUET.getValue()); connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); connectorConfig.put("value.converter.schema.registry.url", SCHEMA_REGISTRY.getSchemaRegistryUrl()); @@ -254,7 +252,7 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted try { s3Client.putObject(TEST_BUCKET_NAME, fileName, Files.newInputStream(path), null); } catch (final Exception e) { // NOPMD broad exception caught - LOGGER.error("Error in reading file" + e.getMessage()); + LOGGER.error("Error in reading file {}", e.getMessage(), e); } finally { Files.delete(path); } @@ -270,7 +268,7 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted void jsonTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - 
connectorConfig.put(OUTPUT_FORMAT_KEY, OutputFormat.JSON.getValue()); + connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.JSONL.getValue()); connectorConfig.put("value.converter", "org.apache.kafka.connect.json.JsonConverter"); connectRunner.createConnector(connectorConfig); @@ -319,7 +317,6 @@ private static ByteArrayOutputStream getAvroRecord(final Schema schema, final in private static void writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) throws IOException { final String filePrefix = topicName + "-" + partitionId + "-" + System.currentTimeMillis(); - // final String filePrefix = topicName + "-" + partitionId + "-" + "1234567891010"; final String fileSuffix = ".txt"; final Path testFilePath = File.createTempFile(filePrefix, fileSuffix).toPath(); @@ -371,19 +368,4 @@ public void multipartUpload(final String bucketName, final String key) { LOGGER.error(e.getMessage()); } } - - // private Map<Map<String, Object>, Map<String, Object>> getTmpData() { - // final Map<Map<String, Object>, Map<String, Object>> tmpOffsets = new HashMap<>(); - // final Map<String, Object> partitionKeyMap = new HashMap<>(); - // partitionKeyMap.put("topic", "avroTest"); - // partitionKeyMap.put("bucket", "test-bucket0"); - // partitionKeyMap.put("topicPartition", 1); - // - // final Map<String, Object> offsetValMap = new HashMap<>(); - // offsetValMap.put(OBJECT_KEY + ":" + "avroTest-00001-1234567891010.txt", 4L); - // - // tmpOffsets.put(partitionKeyMap, offsetValMap); - // - // return tmpOffsets; - // } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 99cdaf953..19d229ed7 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -37,8 +37,8 @@ import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.output.OutputWriter; -import io.aiven.kafka.connect.s3.source.output.OutputWriterFactory; +import io.aiven.kafka.connect.s3.source.output.Transformer; +import io.aiven.kafka.connect.s3.source.output.TransformerFactory; import io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord; import io.aiven.kafka.connect.s3.source.utils.FileReader; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; @@ -77,7 +77,7 @@ public class S3SourceTask extends SourceTask { private Converter valueConverter; - private OutputWriter outputWriter; + private Transformer transformer; private String s3Bucket; @@ -87,7 +87,9 @@ public class S3SourceTask extends SourceTask { private final S3ClientFactory s3ClientFactory = new S3ClientFactory(); private final Object pollLock = new Object(); + private FileReader fileReader; private final Set<String> failedObjectKeys = new HashSet<>(); + private final Set<String> inProcessObjectKeys = new HashSet<>(); private OffsetManager offsetManager; @@ -108,8 +110,9 @@ public void start(final Map<String, String> props) { initializeConverters(); initializeS3Client(); this.s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); - this.outputWriter = OutputWriterFactory.getWriter(s3SourceConfig); + this.transformer = TransformerFactory.getWriter(s3SourceConfig); offsetManager = new OffsetManager(context, s3SourceConfig); + fileReader = new FileReader(s3SourceConfig, 
this.s3Bucket, failedObjectKeys, inProcessObjectKeys); prepareReaderFromOffsetStorageReader(); this.taskInitialized = true; } @@ -133,9 +136,8 @@ private void initializeS3Client() { } private void prepareReaderFromOffsetStorageReader() { - final FileReader fileReader = new FileReader(s3SourceConfig, this.s3Bucket, failedObjectKeys); sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, s3Client, this.s3Bucket, offsetManager, - this.outputWriter, fileReader); + this.transformer, fileReader); } @Override @@ -186,7 +188,7 @@ private List<SourceRecord> extractSourceRecords(final List<SourceRecord> results return results; } return RecordProcessor.processRecords(sourceRecordIterator, results, s3SourceConfig, keyConverter, - valueConverter, connectorStopped, this.outputWriter, failedObjectKeys, offsetManager); + valueConverter, connectorStopped, this.transformer, fileReader, offsetManager); } private void waitForObjects() throws InterruptedException { @@ -219,8 +221,8 @@ public Converter getValueConverter() { return valueConverter; } - public OutputWriter getOutputWriter() { - return outputWriter; + public Transformer getOutputWriter() { + return transformer; } public boolean isTaskInitialized() { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index c138fd38a..c704d839b 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -31,7 +31,7 @@ import io.aiven.kafka.connect.common.config.validators.NonEmptyPassword; import io.aiven.kafka.connect.common.config.validators.UrlValidator; -import io.aiven.kafka.connect.s3.source.output.OutputFormat; +import io.aiven.kafka.connect.s3.source.output.InputFormat; import com.amazonaws.auth.AWSCredentialsProvider; import com.amazonaws.regions.Region; @@ -47,17 +47,11 @@ final public class S3SourceConfig extends AbstractConfig { public static final Logger LOGGER = LoggerFactory.getLogger(S3SourceConfig.class); public static final String AWS_S3_PREFIX_CONFIG = "aws.s3.prefix"; - public static final String AWS_S3_RETRY_BACKOFF_DELAY_MS_CONFIG = "aws.s3.backoff.delay.ms"; - public static final String AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_CONFIG = "aws.s3.backoff.max.delay.ms"; - public static final String AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG = "aws.s3.backoff.max.retries"; - public static final String AWS_S3_REGION_CONFIG = "aws.s3.region"; - public static final String AWS_S3_ENDPOINT_CONFIG = "aws.s3.endpoint"; - public static final String AWS_STS_ROLE_ARN = "aws.sts.role.arn"; public static final String AWS_STS_ROLE_EXTERNAL_ID = "aws.sts.role.external.id"; public static final String AWS_STS_ROLE_SESSION_NAME = "aws.sts.role.session.name"; @@ -73,7 +67,6 @@ final public class S3SourceConfig extends AbstractConfig { public static final int AWS_S3_RETRY_BACKOFF_DELAY_MS_DEFAULT = 100; public static final int AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_DEFAULT = 20_000; public static final String SCHEMA_REGISTRY_URL = "schema.registry.url"; - public static final String VALUE_CONVERTER_SCHEMA_REGISTRY_URL = "value.converter.schema.registry.url"; public static final String VALUE_SERIALIZER = "value.serializer"; public static final String AWS_ACCESS_KEY_ID_CONFIG = "aws.access.key.id"; @@ -86,12 +79,11 @@ final public class S3SourceConfig extends AbstractConfig { 
public static final String TARGET_TOPICS = "topics"; public static final String FETCH_PAGE_SIZE = "aws.s3.fetch.page.size"; public static final String MAX_POLL_RECORDS = "max.poll.records"; - public static final String MAX_MESSAGE_BYTES_SIZE = "max.message.bytes"; public static final String KEY_CONVERTER = "key.converter"; public static final String VALUE_CONVERTER = "value.converter"; public static final int S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT = 3; - public static final String OUTPUT_FORMAT_KEY = "output.format"; + public static final String INPUT_FORMAT_KEY = "input.format"; public static final String SCHEMAS_ENABLE = "schemas.enable"; public S3SourceConfig(final Map<String, String> properties) { @@ -151,10 +143,10 @@ private static void addSchemaRegistryGroup(final ConfigDef configDef) { configDef.define(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", GROUP_OTHER, srCounter++, ConfigDef.Width.NONE, VALUE_CONVERTER_SCHEMA_REGISTRY_URL); - configDef.define(OUTPUT_FORMAT_KEY, ConfigDef.Type.STRING, OutputFormat.BYTES.getValue(), + configDef.define(INPUT_FORMAT_KEY, ConfigDef.Type.STRING, InputFormat.BYTES.getValue(), new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "Output format avro/json/parquet/bytes", GROUP_OTHER, srCounter++, // NOPMD - ConfigDef.Width.NONE, OUTPUT_FORMAT_KEY); + ConfigDef.Width.NONE, INPUT_FORMAT_KEY); configDef.define(VALUE_SERIALIZER, ConfigDef.Type.CLASS, null, ConfigDef.Importance.MEDIUM, "Value serializer", GROUP_OTHER, srCounter++, // NOPMD @@ -168,7 +160,7 @@ private static void addOtherConfig(final S3SourceConfigDef configDef) { ConfigDef.Importance.MEDIUM, "Fetch page size", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD // UnusedAssignment ConfigDef.Width.NONE, FETCH_PAGE_SIZE); - configDef.define(MAX_POLL_RECORDS, ConfigDef.Type.INT, 500, ConfigDef.Range.atLeast(1), + configDef.define(MAX_POLL_RECORDS, ConfigDef.Type.INT, 5, ConfigDef.Range.atLeast(1), ConfigDef.Importance.MEDIUM, "Max poll records", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD // UnusedAssignment ConfigDef.Width.NONE, MAX_POLL_RECORDS); @@ -347,8 +339,8 @@ public String getAwsS3BucketName() { return getString(AWS_S3_BUCKET_NAME_CONFIG); } - public OutputFormat getOutputFormat() { - return OutputFormat.valueOf(getString(OUTPUT_FORMAT_KEY).toUpperCase(Locale.ROOT)); + public InputFormat getOutputFormat() { + return InputFormat.valueOf(getString(INPUT_FORMAT_KEY).toUpperCase(Locale.ROOT)); } Region getAwsS3Region() { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroTransformer.java similarity index 89% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroTransformer.java index 798f444c4..6a6f1678e 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroTransformer.java @@ -36,9 +36,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class AvroWriter implements OutputWriter { +public class AvroTransformer implements Transformer { - private static final Logger LOGGER = LoggerFactory.getLogger(AvroWriter.class); + private static final Logger LOGGER = 
LoggerFactory.getLogger(AvroTransformer.class); @Override public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { @@ -54,7 +54,7 @@ public List<Object> getRecords(final InputStream inputStream, final String topic @Override public byte[] getValueBytes(final Object record, final String topic, final S3SourceConfig s3SourceConfig) { - return OutputUtils.serializeAvroRecordToBytes(Collections.singletonList((GenericRecord) record), topic, + return TransformationUtils.serializeAvroRecordToBytes(Collections.singletonList((GenericRecord) record), topic, s3SourceConfig); } @@ -64,10 +64,10 @@ List<Object> readAvroRecords(final InputStream content, final DatumReader<Generi try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { reader.forEach(records::add); } catch (IOException e) { - LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + LOGGER.error("Error in reading s3 object stream {}", e.getMessage(), e); } } catch (IOException e) { - LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + LOGGER.error("Error in reading s3 object stream {}", e.getMessage(), e); } return records; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayTransformer.java similarity index 89% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayTransformer.java index fabe2cb03..dadbfa8e3 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayTransformer.java @@ -29,12 +29,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class ByteArrayWriter implements OutputWriter { - private static final Logger LOGGER = LoggerFactory.getLogger(ByteArrayWriter.class); +public class ByteArrayTransformer implements Transformer { + private static final Logger LOGGER = LoggerFactory.getLogger(ByteArrayTransformer.class); @Override public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { - + // For byte array transformations, no explicit converter is configured. 
} @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") @@ -56,7 +56,7 @@ public List<Object> getRecords(final InputStream inputStream, final String topic bytesRead = inputStream.read(buffer); } } catch (IOException e) { - LOGGER.error("Error reading from input stream: " + e.getMessage(), e); + LOGGER.error("Error reading from input stream: {}", e.getMessage(), e); } return chunks; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/InputFormat.java similarity index 86% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/InputFormat.java index 16bca89a4..87a240182 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputFormat.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/InputFormat.java @@ -18,12 +18,12 @@ import java.util.Locale; -public enum OutputFormat { - AVRO("avro"), PARQUET("parquet"), JSON("json"), BYTES("bytes"); +public enum InputFormat { + AVRO("avro"), PARQUET("parquet"), JSONL("jsonl"), BYTES("bytes"); private final String format; - OutputFormat(final String format) { + InputFormat(final String format) { this.format = format; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonTransformer.java similarity index 87% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonTransformer.java index ad95f88a9..2c4fa286a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonTransformer.java @@ -32,8 +32,12 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -public class JsonWriter implements OutputWriter { +public class JsonTransformer implements Transformer { + + private static final Logger LOGGER = LoggerFactory.getLogger(JsonTransformer.class); final ObjectMapper objectMapper = new ObjectMapper(); @@ -57,14 +61,14 @@ public List<Object> getRecords(final InputStream inputStream, final String topic jsonNode = objectMapper.readTree(line.trim()); // Parse the current line into a JsonNode jsonNodeList.add(jsonNode); // Add parsed JSON object to the list } catch (IOException e) { - LOGGER.error("Error parsing JSON record from S3 input stream: " + e.getMessage()); + LOGGER.error("Error parsing JSON record from S3 input stream: {}", e.getMessage(), e); } } line = reader.readLine(); } } catch (IOException e) { - LOGGER.error("Error reading S3 object stream: " + e.getMessage()); + LOGGER.error("Error reading S3 object stream: {}", e.getMessage()); } return jsonNodeList; } @@ -74,7 +78,7 @@ public byte[] getValueBytes(final Object record, final String topic, final S3Sou try { return objectMapper.writeValueAsBytes(record); } catch (JsonProcessingException e) { - LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + LOGGER.error("Error in reading s3 object stream {}", 
e.getMessage(), e); return new byte[0]; } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetTransformer.java similarity index 88% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetTransformer.java index ca5fec032..83a8c7e44 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetTransformer.java @@ -40,9 +40,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class ParquetWriter implements OutputWriter { +public class ParquetTransformer implements Transformer { - private static final Logger LOGGER = LoggerFactory.getLogger(ParquetWriter.class); + private static final Logger LOGGER = LoggerFactory.getLogger(ParquetTransformer.class); @Override public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { @@ -57,7 +57,7 @@ public List<Object> getRecords(final InputStream inputStream, final String topic @Override public byte[] getValueBytes(final Object record, final String topic, final S3SourceConfig s3SourceConfig) { - return OutputUtils.serializeAvroRecordToBytes(Collections.singletonList((GenericRecord) record), topic, + return TransformationUtils.serializeAvroRecordToBytes(Collections.singletonList((GenericRecord) record), topic, s3SourceConfig); } @@ -69,7 +69,7 @@ private List<Object> getParquetRecords(final InputStream inputStream, final Stri try { parquetFile = File.createTempFile(topic + "_" + topicPartition + "_" + timestamp, ".parquet"); } catch (IOException e) { - LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + LOGGER.error("Error in reading s3 object stream {}", e.getMessage(), e); return records; } @@ -85,7 +85,7 @@ record = parquetReader.read(); } } } catch (IOException | RuntimeException e) { // NOPMD - LOGGER.error("Error in reading s3 object stream " + e.getMessage()); + LOGGER.error("Error in reading s3 object stream {}", e.getMessage(), e); } finally { deleteTmpFile(parquetFile.toPath()); } @@ -97,7 +97,7 @@ static void deleteTmpFile(final Path parquetFile) { try { Files.delete(parquetFile); } catch (IOException e) { - LOGGER.error("Error in deleting tmp file " + e.getMessage()); + LOGGER.error("Error in deleting tmp file {}", e.getMessage(), e); } } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/TransformationUtils.java similarity index 93% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/TransformationUtils.java index a075a5c76..ee2f5726b 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputUtils.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/TransformationUtils.java @@ -33,10 +33,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -final public class OutputUtils { - private static final Logger LOGGER = LoggerFactory.getLogger(OutputUtils.class); +final public class TransformationUtils { + private static final 
Logger LOGGER = LoggerFactory.getLogger(TransformationUtils.class); - private OutputUtils() { + private TransformationUtils() { // hidden } @@ -55,7 +55,7 @@ static byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, return out.toByteArray(); } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | IOException e) { - LOGGER.error("Error in reading s3 object stream for topic " + topic + " with error : " + e.getMessage()); + LOGGER.error("Error in reading s3 object stream for topic {} with error : {}", topic, e.getMessage(), e); } return new byte[0]; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/Transformer.java similarity index 87% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/Transformer.java index 14616f807..d423ad991 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriter.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/Transformer.java @@ -22,12 +22,7 @@ import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public interface OutputWriter { - - Logger LOGGER = LoggerFactory.getLogger(AvroWriter.class); +public interface Transformer { void configureValueConverter(Map<String, String> config, S3SourceConfig s3SourceConfig); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/TransformerFactory.java similarity index 65% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/TransformerFactory.java index c3f7d3b3a..55278c91d 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/OutputWriterFactory.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/TransformerFactory.java @@ -16,29 +16,29 @@ package io.aiven.kafka.connect.s3.source.output; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.OUTPUT_FORMAT_KEY; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.INPUT_FORMAT_KEY; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -public final class OutputWriterFactory { +public final class TransformerFactory { - private OutputWriterFactory() { + private TransformerFactory() { // hidden } - public static OutputWriter getWriter(final S3SourceConfig s3SourceConfig) { - final OutputFormat outputFormatEnum = s3SourceConfig.getOutputFormat(); - switch (outputFormatEnum) { + public static Transformer getWriter(final S3SourceConfig s3SourceConfig) { + final InputFormat inputFormatEnum = s3SourceConfig.getOutputFormat(); + switch (inputFormatEnum) { case AVRO : - return new AvroWriter(); + return new AvroTransformer(); case PARQUET : - return new ParquetWriter(); - case JSON : - return new JsonWriter(); + return new ParquetTransformer(); + case JSONL : + return new JsonTransformer(); case BYTES : - return new ByteArrayWriter(); + return new ByteArrayTransformer(); default : throw new IllegalArgumentException( - "Unknown 
output format " + s3SourceConfig.getString(OUTPUT_FORMAT_KEY)); + "Unknown output format " + s3SourceConfig.getString(INPUT_FORMAT_KEY)); } } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java index 1857dc4be..d3008fc25 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java @@ -28,7 +28,7 @@ public class AivenS3SourceRecord { private final Map<String, Object> partitionMap; private Map<String, Object> offsetMap; - private final String toTopic; + private final String topic; private final Integer topicPartition; private final byte[] recordKey; private final byte[] recordValue; @@ -36,12 +36,12 @@ public class AivenS3SourceRecord { private final String objectKey; public AivenS3SourceRecord(final Map<String, Object> partitionMap, final Map<String, Object> offsetMap, - final String toTopic, final Integer topicPartition, final byte[] recordKey, final byte[] recordValue, + final String topic, final Integer topicPartition, final byte[] recordKey, final byte[] recordValue, final String objectKey) { this.partitionMap = new HashMap<>(partitionMap); this.offsetMap = new HashMap<>(offsetMap); - this.toTopic = toTopic; + this.topic = topic; this.topicPartition = topicPartition; this.recordKey = Arrays.copyOf(recordKey, recordKey.length); this.recordValue = Arrays.copyOf(recordValue, recordValue.length); @@ -56,8 +56,8 @@ public Map<String, Object> getOffsetMap() { return Collections.unmodifiableMap(offsetMap); } - public String getToTopic() { - return toTopic; + public String getTopic() { + return topic; } public Integer partition() { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index e6c186aee..9e3e8ac49 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -20,7 +20,9 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -42,16 +44,22 @@ public class FileReader { private final String bucketName; private final Set<String> failedObjectKeys; + private final Set<String> inProcessObjectKeys; - public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName, - final Set<String> failedObjectKeys) { + public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName, final Set<String> failedObjectKeys, + final Set<String> inProcessObjectKeys) { this.s3SourceConfig = s3SourceConfig; this.bucketName = bucketName; this.failedObjectKeys = new HashSet<>(failedObjectKeys); + this.inProcessObjectKeys = new HashSet<>(inProcessObjectKeys); + } + + public Set<String> getInProcessObjectKeys() { + return Collections.unmodifiableSet(this.inProcessObjectKeys); } @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") - List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOException { + Iterator<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOException { final 
List<S3ObjectSummary> allSummaries = new ArrayList<>(); String continuationToken = null; ListObjectsV2Result objectListing; @@ -70,17 +78,29 @@ List<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOExc .stream() .filter(objectSummary -> objectSummary.getSize() > 0) .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey())) + .filter(objectSummary -> !inProcessObjectKeys.contains(objectSummary.getKey())) .collect(Collectors.toList()); allSummaries.addAll(filteredSummaries); // Add the filtered summaries to the main list + // TO BE REMOVED before release allSummaries.forEach(objSummary -> LOGGER.info("Objects to be processed {} ", objSummary.getKey())); + // Objects being processed + allSummaries.forEach(objSummary -> this.inProcessObjectKeys.add(objSummary.getKey())); + // Check if there are more objects to fetch continuationToken = objectListing.getNextContinuationToken(); } while (objectListing.isTruncated()); // Continue fetching if the result is truncated - return allSummaries; + return allSummaries.iterator(); } + public void addFailedObjectKeys(final String objectKey) { + this.failedObjectKeys.add(objectKey); + } + + public void removeProcessedObjectKeys(final String objectKey) { + this.inProcessObjectKeys.remove(objectKey); + } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index 7716b5d85..2fc195f03 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -85,6 +85,20 @@ public String getObjectMapKey(final String currentObjectKey) { return OBJECT_KEY + SEPARATOR + currentObjectKey; } + public boolean shouldSkipRecord(final Map<String, Object> partitionMap, final String currentObjectKey, + final long numOfProcessedRecs) { + if (offsets.containsKey(partitionMap)) { + final Map<String, Object> offsetVal = offsets.get(partitionMap); + final String objectMapKey = getObjectMapKey(currentObjectKey); + + if (offsetVal.containsKey(objectMapKey)) { + final long offsetValue = (long) offsetVal.get(objectMapKey); + return numOfProcessedRecs <= offsetValue; + } + } + return false; + } + public void createNewOffsetMap(final Map<String, Object> partitionMap, final String objectKey, final long offsetId) { final Map<String, Object> offsetMap = getOffsetValueMap(objectKey, offsetId); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index f523b9e86..2a20e9ff2 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -21,7 +21,6 @@ import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.kafka.connect.data.SchemaAndValue; @@ -30,7 +29,7 @@ import org.apache.kafka.connect.storage.Converter; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.output.OutputWriter; +import io.aiven.kafka.connect.s3.source.output.Transformer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,7 +45,7 @@ private 
RecordProcessor() { public static List<SourceRecord> processRecords(final Iterator<AivenS3SourceRecord> sourceRecordIterator, final List<SourceRecord> results, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, - final AtomicBoolean connectorStopped, final OutputWriter outputWriter, final Set<String> failedObjectKeys, + final AtomicBoolean connectorStopped, final Transformer transformer, final FileReader fileReader, final OffsetManager offsetManager) { final Map<String, String> conversionConfig = new HashMap<>(); @@ -56,7 +55,7 @@ public static List<SourceRecord> processRecords(final Iterator<AivenS3SourceReco final AivenS3SourceRecord aivenS3SourceRecord = sourceRecordIterator.next(); if (aivenS3SourceRecord != null) { final SourceRecord sourceRecord = createSourceRecord(aivenS3SourceRecord, s3SourceConfig, keyConverter, - valueConverter, conversionConfig, outputWriter, failedObjectKeys, offsetManager); + valueConverter, conversionConfig, transformer, fileReader, offsetManager); results.add(sourceRecord); } } @@ -67,24 +66,25 @@ public static List<SourceRecord> processRecords(final Iterator<AivenS3SourceReco static SourceRecord createSourceRecord(final AivenS3SourceRecord aivenS3SourceRecord, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, - final Map<String, String> conversionConfig, final OutputWriter outputWriter, - final Set<String> failedObjectKeys, final OffsetManager offsetManager) { + final Map<String, String> conversionConfig, final Transformer transformer, final FileReader fileReader, + final OffsetManager offsetManager) { - final String topic = aivenS3SourceRecord.getToTopic(); + final String topic = aivenS3SourceRecord.getTopic(); final Optional<SchemaAndValue> keyData = keyConverter .map(c -> c.toConnectData(topic, aivenS3SourceRecord.key())); - outputWriter.configureValueConverter(conversionConfig, s3SourceConfig); + transformer.configureValueConverter(conversionConfig, s3SourceConfig); valueConverter.configure(conversionConfig, false); try { final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); offsetManager.updateCurrentOffsets(aivenS3SourceRecord.getPartitionMap(), aivenS3SourceRecord.getOffsetMap()); + fileReader.removeProcessedObjectKeys(aivenS3SourceRecord.getObjectKey()); aivenS3SourceRecord.setOffsetMap(offsetManager.getOffsets().get(aivenS3SourceRecord.getPartitionMap())); return aivenS3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue); } catch (DataException e) { - LOGGER.error("Error in reading s3 object stream " + e.getMessage()); - failedObjectKeys.add(aivenS3SourceRecord.getObjectKey()); + LOGGER.error("Error in reading s3 object stream {}", e.getMessage(), e); + fileReader.addFailedObjectKeys(aivenS3SourceRecord.getObjectKey()); throw e; } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index d5b7bd236..2fee6f5ff 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -33,7 +33,7 @@ import org.apache.kafka.clients.consumer.ConsumerRecord; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import 
io.aiven.kafka.connect.s3.source.output.OutputWriter; +import io.aiven.kafka.connect.s3.source.output.Transformer; import com.amazonaws.AmazonClientException; import com.amazonaws.services.s3.AmazonS3; @@ -51,7 +51,6 @@ public final class SourceRecordIterator implements Iterator<AivenS3SourceRecord> private static final Logger LOGGER = LoggerFactory.getLogger(SourceRecordIterator.class); public static final String PATTERN_TOPIC_KEY = "topicName"; public static final String PATTERN_PARTITION_KEY = "partitionId"; - public static final String OFFSET_KEY = "offset"; public static final Pattern FILE_DEFAULT_PATTERN = Pattern.compile("(?<topicName>[^/]+?)-" + "(?<partitionId>\\d{5})-" + "(?<uniqueId>[a-zA-Z0-9]+)" + "\\.(?<fileExtension>[^.]+)$"); // topic-00001.txt @@ -66,21 +65,20 @@ public final class SourceRecordIterator implements Iterator<AivenS3SourceRecord> private final String bucketName; private final AmazonS3 s3Client; - private final OutputWriter outputWriter; + private final Transformer transformer; private final FileReader fileReader; // NOPMD public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, - final OffsetManager offsetManager, final OutputWriter outputWriter, final FileReader fileReader) { + final OffsetManager offsetManager, final Transformer transformer, final FileReader fileReader) { this.s3SourceConfig = s3SourceConfig; this.offsetManager = offsetManager; this.s3Client = s3Client; this.bucketName = bucketName; - this.outputWriter = outputWriter; + this.transformer = transformer; this.fileReader = fileReader; try { - final List<S3ObjectSummary> chunks = fileReader.fetchObjectSummaries(s3Client); - nextFileIterator = chunks.iterator(); + nextFileIterator = fileReader.fetchObjectSummaries(s3Client); } catch (IOException e) { throw new AmazonClientException("Failed to initialize S3 file reader", e); } @@ -113,7 +111,7 @@ private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() topicName = fileMatcher.group(PATTERN_TOPIC_KEY); defaultPartitionId = Integer.parseInt(fileMatcher.group(PATTERN_PARTITION_KEY)); } else { - LOGGER.error("File naming doesn't match to any topic. " + currentObjectKey); + LOGGER.error("File naming doesn't match to any topic. 
{}", currentObjectKey); inputStream.abort(); s3Object.close(); return Collections.emptyIterator(); @@ -125,14 +123,14 @@ private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(topicName, defaultPartitionId, bucketName); - return getObjectIterator(inputStream, finalTopic, defaultPartitionId, defaultStartOffsetId, outputWriter, + return getObjectIterator(inputStream, finalTopic, defaultPartitionId, defaultStartOffsetId, transformer, partitionMap); } } @SuppressWarnings("PMD.CognitiveComplexity") private Iterator<ConsumerRecord<byte[], byte[]>> getObjectIterator(final InputStream valueInputStream, - final String topic, final int topicPartition, final long startOffset, final OutputWriter outputWriter, + final String topic, final int topicPartition, final long startOffset, final Transformer transformer, final Map<String, Object> partitionMap) { return new Iterator<>() { private final Iterator<ConsumerRecord<byte[], byte[]>> internalIterator = readNext().iterator(); @@ -145,21 +143,15 @@ private List<ConsumerRecord<byte[], byte[]>> readNext() { int numOfProcessedRecs = 1; boolean checkOffsetMap = true; - for (final Object record : outputWriter.getRecords(valueInputStream, topic, topicPartition, + for (final Object record : transformer.getRecords(valueInputStream, topic, topicPartition, s3SourceConfig)) { - - if (offsetManager.getOffsets().containsKey(partitionMap) && checkOffsetMap) { - final Map<String, Object> offsetVal = offsetManager.getOffsets().get(partitionMap); - if (offsetVal.containsKey(offsetManager.getObjectMapKey(currentObjectKey))) { - final long offsetValue = (long) offsetVal - .get(offsetManager.getObjectMapKey(currentObjectKey)); - if (numOfProcessedRecs <= offsetValue) { - numOfProcessedRecs++; - continue; - } - } + if (offsetManager.shouldSkipRecord(partitionMap, currentObjectKey, numOfProcessedRecs) + && checkOffsetMap) { + numOfProcessedRecs++; + continue; } - final byte[] valueBytes = outputWriter.getValueBytes(record, topic, s3SourceConfig); + + final byte[] valueBytes = transformer.getValueBytes(record, topic, s3SourceConfig); checkOffsetMap = false; consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, topic, topicPartition, offsetManager, startOffset, partitionMap)); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index d0a626a76..7701fcf76 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -36,9 +36,9 @@ import org.apache.kafka.connect.storage.OffsetStorageReader; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.output.ByteArrayWriter; -import io.aiven.kafka.connect.s3.source.output.OutputFormat; -import io.aiven.kafka.connect.s3.source.output.OutputWriter; +import io.aiven.kafka.connect.s3.source.output.ByteArrayTransformer; +import io.aiven.kafka.connect.s3.source.output.InputFormat; +import io.aiven.kafka.connect.s3.source.output.Transformer; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord; import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; @@ -54,8 +54,11 @@ import org.junit.jupiter.api.BeforeAll; import 
org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +@ExtendWith(MockitoExtension.class) final class S3SourceTaskTest { private static final Random RANDOM = new Random(); @@ -133,8 +136,8 @@ void testS3SourceTaskInitialization() { final Converter valueConverter = s3SourceTask.getValueConverter(); assertThat(valueConverter).isInstanceOf(ByteArrayConverter.class); - final OutputWriter outputWriter = s3SourceTask.getOutputWriter(); - assertThat(outputWriter).isInstanceOf(ByteArrayWriter.class); + final Transformer transformer = s3SourceTask.getOutputWriter(); + assertThat(transformer).isInstanceOf(ByteArrayTransformer.class); final boolean taskInitialized = s3SourceTask.isTaskInitialized(); assertThat(taskInitialized).isTrue(); @@ -170,11 +173,6 @@ void testStop() { } private static AivenS3SourceRecord getAivenS3SourceRecord() { - // final List<AivenS3SourceRecord> aivenS3SourceRecordList = new ArrayList<>(); - // aivenS3SourceRecordList.add(aivenS3SourceRecord1); - // final AivenS3SourceRecord aivenS3SourceRecord2 = new AivenS3SourceRecord(new HashMap<>(), new HashMap<>(), - // "testtopic", 1, new byte[0], new byte[0], ""); - // aivenS3SourceRecordList.add(aivenS3SourceRecord2); return new AivenS3SourceRecord(new HashMap<>(), new HashMap<>(), "testtopic", 0, new byte[0], new byte[0], ""); } @@ -196,7 +194,7 @@ private void startSourceTask(final S3SourceTask s3SourceTask) { } private void setBasicProperties() { - properties.put(S3SourceConfig.OUTPUT_FORMAT_KEY, OutputFormat.BYTES.getValue()); + properties.put(S3SourceConfig.INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); properties.put("name", "test_source_connector"); properties.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); properties.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java index e27604ce9..b38bc0b47 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java @@ -23,7 +23,7 @@ import java.util.HashMap; -import io.aiven.kafka.connect.s3.source.output.OutputFormat; +import io.aiven.kafka.connect.s3.source.output.InputFormat; import com.amazonaws.regions.RegionUtils; import com.amazonaws.regions.Regions; @@ -43,7 +43,7 @@ void correctFullConfig() { props.put(S3SourceConfig.AWS_S3_REGION_CONFIG, Regions.US_EAST_1.getName()); // record, topic specific props - props.put(S3SourceConfig.OUTPUT_FORMAT_KEY, OutputFormat.AVRO.getValue()); + props.put(S3SourceConfig.INPUT_FORMAT_KEY, InputFormat.AVRO.getValue()); props.put(TARGET_TOPIC_PARTITIONS, "0,1"); props.put(TARGET_TOPICS, "testtopic"); props.put(SCHEMA_REGISTRY_URL, "localhost:8081"); @@ -57,7 +57,7 @@ void correctFullConfig() { assertThat(conf.getAwsS3EndPoint()).isEqualTo("AWS_S3_ENDPOINT"); assertThat(conf.getAwsS3Region()).isEqualTo(RegionUtils.getRegion("us-east-1")); - assertThat(conf.getOutputFormat()).isEqualTo(OutputFormat.AVRO); + assertThat(conf.getOutputFormat()).isEqualTo(InputFormat.AVRO); assertThat(conf.getTargetTopics()).isEqualTo("testtopic"); 
assertThat(conf.getTargetTopicPartitions()).isEqualTo("0,1"); assertThat(conf.getSchemaRegistryUrl()).isEqualTo("localhost:8081"); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroTransformerTest.java similarity index 88% rename from s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java rename to s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroTransformerTest.java index c902ffeea..c02b967a3 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroWriterTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroTransformerTest.java @@ -41,21 +41,22 @@ import org.apache.avro.io.DatumWriter; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; -import org.mockito.MockitoAnnotations; +import org.mockito.junit.jupiter.MockitoExtension; -final class AvroWriterTest { +@ExtendWith(MockitoExtension.class) +final class AvroTransformerTest { @Mock private S3SourceConfig s3SourceConfig; - private AvroWriter avroWriter; + private AvroTransformer avroTransformer; private Map<String, String> config; @BeforeEach void setUp() { - MockitoAnnotations.openMocks(this); - avroWriter = new AvroWriter(); + avroTransformer = new AvroTransformer(); config = new HashMap<>(); } @@ -63,7 +64,7 @@ void setUp() { void testConfigureValueConverter() { final String value = "http://localhost:8081"; when(s3SourceConfig.getString(SCHEMA_REGISTRY_URL)).thenReturn(value); - avroWriter.configureValueConverter(config, s3SourceConfig); + avroTransformer.configureValueConverter(config, s3SourceConfig); assertThat(config.get(SCHEMA_REGISTRY_URL)).isEqualTo("http://localhost:8081") .describedAs("The schema registry URL should be correctly set in the config."); } @@ -73,7 +74,7 @@ void testReadAvroRecordsInvalidData() { final InputStream inputStream = new ByteArrayInputStream("mock-avro-data".getBytes(StandardCharsets.UTF_8)); final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); - final List<Object> records = avroWriter.readAvroRecords(inputStream, datumReader); + final List<Object> records = avroTransformer.readAvroRecords(inputStream, datumReader); assertThat(records.size()).isEqualTo(0); } @@ -84,7 +85,7 @@ void testReadAvroRecords() throws Exception { final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); - final List<Object> records = avroWriter.readAvroRecords(inputStream, datumReader); + final List<Object> records = avroTransformer.readAvroRecords(inputStream, datumReader); assertThat(records.size()).isEqualTo(2); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ByteArrayTransformerTest.java similarity index 77% rename from s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriterTest.java rename to s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ByteArrayTransformerTest.java index 1169cef01..6d7093fc0 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ByteArrayWriterTest.java +++ 
b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ByteArrayTransformerTest.java @@ -30,20 +30,21 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; -import org.mockito.MockitoAnnotations; +import org.mockito.junit.jupiter.MockitoExtension; -class ByteArrayWriterTest { +@ExtendWith(MockitoExtension.class) +final class ByteArrayTransformerTest { - private ByteArrayWriter byteArrayWriter; + private ByteArrayTransformer byteArrayTransformer; @Mock private S3SourceConfig s3SourceConfig; @BeforeEach void setUp() { - MockitoAnnotations.openMocks(this); - byteArrayWriter = new ByteArrayWriter(); + byteArrayTransformer = new ByteArrayTransformer(); } @Test @@ -53,7 +54,7 @@ void testGetRecordsSingleChunk() { when(s3SourceConfig.getInt(MAX_MESSAGE_BYTES_SIZE)).thenReturn(10_000); // Larger than data size - final List<Object> records = byteArrayWriter.getRecords(inputStream, "test-topic", 0, s3SourceConfig); + final List<Object> records = byteArrayTransformer.getRecords(inputStream, "test-topic", 0, s3SourceConfig); assertEquals(1, records.size()); assertArrayEquals(data, (byte[]) records.get(0)); @@ -66,7 +67,7 @@ void testGetRecordsMultipleChunks() { when(s3SourceConfig.getInt(MAX_MESSAGE_BYTES_SIZE)).thenReturn(5); // Smaller than data size - final List<Object> records = byteArrayWriter.getRecords(inputStream, "test-topic", 0, s3SourceConfig); + final List<Object> records = byteArrayTransformer.getRecords(inputStream, "test-topic", 0, s3SourceConfig); assertEquals(2, records.size()); assertArrayEquals(new byte[] { 1, 2, 3, 4, 5 }, (byte[]) records.get(0)); @@ -79,7 +80,7 @@ void testGetRecordsEmptyInputStream() throws IOException { when(s3SourceConfig.getInt(MAX_MESSAGE_BYTES_SIZE)).thenReturn(5); - final List<Object> records = byteArrayWriter.getRecords(inputStream, "test-topic", 0, s3SourceConfig); + final List<Object> records = byteArrayTransformer.getRecords(inputStream, "test-topic", 0, s3SourceConfig); assertEquals(0, records.size()); } @@ -87,7 +88,7 @@ void testGetRecordsEmptyInputStream() throws IOException { @Test void testGetValueBytes() { final byte[] record = { 1, 2, 3 }; - final byte[] result = byteArrayWriter.getValueBytes(record, "test-topic", s3SourceConfig); + final byte[] result = byteArrayTransformer.getValueBytes(record, "test-topic", s3SourceConfig); assertArrayEquals(record, result); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonTransformerTest.java similarity index 69% rename from s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonWriterTest.java rename to s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonTransformerTest.java index 2c4bbc52f..58b3e420d 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonWriterTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonTransformerTest.java @@ -30,34 +30,32 @@ import java.util.Map; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.mockito.Mock; -import 
org.mockito.MockitoAnnotations; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.junit.jupiter.MockitoExtension; -final class JsonWriterTest { +@ExtendWith(MockitoExtension.class) +final class JsonTransformerTest { - JsonWriter jsonWriter; + JsonTransformer jsonTransformer; - @Mock - OffsetManager offsetManager; + S3SourceConfig s3SourceConfig; @BeforeEach void setUp() { - MockitoAnnotations.openMocks(this); - jsonWriter = new JsonWriter(); + jsonTransformer = new JsonTransformer(); + s3SourceConfig = mock(S3SourceConfig.class); } @Test void testConfigureValueConverter() { final Map<String, String> config = new HashMap<>(); - final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); - jsonWriter.configureValueConverter(config, s3SourceConfig); + jsonTransformer.configureValueConverter(config, s3SourceConfig); assertEquals("false", config.get(SCHEMAS_ENABLE), "SCHEMAS_ENABLE should be set to false"); } @@ -65,8 +63,7 @@ void testConfigureValueConverter() { void testHandleValueDataWithValidJson() { final InputStream validJsonInputStream = new ByteArrayInputStream( "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); - final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); - final List<Object> jsonNodes = jsonWriter.getRecords(validJsonInputStream, "testtopic", 1, s3SourceConfig); + final List<Object> jsonNodes = jsonTransformer.getRecords(validJsonInputStream, "testtopic", 1, s3SourceConfig); assertThat(jsonNodes.size()).isEqualTo(1); } @@ -75,9 +72,9 @@ void testHandleValueDataWithValidJson() { void testHandleValueDataWithInvalidJson() { final InputStream invalidJsonInputStream = new ByteArrayInputStream( "invalid-json".getBytes(StandardCharsets.UTF_8)); - final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); - final List<Object> jsonNodes = jsonWriter.getRecords(invalidJsonInputStream, "testtopic", 1, s3SourceConfig); + final List<Object> jsonNodes = jsonTransformer.getRecords(invalidJsonInputStream, "testtopic", 1, + s3SourceConfig); assertThat(jsonNodes.size()).isEqualTo(0); } @@ -86,13 +83,10 @@ void testHandleValueDataWithInvalidJson() { void testSerializeJsonDataValid() throws IOException { final InputStream validJsonInputStream = new ByteArrayInputStream( "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); - final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); - final List<Object> jsonNodes = jsonWriter.getRecords(validJsonInputStream, "testtopic", 1, s3SourceConfig); - - final byte[] serializedData = jsonWriter.getValueBytes(jsonNodes.get(0), "testtopic", s3SourceConfig); + final List<Object> jsonNodes = jsonTransformer.getRecords(validJsonInputStream, "testtopic", 1, s3SourceConfig); + final byte[] serializedData = jsonTransformer.getValueBytes(jsonNodes.get(0), "testtopic", s3SourceConfig); final ObjectMapper objectMapper = new ObjectMapper(); - final JsonNode expectedData = objectMapper.readTree(serializedData); assertThat(expectedData.get("key").asText()).isEqualTo("value"); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetTransformerTest.java similarity index 91% rename from s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java rename to s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetTransformerTest.java index a3830a585..1006b1247 100644 --- 
a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetWriterTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetTransformerTest.java @@ -34,13 +34,16 @@ import org.apache.avro.generic.GenericRecord; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.junit.jupiter.MockitoExtension; -final class ParquetWriterTest { - private ParquetWriter parquetWriter; +@ExtendWith(MockitoExtension.class) +final class ParquetTransformerTest { + private ParquetTransformer parquetWriter; @BeforeEach public void setUp() { - parquetWriter = new ParquetWriter(); + parquetWriter = new ParquetTransformer(); } @Test @@ -91,7 +94,7 @@ void testTemporaryFileDeletion() throws Exception { final Path tempFile = Files.createTempFile("test-file", ".parquet"); assertThat(Files.exists(tempFile)).isTrue(); - ParquetWriter.deleteTmpFile(tempFile); + ParquetTransformer.deleteTmpFile(tempFile); assertThat(Files.exists(tempFile)).isFalse(); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java index 65b914822..212088560 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java @@ -76,8 +76,8 @@ public final void removeBucket() { err.getMessage())); } } catch (final AmazonClientException e) { - LOGGER.error( - "Couldn't delete objects: " + Arrays.stream(chunk).reduce(" ", String::concat) + e.getMessage()); + LOGGER.error("Couldn't delete objects: {}", + Arrays.stream(chunk).reduce(" ", String::concat) + e.getMessage()); } s3Client.deleteBucket(bucketName); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java index 91a33b723..f038468a1 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java @@ -19,6 +19,8 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -26,12 +28,14 @@ import java.io.IOException; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import io.aiven.kafka.connect.s3.source.AivenKafkaConnectS3SourceConnector; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.output.OutputFormat; +import io.aiven.kafka.connect.s3.source.output.InputFormat; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.ListObjectsV2Request; @@ -60,7 +64,7 @@ public void setUp() { setBasicProperties(); final S3SourceConfig s3SourceConfig = new 
S3SourceConfig(properties); offsetManager = mock(OffsetManager.class); - fileReader = new FileReader(s3SourceConfig, TEST_BUCKET, Collections.emptySet()); + fileReader = new FileReader(s3SourceConfig, TEST_BUCKET, Collections.emptySet(), new HashSet<>()); s3Client = mock(AmazonS3.class); } @@ -70,43 +74,43 @@ void testFetchObjectSummariesWithNoObjects() throws IOException { when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); when(offsetManager.getOffsets()).thenReturn(new HashMap<>()); - final List<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); - assertThat(summaries.size()).isEqualTo(0); + final Iterator<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); + assertFalse(summaries.hasNext()); } @Test void testFetchObjectSummariesWithOneNonZeroByteObject() throws IOException { - final S3ObjectSummary objectSummary = createObjectSummary(1); + final S3ObjectSummary objectSummary = createObjectSummary(1, "key1"); final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result( Collections.singletonList(objectSummary), null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); when(offsetManager.getOffsets()).thenReturn(new HashMap<>()); - final List<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); + final Iterator<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); - assertThat(summaries.size()).isEqualTo(1); - assertThat(summaries.get(0).getSize()).isEqualTo(1); + assertTrue(summaries.hasNext()); + assertThat(summaries.next().getSize()).isEqualTo(1); } @Test void testFetchObjectSummariesWithZeroByteObject() throws IOException { - final S3ObjectSummary zeroByteObject = createObjectSummary(0); - final S3ObjectSummary nonZeroByteObject = createObjectSummary(1); + final S3ObjectSummary zeroByteObject = createObjectSummary(0, "key1"); + final S3ObjectSummary nonZeroByteObject = createObjectSummary(1, "key2"); final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result( List.of(zeroByteObject, nonZeroByteObject), null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); when(offsetManager.getOffsets()).thenReturn(new HashMap<>()); - final List<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); + final Iterator<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); - assertThat(summaries.size()).isEqualTo(1); - assertThat(summaries.get(0).getSize()).isEqualTo(1); + assertTrue(summaries.hasNext()); + assertThat(summaries.next().getSize()).isEqualTo(1); } @Test void testFetchObjectSummariesWithPagination() throws IOException { - final S3ObjectSummary object1 = createObjectSummary(1); - final S3ObjectSummary object2 = createObjectSummary(2); + final S3ObjectSummary object1 = createObjectSummary(1, "key1"); + final S3ObjectSummary object2 = createObjectSummary(2, "key2"); final List<S3ObjectSummary> firstBatch = List.of(object1); final List<S3ObjectSummary> secondBatch = List.of(object2); @@ -116,9 +120,10 @@ void testFetchObjectSummariesWithPagination() throws IOException { when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstResult).thenReturn(secondResult); when(offsetManager.getOffsets()).thenReturn(new HashMap<>()); - final List<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); + final Iterator<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); - 
assertThat(summaries.size()).isEqualTo(2); + assertThat(summaries.next()).isNotNull(); + assertThat(summaries.next()).isNotNull(); } private ListObjectsV2Result createListObjectsV2Result(final List<S3ObjectSummary> summaries, @@ -130,14 +135,15 @@ private ListObjectsV2Result createListObjectsV2Result(final List<S3ObjectSummary return result; } - private S3ObjectSummary createObjectSummary(final long sizeOfObject) { + private S3ObjectSummary createObjectSummary(final long sizeOfObject, final String objectKey) { final S3ObjectSummary summary = mock(S3ObjectSummary.class); when(summary.getSize()).thenReturn(sizeOfObject); + when(summary.getKey()).thenReturn(objectKey); return summary; } private void setBasicProperties() { - properties.put(S3SourceConfig.OUTPUT_FORMAT_KEY, OutputFormat.BYTES.getValue()); + properties.put(S3SourceConfig.INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); properties.put("name", "test_source_connector"); properties.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); properties.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java index 99ded7905..52d0bfd0e 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java @@ -19,7 +19,6 @@ import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; -import static io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator.OFFSET_KEY; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; @@ -69,7 +68,7 @@ void testWithOffsets() { partitionKey.put("bucket", TEST_BUCKET); final Map<String, Object> offsetValue = new HashMap<>(); - offsetValue.put(OFFSET_KEY, 5L); + offsetValue.put("object_key_file", 5L); final Map<Map<String, Object>, Map<String, Object>> offsets = new HashMap<>(); offsets.put(partitionKey, offsetValue); @@ -79,7 +78,7 @@ void testWithOffsets() { final Map<Map<String, Object>, Map<String, Object>> retrievedOffsets = offsetManager.getOffsets(); assertThat(retrievedOffsets.size()).isEqualTo(1); - assertThat(retrievedOffsets.values().iterator().next().get(OFFSET_KEY)).isEqualTo(5L); + assertThat(retrievedOffsets.values().iterator().next().get("object_key_file")).isEqualTo(5L); } @Test diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java index be2e273be..8f983d57f 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -29,7 +29,6 @@ import java.net.ConnectException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; -import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -41,13 +40,15 @@ import org.apache.kafka.connect.storage.Converter; import 
io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.output.OutputWriter; +import io.aiven.kafka.connect.s3.source.output.Transformer; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; -import org.mockito.MockitoAnnotations; +import org.mockito.junit.jupiter.MockitoExtension; +@ExtendWith(MockitoExtension.class) class RecordProcessorTest { @Mock @@ -55,18 +56,20 @@ class RecordProcessorTest { @Mock private Converter valueConverter; @Mock - private OutputWriter outputWriter; + private Transformer transformer; @Mock private Converter keyConverter; @Mock private OffsetManager offsetManager; + @Mock + private FileReader fileReader; + private AtomicBoolean connectorStopped; private Iterator<AivenS3SourceRecord> sourceRecordIterator; @BeforeEach void setUp() { - MockitoAnnotations.openMocks(this); connectorStopped = new AtomicBoolean(false); sourceRecordIterator = mock(Iterator.class); } @@ -84,7 +87,7 @@ void testProcessRecordsNoRecords() { Optional.of(keyConverter), valueConverter, connectorStopped, - outputWriter, Collections.emptySet(), offsetManager + transformer, fileReader, offsetManager ); assertTrue(processedRecords.isEmpty(), "Processed records should be empty when there are no records."); @@ -106,7 +109,7 @@ void testProcessRecordsWithRecords() throws ConnectException { Optional.of(keyConverter), valueConverter, connectorStopped, - outputWriter, Collections.emptySet(), offsetManager + transformer, fileReader, offsetManager ); assertThat(results.size()).isEqualTo(1); @@ -126,7 +129,7 @@ void testProcessRecordsConnectorStopped() { Optional.of(keyConverter), valueConverter, connectorStopped, - outputWriter, Collections.emptySet(), offsetManager + transformer, fileReader, offsetManager ); assertTrue(processedRecords.isEmpty(), "Processed records should be empty when connector is stopped."); @@ -136,7 +139,7 @@ void testProcessRecordsConnectorStopped() { @Test void testCreateSourceRecords() { final AivenS3SourceRecord mockRecord = mock(AivenS3SourceRecord.class); - when(mockRecord.getToTopic()).thenReturn("test-topic"); + when(mockRecord.getTopic()).thenReturn("test-topic"); when(mockRecord.key()).thenReturn("mock-key".getBytes(StandardCharsets.UTF_8)); when(mockRecord.value()).thenReturn("mock-value".getBytes(StandardCharsets.UTF_8)); @@ -145,8 +148,7 @@ void testCreateSourceRecords() { when(mockRecord.getSourceRecord(anyString(), any(), any())).thenReturn(mock(SourceRecord.class)); final SourceRecord sourceRecords = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, - Optional.of(keyConverter), valueConverter, new HashMap<>(), outputWriter, Collections.emptySet(), - offsetManager); + Optional.of(keyConverter), valueConverter, new HashMap<>(), transformer, fileReader, offsetManager); assertThat(sourceRecords).isNotNull(); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index da77c8b1a..4e23ec12c 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -32,7 +32,7 @@ import java.util.List; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import 
io.aiven.kafka.connect.s3.source.output.OutputWriter; +import io.aiven.kafka.connect.s3.source.output.Transformer; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.ListObjectsV2Result; @@ -47,7 +47,7 @@ final class SourceRecordIteratorTest { private AmazonS3 mockS3Client; private S3SourceConfig mockConfig; private OffsetManager mockOffsetManager; - private OutputWriter mockOutputWriter; + private Transformer mockTransformer; private FileReader mockFileReader; @@ -56,7 +56,7 @@ public void setUp() { mockS3Client = mock(AmazonS3.class); mockConfig = mock(S3SourceConfig.class); mockOffsetManager = mock(OffsetManager.class); - mockOutputWriter = mock(OutputWriter.class); + mockTransformer = mock(Transformer.class); mockFileReader = mock(FileReader.class); } @@ -67,6 +67,7 @@ void testIteratorProcessesS3Objects() throws Exception { // Mock list of S3 object summaries final List<S3ObjectSummary> mockObjectSummaries = Collections.singletonList(mockSummary); + final ListObjectsV2Result result = mockListObjectsResult(mockObjectSummaries); when(mockS3Client.listObjectsV2(anyString())).thenReturn(result); @@ -77,26 +78,26 @@ void testIteratorProcessesS3Objects() throws Exception { when(mockS3Client.getObject(anyString(), anyString())).thenReturn(mockS3Object); when(mockS3Object.getObjectContent()).thenReturn(mockInputStream); - when(mockOutputWriter.getRecords(any(), anyString(), anyInt(), any())) + when(mockTransformer.getRecords(any(), anyString(), anyInt(), any())) .thenReturn(Collections.singletonList(new Object())); final String outStr = "this is a test"; - when(mockOutputWriter.getValueBytes(any(), anyString(), any())) + when(mockTransformer.getValueBytes(any(), anyString(), any())) .thenReturn(outStr.getBytes(StandardCharsets.UTF_8)); when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); - when(mockFileReader.fetchObjectSummaries(any())).thenReturn(Collections.emptyList()); + when(mockFileReader.fetchObjectSummaries(any())).thenReturn(Collections.emptyIterator()); SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockS3Client, "test-bucket", - mockOffsetManager, mockOutputWriter, mockFileReader); + mockOffsetManager, mockTransformer, mockFileReader); assertFalse(iterator.hasNext()); assertNull(iterator.next()); - when(mockFileReader.fetchObjectSummaries(any())).thenReturn(mockObjectSummaries); + when(mockFileReader.fetchObjectSummaries(any())).thenReturn(mockObjectSummaries.listIterator()); iterator = new SourceRecordIterator(mockConfig, mockS3Client, "test-bucket", mockOffsetManager, - mockOutputWriter, mockFileReader); + mockTransformer, mockFileReader); assertTrue(iterator.hasNext()); assertNotNull(iterator.next()); From 7ad5af7794318c399b18fca3ae4bf065ed71fb5f Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Thu, 17 Oct 2024 08:28:10 +0200 Subject: [PATCH 50/90] Changed output dir to input, added new test for max message bytes --- .../connect/s3/source/IntegrationTest.java | 28 ++++++++++++++++++- .../kafka/connect/s3/source/S3SourceTask.java | 4 +-- .../s3/source/config/S3SourceConfig.java | 2 +- .../{output => input}/AvroTransformer.java | 2 +- .../ByteArrayTransformer.java | 4 +-- .../source/{output => input}/InputFormat.java | 2 +- .../{output => input}/JsonTransformer.java | 2 +- .../{output => input}/ParquetTransformer.java | 2 +- .../TransformationUtils.java | 2 +- .../source/{output => input}/Transformer.java | 2 +- .../{output => input}/TransformerFactory.java | 2 +- 
.../s3/source/utils/RecordProcessor.java | 2 +- .../s3/source/utils/SourceRecordIterator.java | 2 +- .../connect/s3/source/S3SourceTaskTest.java | 6 ++-- .../s3/source/config/S3SourceConfigTest.java | 2 +- .../AvroTransformerTest.java | 2 +- .../ByteArrayTransformerTest.java | 2 +- .../JsonTransformerTest.java | 2 +- .../ParquetTransformerTest.java | 2 +- .../s3/source/utils/FileReaderTest.java | 2 +- .../s3/source/utils/RecordProcessorTest.java | 2 +- .../utils/SourceRecordIteratorTest.java | 2 +- 22 files changed, 52 insertions(+), 26 deletions(-) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{output => input}/AvroTransformer.java (98%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{output => input}/ByteArrayTransformer.java (93%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{output => input}/InputFormat.java (95%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{output => input}/JsonTransformer.java (98%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{output => input}/ParquetTransformer.java (98%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{output => input}/TransformationUtils.java (98%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{output => input}/Transformer.java (95%) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/{output => input}/TransformerFactory.java (96%) rename s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/{output => input}/AvroTransformerTest.java (98%) rename s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/{output => input}/ByteArrayTransformerTest.java (98%) rename s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/{output => input}/JsonTransformerTest.java (98%) rename s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/{output => input}/ParquetTransformerTest.java (98%) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index dbc3bfba2..650549734 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -22,6 +22,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.INPUT_FORMAT_KEY; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_MESSAGE_BYTES_SIZE; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; @@ -45,7 +46,7 @@ import org.apache.kafka.clients.admin.AdminClient; -import io.aiven.kafka.connect.s3.source.output.InputFormat; +import io.aiven.kafka.connect.s3.source.input.InputFormat; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; import io.aiven.kafka.connect.s3.source.testutils.S3OutputStream; @@ -172,6 +173,31 @@ void 
bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx assertThat(records).contains(testData1).contains(testData2); } + @Test + void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) + throws ExecutionException, InterruptedException, IOException { + final var topicName = IntegrationBase.topicName(testInfo); + final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); + connectorConfig.put(MAX_MESSAGE_BYTES_SIZE, "2"); + connectRunner.createConnector(connectorConfig); + + final String testData = "AABBCCDDEE"; + + writeToS3(topicName, testData.getBytes(StandardCharsets.UTF_8), "00000"); + + // Poll messages from the Kafka topic and verify the consumed data + final List<String> records = IntegrationBase.consumeMessages(topicName, 5, KAFKA_CONTAINER); + + // Verify that the correct data is read from the S3 bucket and pushed to Kafka + assertThat(records.size()).isEqualTo(5); + assertThat(records.get(0)).isEqualTo("AA"); + assertThat(records.get(1)).isEqualTo("BB"); + assertThat(records.get(2)).isEqualTo("CC"); + assertThat(records.get(3)).isEqualTo("DD"); + assertThat(records.get(4)).isEqualTo("EE"); + } + @Test void multiPartUploadBytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedException { final var topicName = IntegrationBase.topicName(testInfo); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 19d229ed7..9c1ac3d11 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -37,8 +37,8 @@ import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.output.Transformer; -import io.aiven.kafka.connect.s3.source.output.TransformerFactory; +import io.aiven.kafka.connect.s3.source.input.Transformer; +import io.aiven.kafka.connect.s3.source.input.TransformerFactory; import io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord; import io.aiven.kafka.connect.s3.source.utils.FileReader; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index c704d839b..c64e64dc0 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -31,7 +31,7 @@ import io.aiven.kafka.connect.common.config.validators.NonEmptyPassword; import io.aiven.kafka.connect.common.config.validators.UrlValidator; -import io.aiven.kafka.connect.s3.source.output.InputFormat; +import io.aiven.kafka.connect.s3.source.input.InputFormat; import com.amazonaws.auth.AWSCredentialsProvider; import com.amazonaws.regions.Region; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroTransformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java similarity index 98% rename from 
s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroTransformer.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java index 6a6f1678e..620aee2d4 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/AvroTransformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.output; +package io.aiven.kafka.connect.s3.source.input; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayTransformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java similarity index 93% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayTransformer.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java index dadbfa8e3..472d8b93a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ByteArrayTransformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.output; +package io.aiven.kafka.connect.s3.source.input; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_MESSAGE_BYTES_SIZE; @@ -34,7 +34,7 @@ public class ByteArrayTransformer implements Transformer { @Override public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { - // For byte array transformations, no explicit converter is configured. + // For byte array transformations, ByteArrayConverter is the converter which is the default config. } @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/InputFormat.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/InputFormat.java similarity index 95% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/InputFormat.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/InputFormat.java index 87a240182..12334ba7a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/InputFormat.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/InputFormat.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.s3.source.output; +package io.aiven.kafka.connect.s3.source.input; import java.util.Locale; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonTransformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java similarity index 98% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonTransformer.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java index 2c4fa286a..7e1010fa8 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/JsonTransformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.output; +package io.aiven.kafka.connect.s3.source.input; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMAS_ENABLE; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetTransformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformer.java similarity index 98% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetTransformer.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformer.java index 83a8c7e44..39fec83de 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/ParquetTransformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformer.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.output; +package io.aiven.kafka.connect.s3.source.input; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/TransformationUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformationUtils.java similarity index 98% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/TransformationUtils.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformationUtils.java index ee2f5726b..9c6e31f9d 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/TransformationUtils.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformationUtils.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.s3.source.output; +package io.aiven.kafka.connect.s3.source.input; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/Transformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/Transformer.java similarity index 95% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/Transformer.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/Transformer.java index d423ad991..70fe28d96 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/Transformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/Transformer.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.output; +package io.aiven.kafka.connect.s3.source.input; import java.io.InputStream; import java.util.List; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/TransformerFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java similarity index 96% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/TransformerFactory.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java index 55278c91d..95ca6619b 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/output/TransformerFactory.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.s3.source.output; +package io.aiven.kafka.connect.s3.source.input; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.INPUT_FORMAT_KEY; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 2a20e9ff2..5ebe3c919 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -29,7 +29,7 @@ import org.apache.kafka.connect.storage.Converter; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.output.Transformer; +import io.aiven.kafka.connect.s3.source.input.Transformer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 2fee6f5ff..ba758165b 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -33,7 +33,7 @@ import org.apache.kafka.clients.consumer.ConsumerRecord; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.output.Transformer; +import io.aiven.kafka.connect.s3.source.input.Transformer; import com.amazonaws.AmazonClientException; import com.amazonaws.services.s3.AmazonS3; diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index 7701fcf76..26a867f29 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -36,9 +36,9 @@ import org.apache.kafka.connect.storage.OffsetStorageReader; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.output.ByteArrayTransformer; -import io.aiven.kafka.connect.s3.source.output.InputFormat; -import io.aiven.kafka.connect.s3.source.output.Transformer; +import io.aiven.kafka.connect.s3.source.input.ByteArrayTransformer; +import io.aiven.kafka.connect.s3.source.input.InputFormat; +import io.aiven.kafka.connect.s3.source.input.Transformer; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord; import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java index b38bc0b47..2876840c2 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java @@ -23,7 +23,7 @@ import java.util.HashMap; -import io.aiven.kafka.connect.s3.source.output.InputFormat; +import io.aiven.kafka.connect.s3.source.input.InputFormat; import com.amazonaws.regions.RegionUtils; import 
com.amazonaws.regions.Regions; diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroTransformerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/AvroTransformerTest.java similarity index 98% rename from s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroTransformerTest.java rename to s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/AvroTransformerTest.java index c02b967a3..39b689736 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/AvroTransformerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/AvroTransformerTest.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.output; +package io.aiven.kafka.connect.s3.source.input; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static org.assertj.core.api.Assertions.assertThat; diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ByteArrayTransformerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java similarity index 98% rename from s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ByteArrayTransformerTest.java rename to s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java index 6d7093fc0..4c2bb0099 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ByteArrayTransformerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.output; +package io.aiven.kafka.connect.s3.source.input; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_MESSAGE_BYTES_SIZE; import static org.junit.jupiter.api.Assertions.assertArrayEquals; diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonTransformerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java similarity index 98% rename from s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonTransformerTest.java rename to s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java index 58b3e420d..e24711f36 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/JsonTransformerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.s3.source.output; +package io.aiven.kafka.connect.s3.source.input; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMAS_ENABLE; import static org.assertj.core.api.Assertions.assertThat; diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetTransformerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java similarity index 98% rename from s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetTransformerTest.java rename to s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java index 1006b1247..827cdf381 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/output/ParquetTransformerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.output; +package io.aiven.kafka.connect.s3.source.input; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.mock; diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java index f038468a1..8a2ac24dc 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java @@ -35,7 +35,7 @@ import io.aiven.kafka.connect.s3.source.AivenKafkaConnectS3SourceConnector; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.output.InputFormat; +import io.aiven.kafka.connect.s3.source.input.InputFormat; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.ListObjectsV2Request; diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java index 8f983d57f..9e1b65ec4 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -40,7 +40,7 @@ import org.apache.kafka.connect.storage.Converter; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.output.Transformer; +import io.aiven.kafka.connect.s3.source.input.Transformer; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index 4e23ec12c..a2fb31d9f 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -32,7 +32,7 @@ import java.util.List; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.output.Transformer; +import io.aiven.kafka.connect.s3.source.input.Transformer; import com.amazonaws.services.s3.AmazonS3; import 
com.amazonaws.services.s3.model.ListObjectsV2Result; From 9bc403eec08b1f8de00c3d64e9a9b6315c4f6311 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Thu, 17 Oct 2024 11:40:36 +0200 Subject: [PATCH 51/90] Remove logic for processed recs --- .../kafka/connect/s3/source/IntegrationTest.java | 9 ++++++--- .../aiven/kafka/connect/s3/source/S3SourceTask.java | 1 + .../connect/s3/source/config/S3SourceConfig.java | 2 +- .../kafka/connect/s3/source/utils/FileReader.java | 11 +++-------- .../connect/s3/source/utils/RecordProcessor.java | 3 ++- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 650549734..46c519540 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -23,6 +23,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.INPUT_FORMAT_KEY; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_MESSAGE_BYTES_SIZE; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; @@ -176,13 +177,15 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx @Test void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { + final String testData = "AABBCCDDEE"; final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); - connectorConfig.put(MAX_MESSAGE_BYTES_SIZE, "2"); - connectRunner.createConnector(connectorConfig); + connectorConfig.put(MAX_MESSAGE_BYTES_SIZE, "2"); // For above test data of 10 bytes length, with 2 bytes each + // in source record, we expect 5 records. 
+ connectorConfig.put(MAX_POLL_RECORDS, "2"); // In 3 polls all the 5 records should be processed - final String testData = "AABBCCDDEE"; + connectRunner.createConnector(connectorConfig); writeToS3(topicName, testData.getBytes(StandardCharsets.UTF_8), "00000"); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 9c1ac3d11..4f4541d63 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -142,6 +142,7 @@ private void prepareReaderFromOffsetStorageReader() { @Override public List<SourceRecord> poll() throws InterruptedException { + LOGGER.info("Polling again"); synchronized (pollLock) { final List<SourceRecord> results = new ArrayList<>(s3SourceConfig.getInt(MAX_POLL_RECORDS)); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index c64e64dc0..b98abf992 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -160,7 +160,7 @@ private static void addOtherConfig(final S3SourceConfigDef configDef) { ConfigDef.Importance.MEDIUM, "Fetch page size", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD // UnusedAssignment ConfigDef.Width.NONE, FETCH_PAGE_SIZE); - configDef.define(MAX_POLL_RECORDS, ConfigDef.Type.INT, 5, ConfigDef.Range.atLeast(1), + configDef.define(MAX_POLL_RECORDS, ConfigDef.Type.INT, 500, ConfigDef.Range.atLeast(1), ConfigDef.Importance.MEDIUM, "Max poll records", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD // UnusedAssignment ConfigDef.Width.NONE, MAX_POLL_RECORDS); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index 9e3e8ac49..d09a6c4c5 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; import java.util.HashSet; import java.util.Iterator; import java.util.List; @@ -54,16 +53,11 @@ public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName, this.inProcessObjectKeys = new HashSet<>(inProcessObjectKeys); } - public Set<String> getInProcessObjectKeys() { - return Collections.unmodifiableSet(this.inProcessObjectKeys); - } - @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") Iterator<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOException { final List<S3ObjectSummary> allSummaries = new ArrayList<>(); String continuationToken = null; ListObjectsV2Result objectListing; - do { // Create the request for listing objects final ListObjectsV2Request request = new ListObjectsV2Request().withBucketName(bucketName) @@ -78,7 +72,7 @@ Iterator<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws I .stream() .filter(objectSummary -> objectSummary.getSize() > 0) .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey())) - .filter(objectSummary -> 
!inProcessObjectKeys.contains(objectSummary.getKey())) + // .filter(objectSummary -> !inProcessObjectKeys.contains(objectSummary.getKey())) .collect(Collectors.toList()); allSummaries.addAll(filteredSummaries); // Add the filtered summaries to the main list @@ -86,8 +80,9 @@ Iterator<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws I // TO BE REMOVED before release allSummaries.forEach(objSummary -> LOGGER.info("Objects to be processed {} ", objSummary.getKey())); + // TODO handle objects in process // Objects being processed - allSummaries.forEach(objSummary -> this.inProcessObjectKeys.add(objSummary.getKey())); + // allSummaries.forEach(objSummary -> this.inProcessObjectKeys.add(objSummary.getKey())); // Check if there are more objects to fetch continuationToken = objectListing.getNextContinuationToken(); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 5ebe3c919..98840181a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -79,7 +79,8 @@ static SourceRecord createSourceRecord(final AivenS3SourceRecord aivenS3SourceRe final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); offsetManager.updateCurrentOffsets(aivenS3SourceRecord.getPartitionMap(), aivenS3SourceRecord.getOffsetMap()); - fileReader.removeProcessedObjectKeys(aivenS3SourceRecord.getObjectKey()); + // TODO + // fileReader.removeProcessedObjectKeys(aivenS3SourceRecord.getObjectKey()); aivenS3SourceRecord.setOffsetMap(offsetManager.getOffsets().get(aivenS3SourceRecord.getPartitionMap())); return aivenS3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue); } catch (DataException e) { From fe6a1808b37eadab6c6f034bb3505ad04e0742e6 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Thu, 17 Oct 2024 14:25:56 +0200 Subject: [PATCH 52/90] Removed intermediate ConsumerRecord --- .../connect/s3/source/IntegrationTest.java | 2 +- .../s3/source/utils/SourceRecordIterator.java | 47 ++++++++++--------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 46c519540..966fe92bc 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -148,8 +148,8 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectRunner.createConnector(connectorConfig); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); + connectRunner.createConnector(connectorConfig); final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; final String testData2 = "Hello, Kafka Connect S3 Source! 
object 2"; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index ba758165b..cca6ef08d 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -30,8 +30,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.kafka.clients.consumer.ConsumerRecord; - import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.input.Transformer; @@ -57,7 +55,7 @@ public final class SourceRecordIterator implements Iterator<AivenS3SourceRecord> private String currentObjectKey; private Iterator<S3ObjectSummary> nextFileIterator; - private Iterator<ConsumerRecord<byte[], byte[]>> recordIterator = Collections.emptyIterator(); + private Iterator<AivenS3SourceRecord> recordIterator = Collections.emptyIterator(); private final OffsetManager offsetManager; @@ -99,7 +97,7 @@ private void nextS3Object() { } } - private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() throws IOException { + private Iterator<AivenS3SourceRecord> createIteratorForCurrentFile() throws IOException { try (S3Object s3Object = s3Client.getObject(bucketName, currentObjectKey); S3ObjectInputStream inputStream = s3Object.getObjectContent()) { @@ -129,17 +127,17 @@ private Iterator<ConsumerRecord<byte[], byte[]>> createIteratorForCurrentFile() } @SuppressWarnings("PMD.CognitiveComplexity") - private Iterator<ConsumerRecord<byte[], byte[]>> getObjectIterator(final InputStream valueInputStream, - final String topic, final int topicPartition, final long startOffset, final Transformer transformer, + private Iterator<AivenS3SourceRecord> getObjectIterator(final InputStream valueInputStream, final String topic, + final int topicPartition, final long startOffset, final Transformer transformer, final Map<String, Object> partitionMap) { return new Iterator<>() { - private final Iterator<ConsumerRecord<byte[], byte[]>> internalIterator = readNext().iterator(); + private final Iterator<AivenS3SourceRecord> internalIterator = readNext().iterator(); - private List<ConsumerRecord<byte[], byte[]>> readNext() { + private List<AivenS3SourceRecord> readNext() { final Optional<byte[]> optionalKeyBytes = Optional.ofNullable(currentObjectKey) .map(k -> k.getBytes(StandardCharsets.UTF_8)); - final List<ConsumerRecord<byte[], byte[]>> consumerRecordList = new ArrayList<>(); + final List<AivenS3SourceRecord> sourceRecords = new ArrayList<>(); int numOfProcessedRecs = 1; boolean checkOffsetMap = true; @@ -153,18 +151,18 @@ private List<ConsumerRecord<byte[], byte[]>> readNext() { final byte[] valueBytes = transformer.getValueBytes(record, topic, s3SourceConfig); checkOffsetMap = false; - consumerRecordList.add(getConsumerRecord(optionalKeyBytes, valueBytes, topic, topicPartition, + sourceRecords.add(getSourceRecord(optionalKeyBytes, valueBytes, topic, topicPartition, offsetManager, startOffset, partitionMap)); - if (consumerRecordList.size() >= s3SourceConfig.getInt(MAX_POLL_RECORDS)) { + if (sourceRecords.size() >= s3SourceConfig.getInt(MAX_POLL_RECORDS)) { break; } numOfProcessedRecs++; } - return consumerRecordList; + return sourceRecords; } - private ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> key, final byte[] value, + 
private AivenS3SourceRecord getSourceRecord(final Optional<byte[]> key, final byte[] value, final String topic, final int topicPartition, final OffsetManager offsetManager, final long startOffset, final Map<String, Object> partitionMap) { @@ -180,7 +178,10 @@ private ConsumerRecord<byte[], byte[]> getConsumerRecord(final Optional<byte[]> offsetManager.createNewOffsetMap(partitionMap, currentObjectKey, currentOffset); } - return new ConsumerRecord<>(topic, topicPartition, currentOffset, key.orElse(null), value); + final Map<String, Object> offsetMap = offsetManager.getOffsetValueMap(currentObjectKey, currentOffset); + + return new AivenS3SourceRecord(partitionMap, offsetMap, topic, topicPartition, key.orElse(null), value, + currentObjectKey); } @Override @@ -189,7 +190,7 @@ public boolean hasNext() { } @Override - public ConsumerRecord<byte[], byte[]> next() { + public AivenS3SourceRecord next() { return internalIterator.next(); } }; @@ -211,14 +212,14 @@ public AivenS3SourceRecord next() { return null; // Or throw new NoSuchElementException(); } - final ConsumerRecord<byte[], byte[]> consumerRecord = recordIterator.next(); - final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(consumerRecord.topic(), - consumerRecord.partition(), bucketName); - final Map<String, Object> offsetMap = offsetManager.getOffsetValueMap(currentObjectKey, - consumerRecord.offset()); - - return new AivenS3SourceRecord(partitionMap, offsetMap, consumerRecord.topic(), consumerRecord.partition(), - consumerRecord.key(), consumerRecord.value(), currentObjectKey); + return recordIterator.next(); + // final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(consumerRecord.topic(), + // consumerRecord.partition(), bucketName); + // final Map<String, Object> offsetMap = offsetManager.getOffsetValueMap(currentObjectKey, + // consumerRecord.offset()); + // + // return new AivenS3SourceRecord(partitionMap, offsetMap, consumerRecord.topic(), consumerRecord.partition(), + // consumerRecord.key(), consumerRecord.value(), currentObjectKey); } @Override From 440c18d7aaccbf5adb24e0f9087d1c5f60b1dfab Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Thu, 17 Oct 2024 16:00:06 +0200 Subject: [PATCH 53/90] Removed commented code --- .../connect/s3/source/utils/SourceRecordIterator.java | 7 ------- 1 file changed, 7 deletions(-) diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index cca6ef08d..bde839d42 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -213,13 +213,6 @@ public AivenS3SourceRecord next() { } return recordIterator.next(); - // final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(consumerRecord.topic(), - // consumerRecord.partition(), bucketName); - // final Map<String, Object> offsetMap = offsetManager.getOffsetValueMap(currentObjectKey, - // consumerRecord.offset()); - // - // return new AivenS3SourceRecord(partitionMap, offsetMap, consumerRecord.topic(), consumerRecord.partition(), - // consumerRecord.key(), consumerRecord.value(), currentObjectKey); } @Override From bce849f19255b6c9e603f0e9bb1a689723f306c4 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Fri, 18 Oct 2024 
09:41:48 +0200 Subject: [PATCH 54/90] Updated from review --- .../kafka/connect/s3/source/S3SourceTask.java | 2 +- .../s3/source/config/S3SourceConfig.java | 2 +- .../s3/source/input/TransformerFactory.java | 2 +- .../connect/s3/source/utils/FileReader.java | 18 +++--------------- .../s3/source/utils/RecordProcessor.java | 2 -- .../s3/source/config/S3SourceConfigTest.java | 2 +- .../s3/source/utils/FileReaderTest.java | 3 +-- 7 files changed, 8 insertions(+), 23 deletions(-) diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 4f4541d63..180c32bda 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -112,7 +112,7 @@ public void start(final Map<String, String> props) { this.s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); this.transformer = TransformerFactory.getWriter(s3SourceConfig); offsetManager = new OffsetManager(context, s3SourceConfig); - fileReader = new FileReader(s3SourceConfig, this.s3Bucket, failedObjectKeys, inProcessObjectKeys); + fileReader = new FileReader(s3SourceConfig, this.s3Bucket, failedObjectKeys); prepareReaderFromOffsetStorageReader(); this.taskInitialized = true; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index b98abf992..1db2ba9aa 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -339,7 +339,7 @@ public String getAwsS3BucketName() { return getString(AWS_S3_BUCKET_NAME_CONFIG); } - public InputFormat getOutputFormat() { + public InputFormat getInputFormat() { return InputFormat.valueOf(getString(INPUT_FORMAT_KEY).toUpperCase(Locale.ROOT)); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java index 95ca6619b..a40e3ef22 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java @@ -26,7 +26,7 @@ private TransformerFactory() { // hidden } public static Transformer getWriter(final S3SourceConfig s3SourceConfig) { - final InputFormat inputFormatEnum = s3SourceConfig.getOutputFormat(); + final InputFormat inputFormatEnum = s3SourceConfig.getInputFormat(); switch (inputFormatEnum) { case AVRO : return new AvroTransformer(); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index d09a6c4c5..31cc4e381 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -43,14 +43,12 @@ public class FileReader { private final String bucketName; private final Set<String> failedObjectKeys; - private final Set<String> inProcessObjectKeys; - public FileReader(final S3SourceConfig s3SourceConfig, final 
String bucketName, final Set<String> failedObjectKeys, - final Set<String> inProcessObjectKeys) { + public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName, + final Set<String> failedObjectKeys) { this.s3SourceConfig = s3SourceConfig; this.bucketName = bucketName; this.failedObjectKeys = new HashSet<>(failedObjectKeys); - this.inProcessObjectKeys = new HashSet<>(inProcessObjectKeys); } @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") @@ -72,17 +70,11 @@ Iterator<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws I .stream() .filter(objectSummary -> objectSummary.getSize() > 0) .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey())) - // .filter(objectSummary -> !inProcessObjectKeys.contains(objectSummary.getKey())) .collect(Collectors.toList()); allSummaries.addAll(filteredSummaries); // Add the filtered summaries to the main list - // TO BE REMOVED before release - allSummaries.forEach(objSummary -> LOGGER.info("Objects to be processed {} ", objSummary.getKey())); - - // TODO handle objects in process - // Objects being processed - // allSummaries.forEach(objSummary -> this.inProcessObjectKeys.add(objSummary.getKey())); + allSummaries.forEach(objSummary -> LOGGER.debug("Objects to be processed {} ", objSummary.getKey())); // Check if there are more objects to fetch continuationToken = objectListing.getNextContinuationToken(); @@ -94,8 +86,4 @@ Iterator<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws I public void addFailedObjectKeys(final String objectKey) { this.failedObjectKeys.add(objectKey); } - - public void removeProcessedObjectKeys(final String objectKey) { - this.inProcessObjectKeys.remove(objectKey); - } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 98840181a..36954b542 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -79,8 +79,6 @@ static SourceRecord createSourceRecord(final AivenS3SourceRecord aivenS3SourceRe final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); offsetManager.updateCurrentOffsets(aivenS3SourceRecord.getPartitionMap(), aivenS3SourceRecord.getOffsetMap()); - // TODO - // fileReader.removeProcessedObjectKeys(aivenS3SourceRecord.getObjectKey()); aivenS3SourceRecord.setOffsetMap(offsetManager.getOffsets().get(aivenS3SourceRecord.getPartitionMap())); return aivenS3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue); } catch (DataException e) { diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java index 2876840c2..6e84687eb 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java @@ -57,7 +57,7 @@ void correctFullConfig() { assertThat(conf.getAwsS3EndPoint()).isEqualTo("AWS_S3_ENDPOINT"); assertThat(conf.getAwsS3Region()).isEqualTo(RegionUtils.getRegion("us-east-1")); - assertThat(conf.getOutputFormat()).isEqualTo(InputFormat.AVRO); + 
assertThat(conf.getInputFormat()).isEqualTo(InputFormat.AVRO); assertThat(conf.getTargetTopics()).isEqualTo("testtopic"); assertThat(conf.getTargetTopicPartitions()).isEqualTo("0,1"); assertThat(conf.getSchemaRegistryUrl()).isEqualTo("localhost:8081"); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java index 8a2ac24dc..190cb33c9 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java @@ -28,7 +28,6 @@ import java.io.IOException; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -64,7 +63,7 @@ public void setUp() { setBasicProperties(); final S3SourceConfig s3SourceConfig = new S3SourceConfig(properties); offsetManager = mock(OffsetManager.class); - fileReader = new FileReader(s3SourceConfig, TEST_BUCKET, Collections.emptySet(), new HashSet<>()); + fileReader = new FileReader(s3SourceConfig, TEST_BUCKET, Collections.emptySet()); s3Client = mock(AmazonS3.class); } From 6e12b3d480df85dfecadebf200d881be63e28ca1 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Fri, 18 Oct 2024 09:48:33 +0200 Subject: [PATCH 55/90] Rename writers --- .../io/aiven/kafka/connect/s3/source/S3SourceTask.java | 4 ++-- .../connect/s3/source/input/TransformerFactory.java | 2 +- .../kafka/connect/s3/source/S3SourceTaskTest.java | 2 +- .../s3/source/input/ParquetTransformerTest.java | 10 +++++----- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 180c32bda..4f2e54a52 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -110,7 +110,7 @@ public void start(final Map<String, String> props) { initializeConverters(); initializeS3Client(); this.s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); - this.transformer = TransformerFactory.getWriter(s3SourceConfig); + this.transformer = TransformerFactory.getTransformer(s3SourceConfig); offsetManager = new OffsetManager(context, s3SourceConfig); fileReader = new FileReader(s3SourceConfig, this.s3Bucket, failedObjectKeys); prepareReaderFromOffsetStorageReader(); @@ -222,7 +222,7 @@ public Converter getValueConverter() { return valueConverter; } - public Transformer getOutputWriter() { + public Transformer getTransformer() { return transformer; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java index a40e3ef22..4033e734a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java @@ -25,7 +25,7 @@ public final class TransformerFactory { private TransformerFactory() { // hidden } - public static Transformer getWriter(final S3SourceConfig s3SourceConfig) { + public static Transformer getTransformer(final 
S3SourceConfig s3SourceConfig) { final InputFormat inputFormatEnum = s3SourceConfig.getInputFormat(); switch (inputFormatEnum) { case AVRO : diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index 26a867f29..ef5dd3eec 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -136,7 +136,7 @@ void testS3SourceTaskInitialization() { final Converter valueConverter = s3SourceTask.getValueConverter(); assertThat(valueConverter).isInstanceOf(ByteArrayConverter.class); - final Transformer transformer = s3SourceTask.getOutputWriter(); + final Transformer transformer = s3SourceTask.getTransformer(); assertThat(transformer).isInstanceOf(ByteArrayTransformer.class); final boolean taskInitialized = s3SourceTask.isTaskInitialized(); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java index 827cdf381..69d7ac493 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java @@ -39,11 +39,11 @@ @ExtendWith(MockitoExtension.class) final class ParquetTransformerTest { - private ParquetTransformer parquetWriter; + private ParquetTransformer parquetTransformer; @BeforeEach public void setUp() { - parquetWriter = new ParquetTransformer(); + parquetTransformer = new ParquetTransformer(); } @Test @@ -54,7 +54,7 @@ void testHandleValueDataWithZeroBytes() { final String topic = "test-topic"; final int topicPartition = 0; - final List<Object> recs = parquetWriter.getRecords(inputStream, topic, topicPartition, s3SourceConfig); + final List<Object> recs = parquetTransformer.getRecords(inputStream, topic, topicPartition, s3SourceConfig); assertThat(recs).isEmpty(); } @@ -68,7 +68,7 @@ void testGetRecordsWithValidData() throws Exception { final String topic = "test-topic"; final int topicPartition = 0; - final List<Object> records = parquetWriter.getRecords(inputStream, topic, topicPartition, s3SourceConfig); + final List<Object> records = parquetTransformer.getRecords(inputStream, topic, topicPartition, s3SourceConfig); assertThat(records).isNotEmpty(); assertThat(records).extracting(record -> ((GenericRecord) record).get("name").toString()) @@ -85,7 +85,7 @@ void testGetRecordsWithInvalidData() { final String topic = "test-topic"; final int topicPartition = 0; - final List<Object> records = parquetWriter.getRecords(inputStream, topic, topicPartition, s3SourceConfig); + final List<Object> records = parquetTransformer.getRecords(inputStream, topic, topicPartition, s3SourceConfig); assertThat(records).isEmpty(); } From 9dda15501d3592c9456460e9d1ff63db05e26bbf Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Wed, 23 Oct 2024 16:52:16 +0200 Subject: [PATCH 56/90] Lazy file reader iterator, review changes --- .../connect/s3/source/IntegrationTest.java | 7 +- .../kafka/connect/s3/source/S3SourceTask.java | 12 ++- .../s3/source/config/S3SourceConfig.java | 17 +--- .../s3/source/input/ByteArrayTransformer.java | 4 +- .../s3/source/utils/AivenS3SourceRecord.java | 10 +-- 
.../connect/s3/source/utils/FileReader.java | 83 ++++++++++++------- .../s3/source/utils/RecordProcessor.java | 1 - .../s3/source/utils/SourceRecordIterator.java | 37 ++++----- .../input/ByteArrayTransformerTest.java | 8 +- 9 files changed, 94 insertions(+), 85 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 966fe92bc..0f4a3bf61 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -21,8 +21,8 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_ENDPOINT_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.EXPECTED_MAX_MESSAGE_BYTES; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.INPUT_FORMAT_KEY; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_MESSAGE_BYTES_SIZE; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; @@ -181,8 +181,9 @@ void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); - connectorConfig.put(MAX_MESSAGE_BYTES_SIZE, "2"); // For above test data of 10 bytes length, with 2 bytes each - // in source record, we expect 5 records. + connectorConfig.put(EXPECTED_MAX_MESSAGE_BYTES, "2"); // For above test data of 10 bytes length, with 2 bytes + // each + // in source record, we expect 5 records. 
connectorConfig.put(MAX_POLL_RECORDS, "2"); // In 3 polls all the 5 records should be processed connectRunner.createConnector(connectorConfig); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 4f2e54a52..eee1b36a2 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -120,11 +120,13 @@ public void start(final Map<String, String> props) { private void initializeConverters() { try { keyConverter = Optional - .of((Converter) s3SourceConfig.getClass("key.converter").getDeclaredConstructor().newInstance()); - valueConverter = (Converter) s3SourceConfig.getClass("value.converter") + .of((Converter) Class.forName((String) s3SourceConfig.originals().get("key.converter")) + .getDeclaredConstructor() + .newInstance()); + valueConverter = (Converter) Class.forName((String) s3SourceConfig.originals().get("value.converter")) .getDeclaredConstructor() .newInstance(); - } catch (InstantiationException | IllegalAccessException | InvocationTargetException + } catch (ClassNotFoundException | InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) { throw new ConnectException("Connect converters could not be instantiated.", e); } @@ -152,7 +154,8 @@ public List<SourceRecord> poll() throws InterruptedException { while (!connectorStopped.get()) { try { - return extractSourceRecords(results); + LOGGER.info("Number of records sent {}", extractSourceRecords(results).size()); + return results; } catch (AmazonS3Exception | DataException exception) { if (handleException(exception)) { return null; // NOPMD @@ -172,6 +175,7 @@ private boolean handleException(final RuntimeException exception) throws Interru if (((AmazonS3Exception) exception).isRetryable()) { LOGGER.warn("Retryable error while polling. 
Will sleep and try again.", exception); Thread.sleep(ERROR_BACKOFF); + prepareReaderFromOffsetStorageReader(); } else { return true; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 1db2ba9aa..77241348e 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -79,9 +79,7 @@ final public class S3SourceConfig extends AbstractConfig { public static final String TARGET_TOPICS = "topics"; public static final String FETCH_PAGE_SIZE = "aws.s3.fetch.page.size"; public static final String MAX_POLL_RECORDS = "max.poll.records"; - public static final String MAX_MESSAGE_BYTES_SIZE = "max.message.bytes"; - public static final String KEY_CONVERTER = "key.converter"; - public static final String VALUE_CONVERTER = "value.converter"; + public static final String EXPECTED_MAX_MESSAGE_BYTES = "expected.max.message.bytes"; public static final int S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT = 3; public static final String INPUT_FORMAT_KEY = "input.format"; public static final String SCHEMAS_ENABLE = "schemas.enable"; @@ -164,20 +162,11 @@ private static void addOtherConfig(final S3SourceConfigDef configDef) { ConfigDef.Importance.MEDIUM, "Max poll records", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD // UnusedAssignment ConfigDef.Width.NONE, MAX_POLL_RECORDS); - configDef.define(KEY_CONVERTER, ConfigDef.Type.CLASS, "org.apache.kafka.connect.converters.ByteArrayConverter", - ConfigDef.Importance.MEDIUM, "Key converter", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, KEY_CONVERTER); - configDef.define(VALUE_CONVERTER, ConfigDef.Type.CLASS, - "org.apache.kafka.connect.converters.ByteArrayConverter", ConfigDef.Importance.MEDIUM, - "Value converter", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, VALUE_CONVERTER); - configDef.define(MAX_MESSAGE_BYTES_SIZE, ConfigDef.Type.INT, 1_048_588, ConfigDef.Importance.MEDIUM, + configDef.define(EXPECTED_MAX_MESSAGE_BYTES, ConfigDef.Type.INT, 1_048_588, ConfigDef.Importance.MEDIUM, "The largest record batch size allowed by Kafka config max.message.bytes", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD // UnusedAssignment - ConfigDef.Width.NONE, MAX_MESSAGE_BYTES_SIZE); + ConfigDef.Width.NONE, EXPECTED_MAX_MESSAGE_BYTES); } private static void addAwsStsConfigGroup(final ConfigDef configDef) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java index 472d8b93a..bc53e6330 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java @@ -16,7 +16,7 @@ package io.aiven.kafka.connect.s3.source.input; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_MESSAGE_BYTES_SIZE; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.EXPECTED_MAX_MESSAGE_BYTES; import java.io.IOException; import java.io.InputStream; @@ -42,7 +42,7 @@ public void configureValueConverter(final Map<String, String> config, final S3So public List<Object> getRecords(final 
InputStream inputStream, final String topic, final int topicPartition, final S3SourceConfig s3SourceConfig) { - final int maxMessageBytesSize = s3SourceConfig.getInt(MAX_MESSAGE_BYTES_SIZE); + final int maxMessageBytesSize = s3SourceConfig.getInt(EXPECTED_MAX_MESSAGE_BYTES); final byte[] buffer = new byte[maxMessageBytesSize]; int bytesRead; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java index d3008fc25..87803c636 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java @@ -16,7 +16,6 @@ package io.aiven.kafka.connect.s3.source.utils; -import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.Map; @@ -43,8 +42,9 @@ public AivenS3SourceRecord(final Map<String, Object> partitionMap, final Map<Str this.topic = topic; this.topicPartition = topicPartition; - this.recordKey = Arrays.copyOf(recordKey, recordKey.length); - this.recordValue = Arrays.copyOf(recordValue, recordValue.length); + this.recordKey = recordKey.clone(); // Defensive copy + this.recordValue = recordValue.clone(); // Defensive copy + this.objectKey = objectKey; } @@ -65,11 +65,11 @@ public Integer partition() { } public byte[] key() { - return recordKey.clone(); + return (recordKey == null) ? null : recordKey.clone(); // Return a defensive copy } public byte[] value() { - return recordValue.clone(); + return (recordValue == null) ? null : recordValue.clone(); // Return a defensive copy } public String getObjectKey() { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index 31cc4e381..f5c9ee864 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -18,7 +18,6 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; -import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; @@ -52,37 +51,59 @@ public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName, } @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") - Iterator<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) throws IOException { - final List<S3ObjectSummary> allSummaries = new ArrayList<>(); - String continuationToken = null; - ListObjectsV2Result objectListing; - do { - // Create the request for listing objects - final ListObjectsV2Request request = new ListObjectsV2Request().withBucketName(bucketName) - .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR) - .withContinuationToken(continuationToken); // Set continuation token for pagination - - // List objects from S3 - objectListing = s3Client.listObjectsV2(request); - - // Filter out zero-byte objects and add to the list - final List<S3ObjectSummary> filteredSummaries = objectListing.getObjectSummaries() - .stream() - .filter(objectSummary -> objectSummary.getSize() > 0) - .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey())) - .collect(Collectors.toList()); - - allSummaries.addAll(filteredSummaries); // Add the 
filtered summaries to the main list - - allSummaries.forEach(objSummary -> LOGGER.debug("Objects to be processed {} ", objSummary.getKey())); - - // Check if there are more objects to fetch - continuationToken = objectListing.getNextContinuationToken(); - } while (objectListing.isTruncated()); // Continue fetching if the result is truncated - - return allSummaries.iterator(); + Iterator<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) { + return new Iterator<>() { + private String continuationToken = null; // NOPMD + private List<S3ObjectSummary> currentBatch = new ArrayList<>(); + private int currentIndex = 0; // NOPMD + private boolean isTruncated = true; + + @Override + public boolean hasNext() { + // If there are unprocessed objects in the current batch, we return true + if (currentIndex < currentBatch.size()) { + return true; + } + + if (isTruncated) { + fetchNextBatch(); + return !currentBatch.isEmpty(); + } + + return false; + } + + @Override + public S3ObjectSummary next() { + if (!hasNext()) { + return null; + } + + return currentBatch.get(currentIndex++); + } + + private void fetchNextBatch() { + currentBatch.clear(); + currentIndex = 0; + + final ListObjectsV2Request request = new ListObjectsV2Request().withBucketName(bucketName) + .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR) + .withContinuationToken(continuationToken); + + final ListObjectsV2Result objectListing = s3Client.listObjectsV2(request); + currentBatch = objectListing.getObjectSummaries() + .stream() + .filter(objectSummary -> objectSummary.getSize() > 0) + .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey())) + .collect(Collectors.toList()); + + continuationToken = objectListing.getNextContinuationToken(); + isTruncated = objectListing.isTruncated(); + + currentBatch.forEach(objSummary -> LOGGER.debug("Objects to be processed {} ", objSummary.getKey())); + } + }; } - public void addFailedObjectKeys(final String objectKey) { this.failedObjectKeys.add(objectKey); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 36954b542..337870a9f 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -60,7 +60,6 @@ public static List<SourceRecord> processRecords(final Iterator<AivenS3SourceReco } } - LOGGER.info("Number of records sent {}", results.size()); return results; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index bde839d42..e59e1a7c4 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -26,7 +26,6 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Optional; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -54,7 +53,7 @@ public final class SourceRecordIterator implements Iterator<AivenS3SourceRecord> + "(?<partitionId>\\d{5})-" + "(?<uniqueId>[a-zA-Z0-9]+)" + "\\.(?<fileExtension>[^.]+)$"); // topic-00001.txt private String currentObjectKey; - private 
Iterator<S3ObjectSummary> nextFileIterator; + private final Iterator<S3ObjectSummary> s3ObjectSummaryIterator; private Iterator<AivenS3SourceRecord> recordIterator = Collections.emptyIterator(); private final OffsetManager offsetManager; @@ -75,23 +74,21 @@ public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 this.bucketName = bucketName; this.transformer = transformer; this.fileReader = fileReader; - try { - nextFileIterator = fileReader.fetchObjectSummaries(s3Client); - } catch (IOException e) { - throw new AmazonClientException("Failed to initialize S3 file reader", e); - } + s3ObjectSummaryIterator = fileReader.fetchObjectSummaries(s3Client); } private void nextS3Object() { - if (!nextFileIterator.hasNext()) { + if (!s3ObjectSummaryIterator.hasNext()) { recordIterator = Collections.emptyIterator(); return; } try { - final S3ObjectSummary file = nextFileIterator.next(); - currentObjectKey = file.getKey(); - recordIterator = createIteratorForCurrentFile(); + final S3ObjectSummary file = s3ObjectSummaryIterator.next(); + if (file != null) { + currentObjectKey = file.getKey(); + recordIterator = createIteratorForCurrentFile(); + } } catch (IOException e) { throw new AmazonClientException(e); } @@ -134,9 +131,7 @@ private Iterator<AivenS3SourceRecord> getObjectIterator(final InputStream valueI private final Iterator<AivenS3SourceRecord> internalIterator = readNext().iterator(); private List<AivenS3SourceRecord> readNext() { - - final Optional<byte[]> optionalKeyBytes = Optional.ofNullable(currentObjectKey) - .map(k -> k.getBytes(StandardCharsets.UTF_8)); + final byte[] keyBytes = currentObjectKey.getBytes(StandardCharsets.UTF_8); final List<AivenS3SourceRecord> sourceRecords = new ArrayList<>(); int numOfProcessedRecs = 1; @@ -151,8 +146,8 @@ private List<AivenS3SourceRecord> readNext() { final byte[] valueBytes = transformer.getValueBytes(record, topic, s3SourceConfig); checkOffsetMap = false; - sourceRecords.add(getSourceRecord(optionalKeyBytes, valueBytes, topic, topicPartition, - offsetManager, startOffset, partitionMap)); + sourceRecords.add(getSourceRecord(keyBytes, valueBytes, topic, topicPartition, offsetManager, + startOffset, partitionMap)); if (sourceRecords.size() >= s3SourceConfig.getInt(MAX_POLL_RECORDS)) { break; } @@ -162,9 +157,9 @@ private List<AivenS3SourceRecord> readNext() { return sourceRecords; } - private AivenS3SourceRecord getSourceRecord(final Optional<byte[]> key, final byte[] value, - final String topic, final int topicPartition, final OffsetManager offsetManager, - final long startOffset, final Map<String, Object> partitionMap) { + private AivenS3SourceRecord getSourceRecord(final byte[] key, final byte[] value, final String topic, + final int topicPartition, final OffsetManager offsetManager, final long startOffset, + final Map<String, Object> partitionMap) { long currentOffset; @@ -180,7 +175,7 @@ private AivenS3SourceRecord getSourceRecord(final Optional<byte[]> key, final by final Map<String, Object> offsetMap = offsetManager.getOffsetValueMap(currentObjectKey, currentOffset); - return new AivenS3SourceRecord(partitionMap, offsetMap, topic, topicPartition, key.orElse(null), value, + return new AivenS3SourceRecord(partitionMap, offsetMap, topic, topicPartition, key, value, currentObjectKey); } @@ -198,7 +193,7 @@ public AivenS3SourceRecord next() { @Override public boolean hasNext() { - return recordIterator.hasNext() || nextFileIterator.hasNext(); + return recordIterator.hasNext() || s3ObjectSummaryIterator.hasNext(); } @Override 
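The listing change above replaces an eager, fully materialised S3 listing with a lazy, page-by-page iterator. A minimal, self-contained sketch of the same pattern is shown below; the class and field names here are illustrative and not part of the connector. It fetches the next page from `listObjectsV2` only once the current page has been consumed, and it follows the standard `Iterator` contract by throwing `NoSuchElementException` rather than returning `null`.

```java
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.ListObjectsV2Request;
import com.amazonaws.services.s3.model.ListObjectsV2Result;
import com.amazonaws.services.s3.model.S3ObjectSummary;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

// Illustrative sketch: lazily paginate an S3 bucket listing so that only one
// page of object summaries is held in memory at a time.
final class LazyObjectSummaryIterator implements Iterator<S3ObjectSummary> {
    private final AmazonS3 s3Client;
    private final String bucketName;
    private final int pageSize;

    private List<S3ObjectSummary> currentPage = new ArrayList<>();
    private int index;
    private String continuationToken;
    private boolean moreToFetch = true;

    LazyObjectSummaryIterator(final AmazonS3 s3Client, final String bucketName, final int pageSize) {
        this.s3Client = s3Client;
        this.bucketName = bucketName;
        this.pageSize = pageSize;
    }

    @Override
    public boolean hasNext() {
        if (index < currentPage.size()) {
            return true; // unconsumed entries remain in the current page
        }
        if (moreToFetch) {
            fetchNextPage();
            return !currentPage.isEmpty();
        }
        return false;
    }

    @Override
    public S3ObjectSummary next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        return currentPage.get(index++);
    }

    private void fetchNextPage() {
        final ListObjectsV2Request request = new ListObjectsV2Request()
                .withBucketName(bucketName)
                .withMaxKeys(pageSize)
                .withContinuationToken(continuationToken); // null on the first request
        final ListObjectsV2Result result = s3Client.listObjectsV2(request);
        currentPage = result.getObjectSummaries();
        index = 0;
        continuationToken = result.getNextContinuationToken();
        moreToFetch = result.isTruncated();
    }
}
```

Keeping the continuation token and truncation flag inside the iterator means the caller never sees pagination at all, and buckets with very large listings no longer require building the whole summary list up front.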
diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java index 4c2bb0099..db743748f 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java @@ -16,7 +16,7 @@ package io.aiven.kafka.connect.s3.source.input; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_MESSAGE_BYTES_SIZE; +import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.EXPECTED_MAX_MESSAGE_BYTES; import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.mockito.Mockito.when; @@ -52,7 +52,7 @@ void testGetRecordsSingleChunk() { final byte[] data = { 1, 2, 3, 4, 5 }; final InputStream inputStream = new ByteArrayInputStream(data); - when(s3SourceConfig.getInt(MAX_MESSAGE_BYTES_SIZE)).thenReturn(10_000); // Larger than data size + when(s3SourceConfig.getInt(EXPECTED_MAX_MESSAGE_BYTES)).thenReturn(10_000); // Larger than data size final List<Object> records = byteArrayTransformer.getRecords(inputStream, "test-topic", 0, s3SourceConfig); @@ -65,7 +65,7 @@ void testGetRecordsMultipleChunks() { final byte[] data = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; final InputStream inputStream = new ByteArrayInputStream(data); - when(s3SourceConfig.getInt(MAX_MESSAGE_BYTES_SIZE)).thenReturn(5); // Smaller than data size + when(s3SourceConfig.getInt(EXPECTED_MAX_MESSAGE_BYTES)).thenReturn(5); // Smaller than data size final List<Object> records = byteArrayTransformer.getRecords(inputStream, "test-topic", 0, s3SourceConfig); @@ -78,7 +78,7 @@ void testGetRecordsMultipleChunks() { void testGetRecordsEmptyInputStream() throws IOException { final InputStream inputStream = new ByteArrayInputStream(new byte[] {}); - when(s3SourceConfig.getInt(MAX_MESSAGE_BYTES_SIZE)).thenReturn(5); + when(s3SourceConfig.getInt(EXPECTED_MAX_MESSAGE_BYTES)).thenReturn(5); final List<Object> records = byteArrayTransformer.getRecords(inputStream, "test-topic", 0, s3SourceConfig); From 633268237c692348364c013e467767d79f9f996d Mon Sep 17 00:00:00 2001 From: Anatolii Popov <anatolii.popov@aiven.io> Date: Tue, 5 Nov 2024 16:40:44 +0200 Subject: [PATCH 57/90] chore: enabling GH actions for PRs into S3 source feature branch --- .github/workflows/codeql-analysis.yml | 8 ++++++-- .github/workflows/main_push_workflow.yml | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index b6a7393ee..9a5db94e7 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -13,10 +13,14 @@ name: "CodeQL" on: push: - branches: [main] + branches: + - main + - s3-source-release pull_request: # The branches below must be a subset of the branches above - branches: [main] + branches: + - main + - s3-source-release schedule: - cron: "42 20 * * 6" diff --git a/.github/workflows/main_push_workflow.yml b/.github/workflows/main_push_workflow.yml index 7db41ce21..8244ec088 100644 --- a/.github/workflows/main_push_workflow.yml +++ b/.github/workflows/main_push_workflow.yml @@ -2,9 +2,13 @@ name: Main and pull request checks on: push: - branches: [ main ] + branches: + - main + - s3-source-release pull_request: - branches: [ main ] 
+ branches: + - main + - s3-source-release jobs: build: strategy: From b4f91f93b94abecc00d28cce800394f63dcb00a1 Mon Sep 17 00:00:00 2001 From: Murali Basani <muralidhar.basani@aiven.io> Date: Wed, 6 Nov 2024 15:43:38 +0100 Subject: [PATCH 58/90] Improve logging and some exception handling for clarity --- .../connect/s3/source/IntegrationBase.java | 8 ++-- .../connect/s3/source/IntegrationTest.java | 31 -------------- .../kafka/connect/s3/source/S3SourceTask.java | 42 ++++++++----------- .../s3/source/input/AvroTransformer.java | 6 ++- .../s3/source/input/JsonTransformer.java | 2 +- .../s3/source/utils/RecordProcessor.java | 28 ++++++------- ...3SourceRecord.java => S3SourceRecord.java} | 4 +- .../s3/source/utils/SourceRecordIterator.java | 23 +++++----- .../connect/s3/source/S3SourceTaskTest.java | 10 ++--- .../s3/source/utils/RecordProcessorTest.java | 6 +-- 10 files changed, 62 insertions(+), 98 deletions(-) rename s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/{AivenS3SourceRecord.java => S3SourceRecord.java} (95%) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index d89d85473..67253ec29 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -59,6 +59,8 @@ public interface IntegrationBase { String DOCKER_IMAGE_KAFKA = "confluentinc/cp-kafka:7.7.0"; + String PLUGINS_S_3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA = "plugins/s3-source-connector-for-apache-kafka/"; + String S_3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA_TEST = "s3-source-connector-for-apache-kafka-test-"; default AdminClient newAdminClient(final KafkaContainer kafka) { final Properties adminClientConfig = new Properties(); @@ -81,9 +83,9 @@ static void extractConnectorPlugin(File pluginDir) throws IOException, Interrupt } static File getPluginDir() throws IOException { - final File testDir = Files.createTempDirectory("s3-source-connector-for-apache-kafka-test-").toFile(); + final File testDir = Files.createTempDirectory(S_3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA_TEST).toFile(); - final File pluginDir = new File(testDir, "plugins/s3-source-connector-for-apache-kafka/"); + final File pluginDir = new File(testDir, PLUGINS_S_3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA); assert pluginDir.mkdirs(); return pluginDir; } @@ -98,7 +100,7 @@ static KafkaContainer createKafkaContainer() { } static String topicName(final TestInfo testInfo) { - return testInfo.getTestMethod().get().getName();// + "-" + testInfo.getDisplayName().hashCode(); + return testInfo.getTestMethod().get().getName(); } static void createTopics(final AdminClient adminClient, final List<String> topicNames) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 0f4a3bf61..461e991ad 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -34,7 +34,6 @@ import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; -import java.io.InputStream; import java.nio.charset.StandardCharsets; import 
java.nio.file.Files; import java.nio.file.Path; @@ -50,7 +49,6 @@ import io.aiven.kafka.connect.s3.source.input.InputFormat; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; -import io.aiven.kafka.connect.s3.source.testutils.S3OutputStream; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.PutObjectRequest; @@ -61,7 +59,6 @@ import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumWriter; -import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; @@ -202,20 +199,6 @@ void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) assertThat(records.get(4)).isEqualTo("EE"); } - @Test - void multiPartUploadBytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedException { - final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - - connectRunner.createConnector(connectorConfig); - final String partition = "00001"; - final String key = topicName + "-" + partition + "-" + System.currentTimeMillis() + ".txt"; - multipartUpload(TEST_BUCKET_NAME, key); - // Poll messages from the Kafka topic and verify the consumed data - final List<String> records = IntegrationBase.consumeMessages(topicName, 1, KAFKA_CONTAINER); - assertThat(records.get(0)).contains("performanceeeqjz"); - } - @Test void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { final var topicName = IntegrationBase.topicName(testInfo); @@ -384,18 +367,4 @@ public static void saveToS3(final String bucketName, final String folderName, fi final PutObjectRequest request = new PutObjectRequest(bucketName, folderName + fileNameInS3, fileToWrite); s3Client.putObject(request); } - - public void multipartUpload(final String bucketName, final String key) { - try (S3OutputStream s3OutputStream = new S3OutputStream(bucketName, key, S3OutputStream.DEFAULT_PART_SIZE, - s3Client); - InputStream resourceStream = Thread.currentThread() - .getContextClassLoader() - .getResourceAsStream(S3_FILE_NAME)) { - assert resourceStream != null; - final byte[] fileBytes = IOUtils.toByteArray(resourceStream); - s3OutputStream.write(fileBytes); - } catch (IOException e) { - LOGGER.error(e.getMessage()); - } - } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index eee1b36a2..087fd0451 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -39,10 +39,10 @@ import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.input.Transformer; import io.aiven.kafka.connect.s3.source.input.TransformerFactory; -import io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord; import io.aiven.kafka.connect.s3.source.utils.FileReader; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import io.aiven.kafka.connect.s3.source.utils.RecordProcessor; +import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; import 
io.aiven.kafka.connect.s3.source.utils.Version; @@ -72,7 +72,7 @@ public class S3SourceTask extends SourceTask { private S3SourceConfig s3SourceConfig; private AmazonS3 s3Client; - private Iterator<AivenS3SourceRecord> sourceRecordIterator; + private Iterator<S3SourceRecord> sourceRecordIterator; private Optional<Converter> keyConverter; private Converter valueConverter; @@ -144,24 +144,35 @@ private void prepareReaderFromOffsetStorageReader() { @Override public List<SourceRecord> poll() throws InterruptedException { - LOGGER.info("Polling again"); + LOGGER.info("Polling for new records..."); synchronized (pollLock) { final List<SourceRecord> results = new ArrayList<>(s3SourceConfig.getInt(MAX_POLL_RECORDS)); if (connectorStopped.get()) { + LOGGER.info("Connector has been stopped. Returning empty result list."); return results; } while (!connectorStopped.get()) { try { - LOGGER.info("Number of records sent {}", extractSourceRecords(results).size()); + extractSourceRecords(results); + LOGGER.info("Number of records extracted and sent: {}", results.size()); return results; - } catch (AmazonS3Exception | DataException exception) { - if (handleException(exception)) { + } catch (AmazonS3Exception exception) { + if (exception.isRetryable()) { + LOGGER.warn("Retryable error encountered during polling. Waiting before retrying...", + exception); + pollLock.wait(ERROR_BACKOFF); + + prepareReaderFromOffsetStorageReader(); + } else { + LOGGER.warn("Non-retryable AmazonS3Exception occurred. Stopping polling.", exception); return null; // NOPMD } + } catch (DataException exception) { + LOGGER.warn("DataException occurred during polling. No retries will be attempted.", exception); } catch (final Throwable t) { // NOPMD - // This task has failed, so close any resources (may be reopened if needed) before throwing + LOGGER.error("Unexpected error encountered. Closing resources and stopping task.", t); closeResources(); throw t; } @@ -170,23 +181,6 @@ public List<SourceRecord> poll() throws InterruptedException { } } - private boolean handleException(final RuntimeException exception) throws InterruptedException { - if (exception instanceof AmazonS3Exception) { - if (((AmazonS3Exception) exception).isRetryable()) { - LOGGER.warn("Retryable error while polling. Will sleep and try again.", exception); - Thread.sleep(ERROR_BACKOFF); - - prepareReaderFromOffsetStorageReader(); - } else { - return true; - } - } - if (exception instanceof DataException) { - LOGGER.warn("DataException. 
Will NOT try again.", exception); - } - return false; - } - private List<SourceRecord> extractSourceRecords(final List<SourceRecord> results) throws InterruptedException { waitForObjects(); if (connectorStopped.get()) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java index 620aee2d4..a781f6bd1 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java @@ -64,10 +64,12 @@ List<Object> readAvroRecords(final InputStream content, final DatumReader<Generi try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { reader.forEach(records::add); } catch (IOException e) { - LOGGER.error("Error in reading s3 object stream {}", e.getMessage(), e); + LOGGER.error("Failed to read records from DataFileReader for S3 object stream. Error: {}", + e.getMessage(), e); } } catch (IOException e) { - LOGGER.error("Error in reading s3 object stream {}", e.getMessage(), e); + LOGGER.error("Failed to initialize SeekableByteArrayInput for S3 object stream. Error: {}", e.getMessage(), + e); } return records; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java index 7e1010fa8..5cda04f1a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java @@ -78,7 +78,7 @@ public byte[] getValueBytes(final Object record, final String topic, final S3Sou try { return objectMapper.writeValueAsBytes(record); } catch (JsonProcessingException e) { - LOGGER.error("Error in reading s3 object stream {}", e.getMessage(), e); + LOGGER.error("Failed to serialize record to JSON bytes. 
Error: {}", e.getMessage(), e); return new byte[0]; } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 337870a9f..40bf80bc4 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -42,7 +42,7 @@ private RecordProcessor() { } - public static List<SourceRecord> processRecords(final Iterator<AivenS3SourceRecord> sourceRecordIterator, + public static List<SourceRecord> processRecords(final Iterator<S3SourceRecord> sourceRecordIterator, final List<SourceRecord> results, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, final AtomicBoolean connectorStopped, final Transformer transformer, final FileReader fileReader, @@ -52,9 +52,9 @@ public static List<SourceRecord> processRecords(final Iterator<AivenS3SourceReco final int maxPollRecords = s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS); for (int i = 0; sourceRecordIterator.hasNext() && i < maxPollRecords && !connectorStopped.get(); i++) { - final AivenS3SourceRecord aivenS3SourceRecord = sourceRecordIterator.next(); - if (aivenS3SourceRecord != null) { - final SourceRecord sourceRecord = createSourceRecord(aivenS3SourceRecord, s3SourceConfig, keyConverter, + final S3SourceRecord s3SourceRecord = sourceRecordIterator.next(); + if (s3SourceRecord != null) { + final SourceRecord sourceRecord = createSourceRecord(s3SourceRecord, s3SourceConfig, keyConverter, valueConverter, conversionConfig, transformer, fileReader, offsetManager); results.add(sourceRecord); } @@ -63,26 +63,24 @@ public static List<SourceRecord> processRecords(final Iterator<AivenS3SourceReco return results; } - static SourceRecord createSourceRecord(final AivenS3SourceRecord aivenS3SourceRecord, - final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, + static SourceRecord createSourceRecord(final S3SourceRecord s3SourceRecord, final S3SourceConfig s3SourceConfig, + final Optional<Converter> keyConverter, final Converter valueConverter, final Map<String, String> conversionConfig, final Transformer transformer, final FileReader fileReader, final OffsetManager offsetManager) { - final String topic = aivenS3SourceRecord.getTopic(); - final Optional<SchemaAndValue> keyData = keyConverter - .map(c -> c.toConnectData(topic, aivenS3SourceRecord.key())); + final String topic = s3SourceRecord.getTopic(); + final Optional<SchemaAndValue> keyData = keyConverter.map(c -> c.toConnectData(topic, s3SourceRecord.key())); transformer.configureValueConverter(conversionConfig, s3SourceConfig); valueConverter.configure(conversionConfig, false); try { - final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, aivenS3SourceRecord.value()); - offsetManager.updateCurrentOffsets(aivenS3SourceRecord.getPartitionMap(), - aivenS3SourceRecord.getOffsetMap()); - aivenS3SourceRecord.setOffsetMap(offsetManager.getOffsets().get(aivenS3SourceRecord.getPartitionMap())); - return aivenS3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue); + final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, s3SourceRecord.value()); + offsetManager.updateCurrentOffsets(s3SourceRecord.getPartitionMap(), s3SourceRecord.getOffsetMap()); + 
s3SourceRecord.setOffsetMap(offsetManager.getOffsets().get(s3SourceRecord.getPartitionMap())); + return s3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue); } catch (DataException e) { LOGGER.error("Error in reading s3 object stream {}", e.getMessage(), e); - fileReader.addFailedObjectKeys(aivenS3SourceRecord.getObjectKey()); + fileReader.addFailedObjectKeys(s3SourceRecord.getObjectKey()); throw e; } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java similarity index 95% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java index 87803c636..7880bf868 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AivenS3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java @@ -24,7 +24,7 @@ import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.kafka.connect.source.SourceRecord; -public class AivenS3SourceRecord { +public class S3SourceRecord { private final Map<String, Object> partitionMap; private Map<String, Object> offsetMap; private final String topic; @@ -34,7 +34,7 @@ public class AivenS3SourceRecord { private final String objectKey; - public AivenS3SourceRecord(final Map<String, Object> partitionMap, final Map<String, Object> offsetMap, + public S3SourceRecord(final Map<String, Object> partitionMap, final Map<String, Object> offsetMap, final String topic, final Integer topicPartition, final byte[] recordKey, final byte[] recordValue, final String objectKey) { this.partitionMap = new HashMap<>(partitionMap); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index e59e1a7c4..8c1fcb77d 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -44,7 +44,7 @@ * Iterator that processes S3 files and creates Kafka source records. Supports different output formats (Avro, JSON, * Parquet). 
*/ -public final class SourceRecordIterator implements Iterator<AivenS3SourceRecord> { +public final class SourceRecordIterator implements Iterator<S3SourceRecord> { private static final Logger LOGGER = LoggerFactory.getLogger(SourceRecordIterator.class); public static final String PATTERN_TOPIC_KEY = "topicName"; public static final String PATTERN_PARTITION_KEY = "partitionId"; @@ -54,7 +54,7 @@ public final class SourceRecordIterator implements Iterator<AivenS3SourceRecord> private String currentObjectKey; private final Iterator<S3ObjectSummary> s3ObjectSummaryIterator; - private Iterator<AivenS3SourceRecord> recordIterator = Collections.emptyIterator(); + private Iterator<S3SourceRecord> recordIterator = Collections.emptyIterator(); private final OffsetManager offsetManager; @@ -94,7 +94,7 @@ private void nextS3Object() { } } - private Iterator<AivenS3SourceRecord> createIteratorForCurrentFile() throws IOException { + private Iterator<S3SourceRecord> createIteratorForCurrentFile() throws IOException { try (S3Object s3Object = s3Client.getObject(bucketName, currentObjectKey); S3ObjectInputStream inputStream = s3Object.getObjectContent()) { @@ -124,15 +124,15 @@ private Iterator<AivenS3SourceRecord> createIteratorForCurrentFile() throws IOEx } @SuppressWarnings("PMD.CognitiveComplexity") - private Iterator<AivenS3SourceRecord> getObjectIterator(final InputStream valueInputStream, final String topic, + private Iterator<S3SourceRecord> getObjectIterator(final InputStream valueInputStream, final String topic, final int topicPartition, final long startOffset, final Transformer transformer, final Map<String, Object> partitionMap) { return new Iterator<>() { - private final Iterator<AivenS3SourceRecord> internalIterator = readNext().iterator(); + private final Iterator<S3SourceRecord> internalIterator = readNext().iterator(); - private List<AivenS3SourceRecord> readNext() { + private List<S3SourceRecord> readNext() { final byte[] keyBytes = currentObjectKey.getBytes(StandardCharsets.UTF_8); - final List<AivenS3SourceRecord> sourceRecords = new ArrayList<>(); + final List<S3SourceRecord> sourceRecords = new ArrayList<>(); int numOfProcessedRecs = 1; boolean checkOffsetMap = true; @@ -157,7 +157,7 @@ private List<AivenS3SourceRecord> readNext() { return sourceRecords; } - private AivenS3SourceRecord getSourceRecord(final byte[] key, final byte[] value, final String topic, + private S3SourceRecord getSourceRecord(final byte[] key, final byte[] value, final String topic, final int topicPartition, final OffsetManager offsetManager, final long startOffset, final Map<String, Object> partitionMap) { @@ -175,8 +175,7 @@ private AivenS3SourceRecord getSourceRecord(final byte[] key, final byte[] value final Map<String, Object> offsetMap = offsetManager.getOffsetValueMap(currentObjectKey, currentOffset); - return new AivenS3SourceRecord(partitionMap, offsetMap, topic, topicPartition, key, value, - currentObjectKey); + return new S3SourceRecord(partitionMap, offsetMap, topic, topicPartition, key, value, currentObjectKey); } @Override @@ -185,7 +184,7 @@ public boolean hasNext() { } @Override - public AivenS3SourceRecord next() { + public S3SourceRecord next() { return internalIterator.next(); } }; @@ -197,7 +196,7 @@ public boolean hasNext() { } @Override - public AivenS3SourceRecord next() { + public S3SourceRecord next() { if (!recordIterator.hasNext()) { nextS3Object(); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java 
b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index ef5dd3eec..c839a1269 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -40,7 +40,7 @@ import io.aiven.kafka.connect.s3.source.input.InputFormat; import io.aiven.kafka.connect.s3.source.input.Transformer; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; -import io.aiven.kafka.connect.s3.source.utils.AivenS3SourceRecord; +import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; import com.amazonaws.auth.AWSStaticCredentialsProvider; @@ -154,8 +154,8 @@ void testPoll() throws Exception { setPrivateField(s3SourceTask, "sourceRecordIterator", mockSourceRecordIterator); when(mockSourceRecordIterator.hasNext()).thenReturn(true).thenReturn(true).thenReturn(false); - final AivenS3SourceRecord aivenS3SourceRecordList = getAivenS3SourceRecord(); - when(mockSourceRecordIterator.next()).thenReturn(aivenS3SourceRecordList); + final S3SourceRecord s3SourceRecordList = getAivenS3SourceRecord(); + when(mockSourceRecordIterator.next()).thenReturn(s3SourceRecordList); final List<SourceRecord> sourceRecordList = s3SourceTask.poll(); assertThat(sourceRecordList).isNotEmpty(); @@ -172,8 +172,8 @@ void testStop() { assertThat(s3SourceTask.getConnectorStopped()).isTrue(); } - private static AivenS3SourceRecord getAivenS3SourceRecord() { - return new AivenS3SourceRecord(new HashMap<>(), new HashMap<>(), "testtopic", 0, new byte[0], new byte[0], ""); + private static S3SourceRecord getAivenS3SourceRecord() { + return new S3SourceRecord(new HashMap<>(), new HashMap<>(), "testtopic", 0, new byte[0], new byte[0], ""); } @SuppressWarnings("PMD.AvoidAccessibilityAlteration") diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java index 9e1b65ec4..cc7d765c6 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -66,7 +66,7 @@ class RecordProcessorTest { private FileReader fileReader; private AtomicBoolean connectorStopped; - private Iterator<AivenS3SourceRecord> sourceRecordIterator; + private Iterator<S3SourceRecord> sourceRecordIterator; @BeforeEach void setUp() { @@ -98,7 +98,7 @@ void testProcessRecordsWithRecords() throws ConnectException { when(s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS)).thenReturn(5); when(sourceRecordIterator.hasNext()).thenReturn(true, false); // One iteration with records - final AivenS3SourceRecord mockRecord = mock(AivenS3SourceRecord.class); + final S3SourceRecord mockRecord = mock(S3SourceRecord.class); when(sourceRecordIterator.next()).thenReturn(mockRecord); final List<SourceRecord> results = new ArrayList<>(); @@ -138,7 +138,7 @@ void testProcessRecordsConnectorStopped() { @Test void testCreateSourceRecords() { - final AivenS3SourceRecord mockRecord = mock(AivenS3SourceRecord.class); + final S3SourceRecord mockRecord = mock(S3SourceRecord.class); when(mockRecord.getTopic()).thenReturn("test-topic"); when(mockRecord.key()).thenReturn("mock-key".getBytes(StandardCharsets.UTF_8)); 
when(mockRecord.value()).thenReturn("mock-value".getBytes(StandardCharsets.UTF_8)); From f7b097cf1f4bfed0df9d641a634ce453155800b1 Mon Sep 17 00:00:00 2001 From: Anatolii Popov <anatolii.popov@aiven.io> Date: Thu, 7 Nov 2024 21:05:42 +0200 Subject: [PATCH 59/90] tests: migration to EmbeddedConnectCluster for integration tests --- s3-source-connector/build.gradle.kts | 11 +- .../connect/s3/source/ConnectRunner.java | 123 +++++++----------- .../connect/s3/source/IntegrationBase.java | 52 ++++---- .../connect/s3/source/IntegrationTest.java | 71 +++++----- .../s3/source/SchemaRegistryContainer.java | 19 +-- 5 files changed, 124 insertions(+), 152 deletions(-) diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts index d5517b4cc..ad2c69d2a 100644 --- a/s3-source-connector/build.gradle.kts +++ b/s3-source-connector/build.gradle.kts @@ -22,6 +22,7 @@ val amazonS3Version by extra("1.12.729") val amazonSTSVersion by extra("1.12.729") val s3mockVersion by extra("0.2.6") val parquetVersion by extra("1.14.3") +val testKafkaVersion by extra("3.7.1") val integrationTest: SourceSet = sourceSets.create("integrationTest") { @@ -148,7 +149,9 @@ dependencies { testRuntimeOnly(logginglibs.logback.classic) - integrationTestImplementation(testinglibs.localstack) + integrationTestImplementation(testinglibs.localstack) { + exclude(group = "io.netty", module = "netty-transport-native-epoll") + } integrationTestImplementation(testcontainers.junit.jupiter) integrationTestImplementation(testcontainers.kafka) // this is not Kafka version integrationTestImplementation(testcontainers.localstack) @@ -197,6 +200,12 @@ dependencies { exclude(group = "io.netty", module = "netty") } + integrationTestImplementation("org.apache.kafka:connect-runtime:${testKafkaVersion}:test") + integrationTestImplementation("org.apache.kafka:connect-runtime:${testKafkaVersion}") + integrationTestImplementation("org.apache.kafka:kafka-clients:${testKafkaVersion}:test") + integrationTestImplementation("org.apache.kafka:kafka_2.13:${testKafkaVersion}:test") + integrationTestImplementation("org.apache.kafka:kafka_2.13:${testKafkaVersion}") + // Make test utils from 'test' available in 'integration-test' integrationTestImplementation(sourceSets["test"].output) integrationTestImplementation(testinglibs.awaitility) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java index fbe1ad97a..d746405da 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java @@ -16,26 +16,11 @@ package io.aiven.kafka.connect.s3.source; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; import java.util.HashMap; import java.util.Map; -import java.util.concurrent.ExecutionException; - -import org.apache.kafka.common.utils.Time; -import org.apache.kafka.connect.runtime.Connect; -import org.apache.kafka.connect.runtime.ConnectorConfig; -import org.apache.kafka.connect.runtime.Herder; -import org.apache.kafka.connect.runtime.Worker; -import org.apache.kafka.connect.runtime.isolation.Plugins; -import org.apache.kafka.connect.runtime.rest.RestServer; -import org.apache.kafka.connect.runtime.rest.entities.ConnectorInfo; -import 
org.apache.kafka.connect.runtime.standalone.StandaloneConfig; -import org.apache.kafka.connect.runtime.standalone.StandaloneHerder; -import org.apache.kafka.connect.storage.MemoryOffsetBackingStore; -import org.apache.kafka.connect.util.FutureCallback; +import java.util.Properties; + +import org.apache.kafka.connect.util.clusters.EmbeddedConnectCluster; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -43,25 +28,55 @@ final class ConnectRunner { private static final Logger LOGGER = LoggerFactory.getLogger(ConnectRunner.class); - private final File pluginDir; - private final String bootstrapServers; - private final int offsetFlushInterval; + private EmbeddedConnectCluster connectCluster; + + private final int offsetFlushIntervalMs; + + public ConnectRunner(final int offsetFlushIntervalMs) { + this.offsetFlushIntervalMs = offsetFlushIntervalMs; + } + + void startConnectCluster(final String connectorName, final int localPort, final int containerPort) { + + final Properties brokerProperties = new Properties(); + brokerProperties.put("advertised.listeners", "PLAINTEXT://localhost:" + localPort + + ",TESTCONTAINERS://host.testcontainers.internal:" + containerPort); + brokerProperties.put("listeners", + "PLAINTEXT://localhost:" + localPort + ",TESTCONTAINERS://localhost:" + containerPort); + brokerProperties.put("listener.security.protocol.map", "PLAINTEXT:PLAINTEXT,TESTCONTAINERS:PLAINTEXT"); + + connectCluster = new EmbeddedConnectCluster.Builder().name(connectorName) + .brokerProps(brokerProperties) + .workerProps(getWorkerProperties()) + .build(); + connectCluster.start(); + LOGGER.info("connectCluster started"); + } + + String getBootstrapServers() { + return connectCluster.kafka().bootstrapServers(); + } - private Herder herder; - private Connect connect; + void deleteConnector(final String connectorName) { + connectCluster.deleteConnector(connectorName); + } + + void stopConnectCluster() { + // stop all Connect, Kafka and Zk threads. + if (connectCluster != null) { + connectCluster.stop(); + } + LOGGER.info("connectCluster stopped"); + } - public ConnectRunner(final File pluginDir, final String bootstrapServers, final int offsetFlushIntervalMs) { - this.pluginDir = pluginDir; - this.bootstrapServers = bootstrapServers; - this.offsetFlushInterval = offsetFlushIntervalMs; + String configureConnector(final String connName, final Map<String, String> connConfig) { + return connectCluster.configureConnector(connName, connConfig); } - void start() throws IOException { + private Map<String, String> getWorkerProperties() { final Map<String, String> workerProps = new HashMap<>(); - final File tempFile = File.createTempFile("connect", "offsets"); - workerProps.put("bootstrap.servers", bootstrapServers); - workerProps.put("offset.flush.interval.ms", Integer.toString(offsetFlushInterval)); + workerProps.put("offset.flush.interval.ms", Integer.toString(offsetFlushIntervalMs)); // These don't matter much (each connector sets its own converters), but need to be filled with valid classes. 
workerProps.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); @@ -71,50 +86,8 @@ void start() throws IOException { workerProps.put("internal.value.converter", "org.apache.kafka.connect.json.JsonConverter"); workerProps.put("internal.value.converter.schemas.enable", "true"); - workerProps.put("offset.storage.file.filename", tempFile.getCanonicalPath()); - - workerProps.put("plugin.path", pluginDir.getPath()); - - final Time time = Time.SYSTEM; - final String workerId = "test-worker"; - final String kafkaClusterId = "test-cluster"; - - final Plugins plugins = new Plugins(workerProps); - final StandaloneConfig config = new StandaloneConfig(workerProps); - - final Worker worker = new Worker(workerId, time, plugins, config, new MemoryOffsetBackingStore()); - herder = new StandaloneHerder(worker, kafkaClusterId); - - final RestServer rest = new RestServer(config); - - connect = new Connect(herder, rest); - - connect.start(); - } - - void createConnector(final Map<String, String> config) throws ExecutionException, InterruptedException { - assert herder != null; - - final FutureCallback<Herder.Created<ConnectorInfo>> callback = new FutureCallback<>((error, info) -> { - if (error != null) { - LOGGER.error("Failed to create job"); - } else { - LOGGER.info("Created connector {}", info.result().name()); - } - }); - herder.putConnectorConfig(config.get(ConnectorConfig.NAME_CONFIG), config, false, callback); - - final Herder.Created<ConnectorInfo> connectorInfoCreated = callback.get(); - assert connectorInfoCreated.created(); - assertThat(connectorInfoCreated.result().config().get("connector.class")) - .isEqualTo(AivenKafkaConnectS3SourceConnector.class.getName()); - } - - void stop() { - connect.stop(); - } + workerProps.put("plugin.discovery", "hybrid_warn"); - void awaitStop() { - connect.awaitStop(); + return workerProps; } } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index 67253ec29..e06bd3a8a 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -16,8 +16,11 @@ package io.aiven.kafka.connect.s3.source; +import static org.awaitility.Awaitility.await; + import java.io.File; import java.io.IOException; +import java.net.ServerSocket; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.time.Duration; @@ -45,34 +48,24 @@ import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3ClientBuilder; import com.fasterxml.jackson.databind.JsonNode; -import com.github.dockerjava.api.model.Ulimit; import io.confluent.kafka.serializers.KafkaAvroDeserializer; import org.apache.avro.generic.GenericRecord; -import org.awaitility.Awaitility; import org.junit.jupiter.api.TestInfo; import org.testcontainers.containers.Container; -import org.testcontainers.containers.KafkaContainer; -import org.testcontainers.containers.Network; import org.testcontainers.containers.localstack.LocalStackContainer; import org.testcontainers.utility.DockerImageName; public interface IntegrationBase { - String DOCKER_IMAGE_KAFKA = "confluentinc/cp-kafka:7.7.0"; String PLUGINS_S_3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA = "plugins/s3-source-connector-for-apache-kafka/"; String S_3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA_TEST = 
"s3-source-connector-for-apache-kafka-test-"; - default AdminClient newAdminClient(final KafkaContainer kafka) { + default AdminClient newAdminClient(final String bootstrapServers) { final Properties adminClientConfig = new Properties(); - adminClientConfig.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, kafka.getBootstrapServers()); + adminClientConfig.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); return AdminClient.create(adminClientConfig); } - default ConnectRunner newConnectRunner(final KafkaContainer kafka, final File pluginDir, - final int offsetFlushIntervalMs) { - return new ConnectRunner(pluginDir, kafka.getBootstrapServers(), offsetFlushIntervalMs); - } - static void extractConnectorPlugin(File pluginDir) throws IOException, InterruptedException { final File distFile = new File(System.getProperty("integration-test.distribution.file.path")); assert distFile.exists(); @@ -90,15 +83,6 @@ static File getPluginDir() throws IOException { return pluginDir; } - static KafkaContainer createKafkaContainer() { - return new KafkaContainer(DockerImageName.parse(DOCKER_IMAGE_KAFKA)) - .withEnv("KAFKA_AUTO_CREATE_TOPICS_ENABLE", "false") - .withNetwork(Network.newNetwork()) - .withExposedPorts(KafkaContainer.KAFKA_PORT, 9092) - .withCreateContainerCmdModifier( - cmd -> cmd.getHostConfig().withUlimits(List.of(new Ulimit("nofile", 30_000L, 30_000L)))); - } - static String topicName(final TestInfo testInfo) { return testInfo.getTestMethod().get().getName(); } @@ -109,8 +93,8 @@ static void createTopics(final AdminClient adminClient, final List<String> topic adminClient.createTopics(newTopics).all().get(); } - static void waitForRunningContainer(final Container<?> kafka) { - Awaitility.await().atMost(Duration.ofMinutes(1)).until(kafka::isRunning); + static void waitForRunningContainer(final Container<?> container) { + await().atMost(Duration.ofMinutes(1)).until(container::isRunning); } static AmazonS3 createS3Client(final LocalStackContainer localStackContainer) { @@ -128,10 +112,18 @@ static LocalStackContainer createS3Container() { .withServices(LocalStackContainer.Service.S3); } + static int getRandomPort() throws IOException { + try (ServerSocket socket = new ServerSocket(0)) { + return socket.getLocalPort(); + } catch (IOException e) { + throw new IOException("Failed to allocate port for test", e); + } + } + static List<String> consumeMessages(final String topic, final int expectedMessageCount, - final KafkaContainer kafka) { + final String bootstrapServers) { final Properties props = new Properties(); - props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, kafka.getBootstrapServers()); + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group"); props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); @@ -154,9 +146,9 @@ static List<String> consumeMessages(final String topic, final int expectedMessag } static List<GenericRecord> consumeAvroMessages(final String topic, final int expectedMessageCount, - final KafkaContainer kafka, final String schemaRegistryUrl) { + final String bootstrapServers, final String schemaRegistryUrl) { final Properties props = new Properties(); - props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, kafka.getBootstrapServers()); + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); props.put(ConsumerConfig.GROUP_ID_CONFIG, 
"test-consumer-group-avro"); props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); // Assuming string // key @@ -184,10 +176,10 @@ static List<GenericRecord> consumeAvroMessages(final String topic, final int exp } static List<JsonNode> consumeJsonMessages(final String topic, final int expectedMessageCount, - final KafkaContainer kafka) { + final String bootstrapServers) { final Properties props = new Properties(); - props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, kafka.getBootstrapServers()); - props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group-avro"); + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); + props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group-json"); props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); // Assuming string // key props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, JsonDeserializer.class.getName()); // Json diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 461e991ad..5ad484395 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -66,17 +66,14 @@ import org.junit.jupiter.api.TestInfo; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.testcontainers.containers.KafkaContainer; import org.testcontainers.containers.localstack.LocalStackContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; @Testcontainers -@SuppressWarnings("PMD.ExcessiveImports") final class IntegrationTest implements IntegrationBase { private static final Logger LOGGER = LoggerFactory.getLogger(IntegrationTest.class); - private static final String S3_FILE_NAME = "testtopic-0-0001.txt"; private static final String CONNECTOR_NAME = "aiven-s3-source-connector"; private static final String COMMON_PREFIX = "s3-source-connector-for-apache-kafka-test-"; private static final int OFFSET_FLUSH_INTERVAL_MS = 500; @@ -94,17 +91,15 @@ final class IntegrationTest implements IntegrationBase { @Container public static final LocalStackContainer LOCALSTACK = IntegrationBase.createS3Container(); + private SchemaRegistryContainer schemaRegistry; - @Container - private static final KafkaContainer KAFKA_CONTAINER = IntegrationBase.createKafkaContainer(); - - @Container - private static final SchemaRegistryContainer SCHEMA_REGISTRY = new SchemaRegistryContainer(KAFKA_CONTAINER); private AdminClient adminClient; private ConnectRunner connectRunner; private static AmazonS3 s3Client; + private String topicName; + @BeforeAll static void setUpAll() throws IOException, InterruptedException { s3Prefix = COMMON_PREFIX + ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + "/"; @@ -115,29 +110,37 @@ static void setUpAll() throws IOException, InterruptedException { pluginDir = IntegrationBase.getPluginDir(); IntegrationBase.extractConnectorPlugin(pluginDir); - IntegrationBase.waitForRunningContainer(KAFKA_CONTAINER); } @BeforeEach - void setUp(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { + void setUp(final TestInfo testInfo) throws Exception { testBucketAccessor.createBucket(); - adminClient = newAdminClient(KAFKA_CONTAINER); + 
connectRunner = new ConnectRunner(OFFSET_FLUSH_INTERVAL_MS); + final int localListenerPort = IntegrationBase.getRandomPort(); + final int containerListenerPort = IntegrationBase.getRandomPort(); + connectRunner.startConnectCluster(CONNECTOR_NAME, localListenerPort, containerListenerPort); + + adminClient = newAdminClient(connectRunner.getBootstrapServers()); final String topicName = IntegrationBase.topicName(testInfo); final var topics = List.of(topicName); IntegrationBase.createTopics(adminClient, topics); - connectRunner = newConnectRunner(KAFKA_CONTAINER, pluginDir, OFFSET_FLUSH_INTERVAL_MS); - connectRunner.start(); + // This should be done after the process listening the port is already started by host but + // before the container that will access it is started. + org.testcontainers.Testcontainers.exposeHostPorts(containerListenerPort); + schemaRegistry = new SchemaRegistryContainer("host.testcontainers.internal:" + containerListenerPort); + schemaRegistry.start(); + IntegrationBase.waitForRunningContainer(schemaRegistry); } @AfterEach void tearDown() { - testBucketAccessor.removeBucket(); - connectRunner.stop(); adminClient.close(); - - connectRunner.awaitStop(); + connectRunner.deleteConnector(CONNECTOR_NAME); + connectRunner.stopConnectCluster(); + schemaRegistry.stop(); + testBucketAccessor.removeBucket(); } @Test @@ -146,7 +149,7 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); - connectRunner.createConnector(connectorConfig); + connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; final String testData2 = "Hello, Kafka Connect S3 Source! object 2"; @@ -165,7 +168,7 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); // Poll messages from the Kafka topic and verify the consumed data - final List<String> records = IntegrationBase.consumeMessages(topicName, 4, KAFKA_CONTAINER); + final List<String> records = IntegrationBase.consumeMessages(topicName, 4, connectRunner.getBootstrapServers()); // Verify that the correct data is read from the S3 bucket and pushed to Kafka assertThat(records).contains(testData1).contains(testData2); @@ -183,12 +186,12 @@ void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) // in source record, we expect 5 records. 
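
The rewritten setUp() above starts the embedded Connect cluster on ports chosen by getRandomPort() and then calls org.testcontainers.Testcontainers.exposeHostPorts(...) before starting the Schema Registry container, so that a process inside the container can reach a listener running in the test JVM via the special hostname host.testcontainers.internal. A minimal, self-contained sketch of that pattern follows; the alpine image, the nc probe and the port value are illustrative assumptions, not part of this patch.

```java
import org.testcontainers.Testcontainers;
import org.testcontainers.containers.GenericContainer;

public final class HostPortExposureSketch {
    public static void main(final String[] args) {
        // Assumption: some service (standing in for the embedded Kafka broker of the test)
        // is already listening on this port of the host/test JVM.
        final int hostPort = 9092;

        // Must be called after the host service is listening and before the container starts.
        Testcontainers.exposeHostPorts(hostPort);

        // Inside the container the host is reachable under host.testcontainers.internal.
        try (GenericContainer<?> probe = new GenericContainer<>("alpine:3.19")
                .withAccessToHost(true)
                .withCommand("sh", "-c",
                        "nc -z host.testcontainers.internal " + hostPort + " && sleep 30")) {
            probe.start();
        }
    }
}
```

The ordering matters: the port must already be bound on the host when exposeHostPorts runs, and the container must be started afterwards, which is exactly the sequence the setUp() hunk enforces.
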
connectorConfig.put(MAX_POLL_RECORDS, "2"); // In 3 polls all the 5 records should be processed - connectRunner.createConnector(connectorConfig); + connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); writeToS3(topicName, testData.getBytes(StandardCharsets.UTF_8), "00000"); // Poll messages from the Kafka topic and verify the consumed data - final List<String> records = IntegrationBase.consumeMessages(topicName, 5, KAFKA_CONTAINER); + final List<String> records = IntegrationBase.consumeMessages(topicName, 5, connectRunner.getBootstrapServers()); // Verify that the correct data is read from the S3 bucket and pushed to Kafka assertThat(records.size()).isEqualTo(5); @@ -204,12 +207,12 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.AVRO.getValue()); - connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); + connectorConfig.put(SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); - connectorConfig.put(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); + connectorConfig.put(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); connectorConfig.put(VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); - connectRunner.createConnector(connectorConfig); + connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); // Define Avro schema final String schemaJson = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" @@ -235,8 +238,9 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); // Poll Avro messages from the Kafka topic and deserialize them - final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 500, KAFKA_CONTAINER, - SCHEMA_REGISTRY.getSchemaRegistryUrl()); // Ensure this method deserializes Avro + final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 500, + connectRunner.getBootstrapServers(), schemaRegistry.getSchemaRegistryUrl()); // Ensure this method + // deserializes Avro // Verify that the correct data is read from the S3 bucket and pushed to Kafka assertThat(records).extracting(record -> record.get("message").toString()) @@ -250,16 +254,16 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.PARQUET.getValue()); - connectorConfig.put(SCHEMA_REGISTRY_URL, SCHEMA_REGISTRY.getSchemaRegistryUrl()); + connectorConfig.put(SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); - connectorConfig.put("value.converter.schema.registry.url", SCHEMA_REGISTRY.getSchemaRegistryUrl()); + connectorConfig.put("value.converter.schema.registry.url", schemaRegistry.getSchemaRegistryUrl()); connectorConfig.put(VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); final String partition = "00000"; final String fileName = topicName + "-" + 
partition + "-" + System.currentTimeMillis() + ".txt"; final String name = "testuser"; - connectRunner.createConnector(connectorConfig); + connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); final Path path = ContentUtils.getTmpFilePath(name); try { @@ -270,8 +274,8 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted Files.delete(path); } - final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 100, KAFKA_CONTAINER, - SCHEMA_REGISTRY.getSchemaRegistryUrl()); + final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 100, + connectRunner.getBootstrapServers(), schemaRegistry.getSchemaRegistryUrl()); assertThat(records).extracting(record -> record.get("name").toString()) .contains(name + "1") .contains(name + "2"); @@ -284,7 +288,7 @@ void jsonTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.JSONL.getValue()); connectorConfig.put("value.converter", "org.apache.kafka.connect.json.JsonConverter"); - connectRunner.createConnector(connectorConfig); + connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); final String testMessage = "This is a test "; final StringBuilder jsonBuilder = new StringBuilder(); for (int i = 0; i < 500; i++) { @@ -296,7 +300,8 @@ void jsonTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc writeToS3(topicName, jsonBytes, "00001"); // Poll Json messages from the Kafka topic and deserialize them - final List<JsonNode> records = IntegrationBase.consumeJsonMessages(topicName, 500, KAFKA_CONTAINER); + final List<JsonNode> records = IntegrationBase.consumeJsonMessages(topicName, 500, + connectRunner.getBootstrapServers()); assertThat(records).extracting(record -> record.get("payload").get("message").asText()).contains(testMessage); assertThat(records).extracting(record -> record.get("payload").get("id").asText()).contains("1"); diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java index e53f0a88b..0755cb8d1 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java @@ -20,26 +20,19 @@ import com.github.dockerjava.api.model.Ulimit; import org.testcontainers.containers.GenericContainer; -import org.testcontainers.containers.KafkaContainer; -import org.testcontainers.utility.Base58; public final class SchemaRegistryContainer extends GenericContainer<SchemaRegistryContainer> { public static final int SCHEMA_REGISTRY_PORT = 8081; - public SchemaRegistryContainer(final KafkaContainer kafka) { - this("5.0.4", kafka); + public SchemaRegistryContainer(final String bootstrapServer) { + this("5.0.4", bootstrapServer); } - public SchemaRegistryContainer(final String confluentPlatformVersion, final KafkaContainer kafka) { + public SchemaRegistryContainer(final String confluentPlatformVersion, final String bootstrapServer) { super("confluentinc/cp-schema-registry:" + confluentPlatformVersion); - - dependsOn(kafka); - withNetwork(kafka.getNetwork()); - withNetworkAliases("schema-registry-" + Base58.randomString(6)); - - withEnv("SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS", - String.format("PLAINTEXT://%s:%s", 
kafka.getNetworkAliases().get(0), 9092)); - + withAccessToHost(true); + withEnv("SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS", "PLAINTEXT://" + bootstrapServer); + withEnv("SCHEMA_REGISTRY_LOG4J_LOGLEVEL=DEBUG", "DEBUG"); withExposedPorts(SCHEMA_REGISTRY_PORT); withEnv("SCHEMA_REGISTRY_HOST_NAME", "localhost"); From 68dee0e1850244e765f980b41dfcef659c2af705 Mon Sep 17 00:00:00 2001 From: Anatolii Popov <anatolii.popov@aiven.io> Date: Fri, 8 Nov 2024 14:41:53 +0200 Subject: [PATCH 60/90] FileReader improvements --- .../connect/s3/source/utils/FileReader.java | 76 +++++-------------- 1 file changed, 19 insertions(+), 57 deletions(-) diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index f5c9ee864..561f7df21 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -18,25 +18,20 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; -import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; -import java.util.List; +import java.util.Objects; import java.util.Set; -import java.util.stream.Collectors; +import java.util.stream.Stream; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.ListObjectsV2Request; -import com.amazonaws.services.s3.model.ListObjectsV2Result; import com.amazonaws.services.s3.model.S3ObjectSummary; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class FileReader { - private static final Logger LOGGER = LoggerFactory.getLogger(FileReader.class); public static final int PAGE_SIZE_FACTOR = 2; private final S3SourceConfig s3SourceConfig; private final String bucketName; @@ -50,60 +45,27 @@ public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName, this.failedObjectKeys = new HashSet<>(failedObjectKeys); } - @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") Iterator<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) { - return new Iterator<>() { - private String continuationToken = null; // NOPMD - private List<S3ObjectSummary> currentBatch = new ArrayList<>(); - private int currentIndex = 0; // NOPMD - private boolean isTruncated = true; - - @Override - public boolean hasNext() { - // If there are unprocessed objects in the current batch, we return true - if (currentIndex < currentBatch.size()) { - return true; - } - - if (isTruncated) { - fetchNextBatch(); - return !currentBatch.isEmpty(); - } - - return false; - } - - @Override - public S3ObjectSummary next() { - if (!hasNext()) { - return null; - } - - return currentBatch.get(currentIndex++); - } - - private void fetchNextBatch() { - currentBatch.clear(); - currentIndex = 0; - - final ListObjectsV2Request request = new ListObjectsV2Request().withBucketName(bucketName) - .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR) - .withContinuationToken(continuationToken); - - final ListObjectsV2Result objectListing = s3Client.listObjectsV2(request); - currentBatch = objectListing.getObjectSummaries() + final ListObjectsV2Request request = new ListObjectsV2Request().withBucketName(bucketName) + .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR); + + final Stream<S3ObjectSummary> s3ObjectStream = Stream + 
.iterate(s3Client.listObjectsV2(request), Objects::nonNull, response -> { + if (response.isTruncated()) { + return s3Client.listObjectsV2(new ListObjectsV2Request().withBucketName(bucketName) + .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR) + .withContinuationToken(response.getNextContinuationToken())); + } else { + return null; + } + }) + .flatMap(response -> response.getObjectSummaries() .stream() .filter(objectSummary -> objectSummary.getSize() > 0) - .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey())) - .collect(Collectors.toList()); - - continuationToken = objectListing.getNextContinuationToken(); - isTruncated = objectListing.isTruncated(); - - currentBatch.forEach(objSummary -> LOGGER.debug("Objects to be processed {} ", objSummary.getKey())); - } - }; + .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey()))); + return s3ObjectStream.iterator(); } + public void addFailedObjectKeys(final String objectKey) { this.failedObjectKeys.add(objectKey); } From 274fc34534b600e55c10f6ecb8bbaf84e2c776c2 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Fri, 8 Nov 2024 15:12:10 +0100 Subject: [PATCH 61/90] Updating with tasks test --- .../connect/s3/source/IntegrationBase.java | 47 ++++++++++-- .../connect/s3/source/IntegrationTest.java | 72 ++++++++++++++----- 2 files changed, 97 insertions(+), 22 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index e06bd3a8a..f2c0f606c 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -20,13 +20,16 @@ import java.io.File; import java.io.IOException; +import java.net.ConnectException; import java.net.ServerSocket; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.time.Duration; import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Properties; import java.util.concurrent.ExecutionException; import java.util.stream.Collectors; @@ -47,7 +50,10 @@ import com.amazonaws.client.builder.AwsClientBuilder; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3ClientBuilder; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import io.confluent.kafka.serializers.KafkaAvroDeserializer; import org.apache.avro.generic.GenericRecord; import org.junit.jupiter.api.TestInfo; @@ -56,9 +62,9 @@ import org.testcontainers.utility.DockerImageName; public interface IntegrationBase { - - String PLUGINS_S_3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA = "plugins/s3-source-connector-for-apache-kafka/"; - String S_3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA_TEST = "s3-source-connector-for-apache-kafka-test-"; + String PLUGINS_S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA = "plugins/s3-source-connector-for-apache-kafka/"; + String S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA_TEST = "s3-source-connector-for-apache-kafka-test-"; + ObjectMapper OBJECT_MAPPER = new ObjectMapper(); default AdminClient newAdminClient(final String bootstrapServers) { final Properties 
adminClientConfig = new Properties(); @@ -76,9 +82,9 @@ static void extractConnectorPlugin(File pluginDir) throws IOException, Interrupt } static File getPluginDir() throws IOException { - final File testDir = Files.createTempDirectory(S_3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA_TEST).toFile(); + final File testDir = Files.createTempDirectory(S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA_TEST).toFile(); - final File pluginDir = new File(testDir, PLUGINS_S_3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA); + final File pluginDir = new File(testDir, PLUGINS_S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA); assert pluginDir.mkdirs(); return pluginDir; } @@ -202,4 +208,35 @@ static List<JsonNode> consumeJsonMessages(final String topic, final int expected return recordsList; } } + + static Map<String, Object> consumeOffsetStorageMessages(final String topic, final int expectedMessageCount, + final String bootstrapServer) throws ConnectException { + final Properties props = new Properties(); + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServer); + props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group"); + props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); + props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + + final Map<String, Object> messages = new HashMap<>(); + Map<String, Object> offsetRec; + try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) { + consumer.subscribe(Collections.singletonList(topic)); + + // Poll messages from the topic + while (messages.size() < expectedMessageCount) { + final ConsumerRecords<byte[], byte[]> records = consumer.poll(5L); + for (final ConsumerRecord<byte[], byte[]> record : records) { + offsetRec = OBJECT_MAPPER.readValue(new String(record.value(), StandardCharsets.UTF_8), // NOPMD + new TypeReference<>() { // NOPMD + }); + messages.putAll(offsetRec); + } + } + + } catch (JsonProcessingException e) { + throw new ConnectException("Error while consuming messages " + e.getMessage()); + } + return messages; + } } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 5ad484395..c7f9fc52c 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -16,6 +16,7 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_ACCESS_KEY_ID_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_ENDPOINT_CONFIG; @@ -29,19 +30,23 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_CONVERTER_SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; +import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; import static org.assertj.core.api.Assertions.assertThat; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; 
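
The "FileReader improvements" patch earlier in this series replaces the hand-rolled batching iterator with a Stream.iterate pipeline that follows ListObjectsV2 continuation tokens and flattens the pages into a single iterator of object summaries. A stripped-down sketch of that pagination idiom against the v1 AWS SDK is shown below; the class and parameter names are illustrative, not taken from the connector.

```java
import java.util.Iterator;
import java.util.Objects;
import java.util.stream.Stream;

import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.ListObjectsV2Request;
import com.amazonaws.services.s3.model.S3ObjectSummary;

public final class S3PaginationSketch {

    /** Lazily walks every non-empty object in a bucket, following continuation tokens page by page. */
    public static Iterator<S3ObjectSummary> listAllObjects(final AmazonS3 s3, final String bucket,
            final int pageSize) {
        final ListObjectsV2Request firstPage = new ListObjectsV2Request().withBucketName(bucket)
                .withMaxKeys(pageSize);
        return Stream
                // Seed with the first page; stop when the generator returns null.
                .iterate(s3.listObjectsV2(firstPage), Objects::nonNull,
                        response -> response.isTruncated()
                                ? s3.listObjectsV2(new ListObjectsV2Request().withBucketName(bucket)
                                        .withMaxKeys(pageSize)
                                        .withContinuationToken(response.getNextContinuationToken()))
                                : null)
                // Flatten the pages; because the stream is lazy, each page is fetched only when reached.
                .flatMap(response -> response.getObjectSummaries().stream())
                .filter(summary -> summary.getSize() > 0)
                .iterator();
    }
}
```

The three-argument Stream.iterate(seed, hasNext, next) overload requires Java 9+, which is fine given the connector's Java 11 baseline.
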
+import java.net.ConnectException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.ExecutionException; import org.apache.kafka.clients.admin.AdminClient; @@ -71,6 +76,7 @@ import org.testcontainers.junit.jupiter.Testcontainers; @Testcontainers +@SuppressWarnings("PMD.ExcessiveImports") final class IntegrationTest implements IntegrationBase { private static final Logger LOGGER = LoggerFactory.getLogger(IntegrationTest.class); @@ -154,12 +160,14 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; final String testData2 = "Hello, Kafka Connect S3 Source! object 2"; + final Set<String> offsetKeys = new HashSet<>(); + // write 2 objects to s3 - writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000"); - writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000"); - writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00001"); - writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00001"); - writeToS3(topicName, new byte[0], "00003"); // this should be ignored. + writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000", offsetKeys); + writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000", offsetKeys); + writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00001", offsetKeys); + writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00001", offsetKeys); + writeToS3(topicName, new byte[0], "00003", offsetKeys); // this should be ignored. 
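
The verifyOffsetPositions check introduced in this patch reads the Connect worker's offset storage topic directly and parses each record value as JSON to confirm which S3 object keys the connector has committed offsets for. A rough, self-contained sketch of that kind of reader is below; the consumer group id, the topic name argument and the assumption that every value deserializes to a flat JSON object mirror this test code rather than a documented Connect contract.

```java
import java.io.IOException;
import java.time.Duration;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;

public final class OffsetTopicReaderSketch {

    /** Reads JSON offset values from a Connect offset storage topic until 'expected' distinct keys are seen. */
    public static Map<String, Object> readOffsets(final String bootstrapServers, final String offsetTopic,
            final int expected) throws IOException {
        final Properties props = new Properties();
        props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
        props.put(ConsumerConfig.GROUP_ID_CONFIG, "offset-topic-inspector");
        props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName());
        props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName());
        props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");

        final ObjectMapper mapper = new ObjectMapper();
        final Map<String, Object> offsets = new HashMap<>();
        try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) {
            consumer.subscribe(Collections.singletonList(offsetTopic));
            // No timeout handling, for brevity; the test loops the same way until enough keys arrive.
            while (offsets.size() < expected) {
                final ConsumerRecords<byte[], byte[]> records = consumer.poll(Duration.ofMillis(500));
                for (final ConsumerRecord<byte[], byte[]> record : records) {
                    // Each value is the connector's offset map serialized as JSON by the worker.
                    offsets.putAll(mapper.readValue(record.value(),
                            new TypeReference<Map<String, Object>>() { }));
                }
            }
        }
        return offsets;
    }
}
```
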
final List<String> objects = testBucketAccessor.listObjects(); assertThat(objects.size()).isEqualTo(5); @@ -172,6 +180,9 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx // Verify that the correct data is read from the S3 bucket and pushed to Kafka assertThat(records).contains(testData1).contains(testData2); + + // Verify offset positions + verifyOffsetPositions(offsetKeys, 4); } @Test @@ -188,7 +199,9 @@ void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); - writeToS3(topicName, testData.getBytes(StandardCharsets.UTF_8), "00000"); + final Set<String> offsetKeys = new HashSet<>(); + + writeToS3(topicName, testData.getBytes(StandardCharsets.UTF_8), "00000", offsetKeys); // Poll messages from the Kafka topic and verify the consumed data final List<String> records = IntegrationBase.consumeMessages(topicName, 5, connectRunner.getBootstrapServers()); @@ -200,6 +213,9 @@ void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) assertThat(records.get(2)).isEqualTo("CC"); assertThat(records.get(3)).isEqualTo("DD"); assertThat(records.get(4)).isEqualTo("EE"); + + // Verify offset positions + verifyOffsetPositions(offsetKeys, 1); } @Test @@ -224,12 +240,14 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc final ByteArrayOutputStream outputStream1 = getAvroRecord(schema, 1, 100); final ByteArrayOutputStream outputStream2 = getAvroRecord(schema, 2, 100); - writeToS3(topicName, outputStream1.toByteArray(), "00001"); - writeToS3(topicName, outputStream2.toByteArray(), "00001"); + final Set<String> offsetKeys = new HashSet<>(); - writeToS3(topicName, outputStream1.toByteArray(), "00002"); - writeToS3(topicName, outputStream2.toByteArray(), "00002"); - writeToS3(topicName, outputStream2.toByteArray(), "00002"); + writeToS3(topicName, outputStream1.toByteArray(), "00001", offsetKeys); + writeToS3(topicName, outputStream2.toByteArray(), "00001", offsetKeys); + + writeToS3(topicName, outputStream1.toByteArray(), "00002", offsetKeys); + writeToS3(topicName, outputStream2.toByteArray(), "00002", offsetKeys); + writeToS3(topicName, outputStream2.toByteArray(), "00002", offsetKeys); final List<String> objects = testBucketAccessor.listObjects(); assertThat(objects.size()).isEqualTo(5); @@ -247,10 +265,13 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc .contains("Hello, Kafka Connect S3 Source! object 1") .contains("Hello, Kafka Connect S3 Source! 
object 2"); assertThat(records).extracting(record -> record.get("id").toString()).contains("1").contains("2"); + + // Verify offset positions + verifyOffsetPositions(offsetKeys, 5); } @Test - void parquetTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { + void parquetTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.PARQUET.getValue()); @@ -282,7 +303,7 @@ void parquetTest(final TestInfo testInfo) throws ExecutionException, Interrupted } @Test - void jsonTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { + void jsonTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.JSONL.getValue()); @@ -296,8 +317,9 @@ void jsonTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc jsonBuilder.append(jsonContent).append("\n"); // NOPMD } final byte[] jsonBytes = jsonBuilder.toString().getBytes(StandardCharsets.UTF_8); + final Set<String> offsetKeys = new HashSet<>(); - writeToS3(topicName, jsonBytes, "00001"); + writeToS3(topicName, jsonBytes, "00001", offsetKeys); // Poll Json messages from the Kafka topic and deserialize them final List<JsonNode> records = IntegrationBase.consumeJsonMessages(topicName, 500, @@ -305,6 +327,9 @@ void jsonTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc assertThat(records).extracting(record -> record.get("payload").get("message").asText()).contains(testMessage); assertThat(records).extracting(record -> record.get("payload").get("id").asText()).contains("1"); + + // Verify offset positions + verifyOffsetPositions(offsetKeys, 1); } @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") @@ -332,15 +357,17 @@ private static ByteArrayOutputStream getAvroRecord(final Schema schema, final in return outputStream; } - private static void writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) - throws IOException { + private static void writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId, + final Set<String> offsetKeys) throws IOException { final String filePrefix = topicName + "-" + partitionId + "-" + System.currentTimeMillis(); final String fileSuffix = ".txt"; final Path testFilePath = File.createTempFile(filePrefix, fileSuffix).toPath(); + final String objectKey = filePrefix + fileSuffix; try { Files.write(testFilePath, testDataBytes); - saveToS3(TEST_BUCKET_NAME, "", filePrefix + fileSuffix, testFilePath.toFile()); + saveToS3(TEST_BUCKET_NAME, "", objectKey, testFilePath.toFile()); + offsetKeys.add(OBJECT_KEY + SEPARATOR + objectKey); } finally { Files.delete(testFilePath); } @@ -372,4 +399,15 @@ public static void saveToS3(final String bucketName, final String folderName, fi final PutObjectRequest request = new PutObjectRequest(bucketName, folderName + fileNameInS3, fileToWrite); s3Client.putObject(request); } + + private void verifyOffsetPositions(final Set<String> offsetKeys, final int messagesCount) throws ConnectException { + final Map<String, Object> offsetRecs = IntegrationBase.consumeOffsetStorageMessages( + "connect-offset-topic-" + CONNECTOR_NAME, 
messagesCount, connectRunner.getBootstrapServers()); + + assertThat(offsetRecs.size()).isEqualTo(messagesCount); + + for (final String offsetKey : offsetRecs.keySet()) { + assertThat(offsetKeys).contains(offsetKey); + } + } } From 0079f2867f7a7f45dec02e4baa36245a7c61ccf2 Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Tue, 12 Nov 2024 09:34:18 +0100 Subject: [PATCH 62/90] From review --- .../connect/s3/source/IntegrationBase.java | 6 +-- .../connect/s3/source/IntegrationTest.java | 51 ++++++++++--------- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index f2c0f606c..b81fdddfb 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -219,7 +219,6 @@ static Map<String, Object> consumeOffsetStorageMessages(final String topic, fina props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); final Map<String, Object> messages = new HashMap<>(); - Map<String, Object> offsetRec; try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) { consumer.subscribe(Collections.singletonList(topic)); @@ -227,10 +226,9 @@ static Map<String, Object> consumeOffsetStorageMessages(final String topic, fina while (messages.size() < expectedMessageCount) { final ConsumerRecords<byte[], byte[]> records = consumer.poll(5L); for (final ConsumerRecord<byte[], byte[]> record : records) { - offsetRec = OBJECT_MAPPER.readValue(new String(record.value(), StandardCharsets.UTF_8), // NOPMD + messages.putAll(OBJECT_MAPPER.readValue(new String(record.value(), StandardCharsets.UTF_8), // NOPMD new TypeReference<>() { // NOPMD - }); - messages.putAll(offsetRec); + })); } } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index c7f9fc52c..6382dfa39 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -163,11 +163,16 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx final Set<String> offsetKeys = new HashSet<>(); // write 2 objects to s3 - writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000", offsetKeys); - writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000", offsetKeys); - writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00001", offsetKeys); - writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00001", offsetKeys); - writeToS3(topicName, new byte[0], "00003", offsetKeys); // this should be ignored. 
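
In the hunk above, writeToS3 is refactored to return the offset key instead of filling a caller-supplied set, so each test can simply call offsetKeys.add(writeToS3(...)). A simplified sketch of a helper with that shape follows; in the actual test the returned key is additionally prefixed with the connector's OBJECT_KEY and SEPARATOR constants, and the names below are illustrative only.

```java
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.PutObjectRequest;

public final class S3TestUploadSketch {

    /** Writes the payload to a temp file, uploads it, and returns the key the caller can later match against. */
    public static String uploadAndReturnKey(final AmazonS3 s3, final String bucket, final String topic,
            final String partitionId, final byte[] payload) throws IOException {
        final String objectKey = topic + "-" + partitionId + "-" + System.currentTimeMillis() + ".txt";
        final Path tempFile = File.createTempFile(objectKey, null).toPath();
        try {
            Files.write(tempFile, payload);
            s3.putObject(new PutObjectRequest(bucket, objectKey, tempFile.toFile()));
        } finally {
            Files.delete(tempFile);
        }
        return objectKey;
    }
}
```

Returning the key keeps the helper side-effect free with respect to its arguments, which is the point of the review change in this commit.
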
+ String offsetKey = writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000"); + offsetKeys.add(offsetKey); + offsetKey = writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000"); + offsetKeys.add(offsetKey); + offsetKey = writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00001"); + offsetKeys.add(offsetKey); + offsetKey = writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00001"); + offsetKeys.add(offsetKey); + offsetKey = writeToS3(topicName, new byte[0], "00003"); // this should be ignored. + offsetKeys.add(offsetKey); final List<String> objects = testBucketAccessor.listObjects(); assertThat(objects.size()).isEqualTo(5); @@ -201,7 +206,8 @@ void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) final Set<String> offsetKeys = new HashSet<>(); - writeToS3(topicName, testData.getBytes(StandardCharsets.UTF_8), "00000", offsetKeys); + final String offsetKey = writeToS3(topicName, testData.getBytes(StandardCharsets.UTF_8), "00000"); + offsetKeys.add(offsetKey); // Poll messages from the Kafka topic and verify the consumed data final List<String> records = IntegrationBase.consumeMessages(topicName, 5, connectRunner.getBootstrapServers()); @@ -242,12 +248,16 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc final Set<String> offsetKeys = new HashSet<>(); - writeToS3(topicName, outputStream1.toByteArray(), "00001", offsetKeys); - writeToS3(topicName, outputStream2.toByteArray(), "00001", offsetKeys); - - writeToS3(topicName, outputStream1.toByteArray(), "00002", offsetKeys); - writeToS3(topicName, outputStream2.toByteArray(), "00002", offsetKeys); - writeToS3(topicName, outputStream2.toByteArray(), "00002", offsetKeys); + String offsetKey = writeToS3(topicName, outputStream1.toByteArray(), "00001"); + offsetKeys.add(offsetKey); + offsetKey = writeToS3(topicName, outputStream2.toByteArray(), "00001"); + offsetKeys.add(offsetKey); + offsetKey = writeToS3(topicName, outputStream1.toByteArray(), "00002"); + offsetKeys.add(offsetKey); + offsetKey = writeToS3(topicName, outputStream2.toByteArray(), "00002"); + offsetKeys.add(offsetKey); + offsetKey = writeToS3(topicName, outputStream2.toByteArray(), "00002"); + offsetKeys.add(offsetKey); final List<String> objects = testBucketAccessor.listObjects(); assertThat(objects.size()).isEqualTo(5); @@ -319,8 +329,8 @@ void jsonTest(final TestInfo testInfo) throws IOException { final byte[] jsonBytes = jsonBuilder.toString().getBytes(StandardCharsets.UTF_8); final Set<String> offsetKeys = new HashSet<>(); - writeToS3(topicName, jsonBytes, "00001", offsetKeys); - + final String offsetKey = writeToS3(topicName, jsonBytes, "00001"); + offsetKeys.add(offsetKey); // Poll Json messages from the Kafka topic and deserialize them final List<JsonNode> records = IntegrationBase.consumeJsonMessages(topicName, 500, connectRunner.getBootstrapServers()); @@ -357,8 +367,8 @@ private static ByteArrayOutputStream getAvroRecord(final Schema schema, final in return outputStream; } - private static void writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId, - final Set<String> offsetKeys) throws IOException { + private static String writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) + throws IOException { final String filePrefix = topicName + "-" + partitionId + "-" + System.currentTimeMillis(); final String fileSuffix = ".txt"; @@ -367,7 +377,7 @@ private static void writeToS3(final String topicName, final 
byte[] testDataBytes try { Files.write(testFilePath, testDataBytes); saveToS3(TEST_BUCKET_NAME, "", objectKey, testFilePath.toFile()); - offsetKeys.add(OBJECT_KEY + SEPARATOR + objectKey); + return OBJECT_KEY + SEPARATOR + objectKey; } finally { Files.delete(testFilePath); } @@ -403,11 +413,6 @@ public static void saveToS3(final String bucketName, final String folderName, fi private void verifyOffsetPositions(final Set<String> offsetKeys, final int messagesCount) throws ConnectException { final Map<String, Object> offsetRecs = IntegrationBase.consumeOffsetStorageMessages( "connect-offset-topic-" + CONNECTOR_NAME, messagesCount, connectRunner.getBootstrapServers()); - - assertThat(offsetRecs.size()).isEqualTo(messagesCount); - - for (final String offsetKey : offsetRecs.keySet()) { - assertThat(offsetKeys).contains(offsetKey); - } + assertThat(offsetRecs.keySet()).hasSize(messagesCount).isSubsetOf(offsetKeys); } } From 387ba6ce1f39d9cb0e5e6b967f039511f1fb39fd Mon Sep 17 00:00:00 2001 From: Muralidhar Basani <muralidhar.basani@aiven.io> Date: Tue, 12 Nov 2024 13:39:24 +0100 Subject: [PATCH 63/90] Inline invoke --- .../connect/s3/source/IntegrationBase.java | 4 +-- .../connect/s3/source/IntegrationTest.java | 36 +++++++------------ 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index b81fdddfb..c7200cad0 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -218,8 +218,8 @@ static Map<String, Object> consumeOffsetStorageMessages(final String topic, fina props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); - final Map<String, Object> messages = new HashMap<>(); try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) { + final Map<String, Object> messages = new HashMap<>(); consumer.subscribe(Collections.singletonList(topic)); // Poll messages from the topic @@ -231,10 +231,10 @@ static Map<String, Object> consumeOffsetStorageMessages(final String topic, fina })); } } + return messages; } catch (JsonProcessingException e) { throw new ConnectException("Error while consuming messages " + e.getMessage()); } - return messages; } } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 6382dfa39..8ae632c24 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -163,16 +163,11 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx final Set<String> offsetKeys = new HashSet<>(); // write 2 objects to s3 - String offsetKey = writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000"); - offsetKeys.add(offsetKey); - offsetKey = writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000"); - offsetKeys.add(offsetKey); - offsetKey = writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00001"); - 
offsetKeys.add(offsetKey); - offsetKey = writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00001"); - offsetKeys.add(offsetKey); - offsetKey = writeToS3(topicName, new byte[0], "00003"); // this should be ignored. - offsetKeys.add(offsetKey); + offsetKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000")); + offsetKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000")); + offsetKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00001")); + offsetKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00001")); + offsetKeys.add(writeToS3(topicName, new byte[0], "00003")); final List<String> objects = testBucketAccessor.listObjects(); assertThat(objects.size()).isEqualTo(5); @@ -206,8 +201,7 @@ void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) final Set<String> offsetKeys = new HashSet<>(); - final String offsetKey = writeToS3(topicName, testData.getBytes(StandardCharsets.UTF_8), "00000"); - offsetKeys.add(offsetKey); + offsetKeys.add(writeToS3(topicName, testData.getBytes(StandardCharsets.UTF_8), "00000")); // Poll messages from the Kafka topic and verify the consumed data final List<String> records = IntegrationBase.consumeMessages(topicName, 5, connectRunner.getBootstrapServers()); @@ -248,16 +242,11 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc final Set<String> offsetKeys = new HashSet<>(); - String offsetKey = writeToS3(topicName, outputStream1.toByteArray(), "00001"); - offsetKeys.add(offsetKey); - offsetKey = writeToS3(topicName, outputStream2.toByteArray(), "00001"); - offsetKeys.add(offsetKey); - offsetKey = writeToS3(topicName, outputStream1.toByteArray(), "00002"); - offsetKeys.add(offsetKey); - offsetKey = writeToS3(topicName, outputStream2.toByteArray(), "00002"); - offsetKeys.add(offsetKey); - offsetKey = writeToS3(topicName, outputStream2.toByteArray(), "00002"); - offsetKeys.add(offsetKey); + offsetKeys.add(writeToS3(topicName, outputStream1.toByteArray(), "00001")); + offsetKeys.add(writeToS3(topicName, outputStream2.toByteArray(), "00001")); + offsetKeys.add(writeToS3(topicName, outputStream1.toByteArray(), "00002")); + offsetKeys.add(writeToS3(topicName, outputStream2.toByteArray(), "00002")); + offsetKeys.add(writeToS3(topicName, outputStream2.toByteArray(), "00002")); final List<String> objects = testBucketAccessor.listObjects(); assertThat(objects.size()).isEqualTo(5); @@ -329,8 +318,7 @@ void jsonTest(final TestInfo testInfo) throws IOException { final byte[] jsonBytes = jsonBuilder.toString().getBytes(StandardCharsets.UTF_8); final Set<String> offsetKeys = new HashSet<>(); - final String offsetKey = writeToS3(topicName, jsonBytes, "00001"); - offsetKeys.add(offsetKey); + offsetKeys.add(writeToS3(topicName, jsonBytes, "00001")); // Poll Json messages from the Kafka topic and deserialize them final List<JsonNode> records = IntegrationBase.consumeJsonMessages(topicName, 500, connectRunner.getBootstrapServers()); From ced53a2d055c81bbf502527bb4bc7c750eda809e Mon Sep 17 00:00:00 2001 From: Anatolii Popov <anatolii.popov@aiven.io> Date: Tue, 12 Nov 2024 18:06:22 +0200 Subject: [PATCH 64/90] Integration tests clean up --- .../connect/s3/source/IntegrationBase.java | 46 +++--- .../connect/s3/source/IntegrationTest.java | 152 +++++++----------- 2 files changed, 83 insertions(+), 115 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java 
b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index c7200cad0..9629b07c0 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -16,6 +16,7 @@ package io.aiven.kafka.connect.s3.source; +import static org.assertj.core.api.Assertions.assertThat; import static org.awaitility.Awaitility.await; import java.io.File; @@ -24,6 +25,7 @@ import java.net.ServerSocket; import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.nio.file.Path; import java.time.Duration; import java.util.ArrayList; import java.util.Collections; @@ -50,7 +52,6 @@ import com.amazonaws.client.builder.AwsClientBuilder; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3ClientBuilder; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; @@ -72,21 +73,18 @@ default AdminClient newAdminClient(final String bootstrapServers) { return AdminClient.create(adminClientConfig); } - static void extractConnectorPlugin(File pluginDir) throws IOException, InterruptedException { + static void extractConnectorPlugin(Path pluginDir) throws IOException, InterruptedException { final File distFile = new File(System.getProperty("integration-test.distribution.file.path")); - assert distFile.exists(); + assertThat(distFile).exists(); final String cmd = String.format("tar -xf %s --strip-components=1 -C %s", distFile, pluginDir.toString()); final Process process = Runtime.getRuntime().exec(cmd); assert process.waitFor() == 0; } - static File getPluginDir() throws IOException { - final File testDir = Files.createTempDirectory(S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA_TEST).toFile(); - - final File pluginDir = new File(testDir, PLUGINS_S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA); - assert pluginDir.mkdirs(); - return pluginDir; + static Path getPluginDir() throws IOException { + final Path testDir = Files.createTempDirectory(S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA_TEST); + return Files.createDirectories(testDir.resolve(PLUGINS_S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA)); } static String topicName(final TestInfo testInfo) { @@ -156,11 +154,10 @@ static List<GenericRecord> consumeAvroMessages(final String topic, final int exp final Properties props = new Properties(); props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group-avro"); - props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); // Assuming string - // key - props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, KafkaAvroDeserializer.class.getName()); // Avro - // deserializer - // for values + // Assuming string key + props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); + // Avro deserializer for values + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, KafkaAvroDeserializer.class.getName()); props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); props.put("schema.registry.url", schemaRegistryUrl); // URL of the schema registry props.put("specific.avro.reader", "false"); // Use GenericRecord instead of specific Avro classes @@ -186,11 +183,10 @@ static List<JsonNode> consumeJsonMessages(final String topic, 
final int expected final Properties props = new Properties(); props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group-json"); - props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); // Assuming string - // key - props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, JsonDeserializer.class.getName()); // Json - // deserializer - // for values + // Assuming string key + props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); + // Json deserializer for values + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, JsonDeserializer.class.getName()); props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); try (KafkaConsumer<String, JsonNode> consumer = new KafkaConsumer<>(props)) { @@ -219,21 +215,21 @@ static Map<String, Object> consumeOffsetStorageMessages(final String topic, fina props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) { - final Map<String, Object> messages = new HashMap<>(); consumer.subscribe(Collections.singletonList(topic)); // Poll messages from the topic + final Map<String, Object> messages = new HashMap<>(); while (messages.size() < expectedMessageCount) { - final ConsumerRecords<byte[], byte[]> records = consumer.poll(5L); + final ConsumerRecords<byte[], byte[]> records = consumer.poll(Duration.ofMillis(5L)); for (final ConsumerRecord<byte[], byte[]> record : records) { - messages.putAll(OBJECT_MAPPER.readValue(new String(record.value(), StandardCharsets.UTF_8), // NOPMD - new TypeReference<>() { // NOPMD - })); + Map<String, Object> offsetRec = OBJECT_MAPPER.readValue(record.value(), new TypeReference<>() { // NOPMD + }); + messages.putAll(offsetRec); } } return messages; - } catch (JsonProcessingException e) { + } catch (IOException e) { throw new ConnectException("Error while consuming messages " + e.getMessage()); } } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 8ae632c24..a077f0d0a 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -31,10 +31,11 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_CONVERTER_SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; +import static java.util.Map.entry; import static org.assertj.core.api.Assertions.assertThat; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; -import java.io.File; import java.io.IOException; import java.net.ConnectException; import java.nio.charset.StandardCharsets; @@ -47,7 +48,8 @@ import java.util.List; import java.util.Map; import java.util.Set; -import java.util.concurrent.ExecutionException; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import org.apache.kafka.clients.admin.AdminClient; @@ -56,6 +58,7 @@ import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.ObjectMetadata; import 
com.amazonaws.services.s3.model.PutObjectRequest; import com.fasterxml.jackson.databind.JsonNode; import org.apache.avro.Schema; @@ -87,14 +90,14 @@ final class IntegrationTest implements IntegrationBase { private static final String S3_ACCESS_KEY_ID = "test-key-id0"; private static final String S3_SECRET_ACCESS_KEY = "test_secret_key0"; + private static final String VALUE_CONVERTER_KEY = "value.converter"; + private static final String TEST_BUCKET_NAME = "test-bucket0"; private static String s3Endpoint; private static String s3Prefix; private static BucketAccessor testBucketAccessor; - private static File pluginDir; - @Container public static final LocalStackContainer LOCALSTACK = IntegrationBase.createS3Container(); private SchemaRegistryContainer schemaRegistry; @@ -104,8 +107,6 @@ final class IntegrationTest implements IntegrationBase { private static AmazonS3 s3Client; - private String topicName; - @BeforeAll static void setUpAll() throws IOException, InterruptedException { s3Prefix = COMMON_PREFIX + ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + "/"; @@ -114,7 +115,7 @@ static void setUpAll() throws IOException, InterruptedException { s3Endpoint = LOCALSTACK.getEndpoint().toString(); testBucketAccessor = new BucketAccessor(s3Client, TEST_BUCKET_NAME); - pluginDir = IntegrationBase.getPluginDir(); + final Path pluginDir = IntegrationBase.getPluginDir(); IntegrationBase.extractConnectorPlugin(pluginDir); } @@ -150,9 +151,9 @@ void tearDown() { } @Test - void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { + void bytesTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); @@ -169,28 +170,23 @@ void bytesTest(final TestInfo testInfo) throws ExecutionException, InterruptedEx offsetKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00001")); offsetKeys.add(writeToS3(topicName, new byte[0], "00003")); - final List<String> objects = testBucketAccessor.listObjects(); - assertThat(objects.size()).isEqualTo(5); - - // Verify that the connector is correctly set up - assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); + assertThat(testBucketAccessor.listObjects()).hasSize(5); // Poll messages from the Kafka topic and verify the consumed data final List<String> records = IntegrationBase.consumeMessages(topicName, 4, connectRunner.getBootstrapServers()); // Verify that the correct data is read from the S3 bucket and pushed to Kafka - assertThat(records).contains(testData1).contains(testData2); + assertThat(records).containsOnly(testData1, testData2); // Verify offset positions verifyOffsetPositions(offsetKeys, 4); } @Test - void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) - throws ExecutionException, InterruptedException, IOException { + void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) throws IOException { final String testData = "AABBCCDDEE"; final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName); 
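
The clean-up commit above also tightens the assertions: a size check followed by per-element or per-index checks becomes a single fluent AssertJ chain (hasSize, containsOnly, containsExactly). A tiny sketch of the difference, using placeholder data rather than anything from this patch:

```java
import static org.assertj.core.api.Assertions.assertThat;

import java.util.List;

public final class AssertjCollectionSketch {
    public static void main(final String[] args) {
        // Stand-in for records consumed from Kafka in the integration tests.
        final List<String> records = List.of("AA", "BB", "CC", "DD", "EE");

        // One fluent chain replaces a size assertion plus five per-index equality assertions.
        assertThat(records).hasSize(5).containsExactly("AA", "BB", "CC", "DD", "EE");

        // containsOnly checks the distinct values regardless of order and duplicates,
        // which suits tests where several S3 objects carry the same payload.
        assertThat(records).containsOnly("AA", "BB", "CC", "DD", "EE");
    }
}
```
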
connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); connectorConfig.put(EXPECTED_MAX_MESSAGE_BYTES, "2"); // For above test data of 10 bytes length, with 2 bytes // each @@ -207,26 +203,16 @@ void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) final List<String> records = IntegrationBase.consumeMessages(topicName, 5, connectRunner.getBootstrapServers()); // Verify that the correct data is read from the S3 bucket and pushed to Kafka - assertThat(records.size()).isEqualTo(5); - assertThat(records.get(0)).isEqualTo("AA"); - assertThat(records.get(1)).isEqualTo("BB"); - assertThat(records.get(2)).isEqualTo("CC"); - assertThat(records.get(3)).isEqualTo("DD"); - assertThat(records.get(4)).isEqualTo("EE"); + assertThat(records).containsExactly("AA", "BB", "CC", "DD", "EE"); // Verify offset positions verifyOffsetPositions(offsetKeys, 1); } @Test - void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedException, IOException { + void avroTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.AVRO.getValue()); - connectorConfig.put(SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); - connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); - connectorConfig.put(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); - connectorConfig.put(VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); + final Map<String, String> connectorConfig = getAvroConfig(topicName, InputFormat.AVRO); connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); @@ -237,22 +223,19 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc final Schema.Parser parser = new Schema.Parser(); final Schema schema = parser.parse(schemaJson); - final ByteArrayOutputStream outputStream1 = getAvroRecord(schema, 1, 100); - final ByteArrayOutputStream outputStream2 = getAvroRecord(schema, 2, 100); + final byte[] outputStream1 = getAvroRecord(schema, 1, 100); + final byte[] outputStream2 = getAvroRecord(schema, 2, 100); final Set<String> offsetKeys = new HashSet<>(); - offsetKeys.add(writeToS3(topicName, outputStream1.toByteArray(), "00001")); - offsetKeys.add(writeToS3(topicName, outputStream2.toByteArray(), "00001")); - offsetKeys.add(writeToS3(topicName, outputStream1.toByteArray(), "00002")); - offsetKeys.add(writeToS3(topicName, outputStream2.toByteArray(), "00002")); - offsetKeys.add(writeToS3(topicName, outputStream2.toByteArray(), "00002")); + offsetKeys.add(writeToS3(topicName, outputStream1, "00001")); + offsetKeys.add(writeToS3(topicName, outputStream2, "00001")); - final List<String> objects = testBucketAccessor.listObjects(); - assertThat(objects.size()).isEqualTo(5); + offsetKeys.add(writeToS3(topicName, outputStream1, "00002")); + offsetKeys.add(writeToS3(topicName, outputStream2, "00002")); + offsetKeys.add(writeToS3(topicName, outputStream2, "00002")); - // Verify that the connector is correctly set up - assertThat(connectorConfig.get("name")).isEqualTo(CONNECTOR_NAME); + assertThat(testBucketAccessor.listObjects()).hasSize(5); // Poll Avro messages from the Kafka topic and deserialize them final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 500, @@ -260,10 +243,10 @@ void avroTest(final TestInfo testInfo) throws 
ExecutionException, InterruptedExc // deserializes Avro // Verify that the correct data is read from the S3 bucket and pushed to Kafka - assertThat(records).extracting(record -> record.get("message").toString()) - .contains("Hello, Kafka Connect S3 Source! object 1") - .contains("Hello, Kafka Connect S3 Source! object 2"); - assertThat(records).extracting(record -> record.get("id").toString()).contains("1").contains("2"); + assertThat(records).hasSize(500) + .map(record -> entry(record.get("id"), String.valueOf(record.get("message")))) + .contains(entry(1, "Hello, Kafka Connect S3 Source! object 1"), + entry(2, "Hello, Kafka Connect S3 Source! object 2")); // Verify offset positions verifyOffsetPositions(offsetKeys, 5); @@ -272,17 +255,12 @@ void avroTest(final TestInfo testInfo) throws ExecutionException, InterruptedExc @Test void parquetTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); - connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.PARQUET.getValue()); - connectorConfig.put(SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); - connectorConfig.put("value.converter", "io.confluent.connect.avro.AvroConverter"); - connectorConfig.put("value.converter.schema.registry.url", schemaRegistry.getSchemaRegistryUrl()); - connectorConfig.put(VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); final String partition = "00000"; final String fileName = topicName + "-" + partition + "-" + System.currentTimeMillis() + ".txt"; final String name = "testuser"; + final Map<String, String> connectorConfig = getAvroConfig(topicName, InputFormat.PARQUET); connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); final Path path = ContentUtils.getTmpFilePath(name); @@ -296,17 +274,29 @@ void parquetTest(final TestInfo testInfo) throws IOException { final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 100, connectRunner.getBootstrapServers(), schemaRegistry.getSchemaRegistryUrl()); + final List<String> expectedRecordNames = IntStream.range(0, 100) + .mapToObj(i -> name + i) + .collect(Collectors.toList()); assertThat(records).extracting(record -> record.get("name").toString()) - .contains(name + "1") - .contains(name + "2"); + .containsExactlyInAnyOrderElementsOf(expectedRecordNames); + } + + private Map<String, String> getAvroConfig(final String topicName, final InputFormat parquet) { + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName); + connectorConfig.put(INPUT_FORMAT_KEY, parquet.getValue()); + connectorConfig.put(SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); + connectorConfig.put(VALUE_CONVERTER_KEY, "io.confluent.connect.avro.AvroConverter"); + connectorConfig.put(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); + connectorConfig.put(VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); + return connectorConfig; } @Test void jsonTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(basicConnectorConfig(CONNECTOR_NAME), topicName); + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.JSONL.getValue()); - connectorConfig.put("value.converter", "org.apache.kafka.connect.json.JsonConverter"); + 
connectorConfig.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.json.JsonConverter"); connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); final String testMessage = "This is a test "; @@ -323,6 +313,7 @@ void jsonTest(final TestInfo testInfo) throws IOException { final List<JsonNode> records = IntegrationBase.consumeJsonMessages(topicName, 500, connectRunner.getBootstrapServers()); + assertThat(records).hasSize(500); assertThat(records).extracting(record -> record.get("payload").get("message").asText()).contains(testMessage); assertThat(records).extracting(record -> record.get("payload").get("id").asText()).contains("1"); @@ -330,48 +321,34 @@ void jsonTest(final TestInfo testInfo) throws IOException { verifyOffsetPositions(offsetKeys, 1); } - @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") - private static ByteArrayOutputStream getAvroRecord(final Schema schema, final int messageId, final int noOfAvroRecs) + private static byte[] getAvroRecord(final Schema schema, final int messageId, final int noOfAvroRecs) throws IOException { - // Create Avro records - GenericRecord avroRecord; - - // Serialize Avro records to byte arrays - final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema); - try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) { + try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) { dataFileWriter.create(schema, outputStream); for (int i = 0; i < noOfAvroRecs; i++) { - avroRecord = new GenericData.Record(schema); + final GenericRecord avroRecord = new GenericData.Record(schema); // NOPMD avroRecord.put("message", "Hello, Kafka Connect S3 Source! 
object " + i); avroRecord.put("id", messageId); - dataFileWriter.append(avroRecord); } dataFileWriter.flush(); + return outputStream.toByteArray(); } - outputStream.close(); - return outputStream; } - private static String writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) - throws IOException { - final String filePrefix = topicName + "-" + partitionId + "-" + System.currentTimeMillis(); - final String fileSuffix = ".txt"; - - final Path testFilePath = File.createTempFile(filePrefix, fileSuffix).toPath(); - final String objectKey = filePrefix + fileSuffix; - try { - Files.write(testFilePath, testDataBytes); - saveToS3(TEST_BUCKET_NAME, "", objectKey, testFilePath.toFile()); - return OBJECT_KEY + SEPARATOR + objectKey; - } finally { - Files.delete(testFilePath); - } + private static String writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) { + final String objectKey = topicName + "-" + partitionId + "-" + System.currentTimeMillis() + ".txt"; + final PutObjectRequest request = new PutObjectRequest(TEST_BUCKET_NAME, objectKey, + new ByteArrayInputStream(testDataBytes), new ObjectMetadata()); + s3Client.putObject(request); + return OBJECT_KEY + SEPARATOR + objectKey; } - private Map<String, String> getConfig(final Map<String, String> config, final String topics) { + private Map<String, String> getConfig(final String connectorName, final String topics) { + final Map<String, String> config = new HashMap<>(basicConnectorConfig(connectorName)); config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); config.put(AWS_ACCESS_KEY_ID_CONFIG, S3_ACCESS_KEY_ID); config.put(AWS_SECRET_ACCESS_KEY_CONFIG, S3_SECRET_ACCESS_KEY); @@ -387,20 +364,15 @@ private Map<String, String> basicConnectorConfig(final String connectorName) { final Map<String, String> config = new HashMap<>(); config.put("name", connectorName); config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); - config.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + config.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.converters.ByteArrayConverter"); config.put("tasks.max", "1"); return config; } - public static void saveToS3(final String bucketName, final String folderName, final String fileNameInS3, - final File fileToWrite) { - final PutObjectRequest request = new PutObjectRequest(bucketName, folderName + fileNameInS3, fileToWrite); - s3Client.putObject(request); - } - private void verifyOffsetPositions(final Set<String> offsetKeys, final int messagesCount) throws ConnectException { final Map<String, Object> offsetRecs = IntegrationBase.consumeOffsetStorageMessages( "connect-offset-topic-" + CONNECTOR_NAME, messagesCount, connectRunner.getBootstrapServers()); + assertThat(offsetRecs.keySet()).hasSize(messagesCount).isSubsetOf(offsetKeys); } } From eba30cff9ece912d93c8b2736e634bd99b921e89 Mon Sep 17 00:00:00 2001 From: Ryan Skraba <ryan.skraba@aiven.io> Date: Fri, 8 Nov 2024 15:21:53 +0100 Subject: [PATCH 65/90] feat: Use karapace schema registry for testing --- .../s3/source/SchemaRegistryContainer.java | 35 ++++++++++++++----- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java index 0755cb8d1..f18a37719 100644 --- 
a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java @@ -16,31 +16,50 @@ package io.aiven.kafka.connect.s3.source; -import java.util.List; +import java.time.Duration; import com.github.dockerjava.api.model.Ulimit; import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.wait.strategy.Wait; public final class SchemaRegistryContainer extends GenericContainer<SchemaRegistryContainer> { public static final int SCHEMA_REGISTRY_PORT = 8081; public SchemaRegistryContainer(final String bootstrapServer) { - this("5.0.4", bootstrapServer); + this("3.7.1", bootstrapServer); } - public SchemaRegistryContainer(final String confluentPlatformVersion, final String bootstrapServer) { - super("confluentinc/cp-schema-registry:" + confluentPlatformVersion); + public SchemaRegistryContainer(final String karapaceVersion, final String bootstrapServer) { + super("ghcr.io/aiven/karapace:" + karapaceVersion); withAccessToHost(true); - withEnv("SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS", "PLAINTEXT://" + bootstrapServer); - withEnv("SCHEMA_REGISTRY_LOG4J_LOGLEVEL=DEBUG", "DEBUG"); + withEnv("KARAPACE_ADVERTISED_HOSTNAME", "karapace-registry"); + withEnv("KARAPACE_BOOTSTRAP_URI", bootstrapServer); + withEnv("KARAPACE_PORT", String.valueOf(SCHEMA_REGISTRY_PORT)); + withEnv("KARAPACE_HOST", "0.0.0.0"); + withEnv("KARAPACE_CLIENT_ID", "karapace"); + withEnv("KARAPACE_GROUP_ID", "karapace-registry"); + withEnv("KARAPACE_MASTER_ELIGIBILITY", "true"); + withEnv("KARAPACE_TOPIC_NAME", "_schemas"); + withEnv("KARAPACE_LOG_LEVEL", "WARNING");// This can be set to DEBUG for more verbose logging + withEnv("KARAPACE_COMPATIBILITY", "FULL"); + withEnv("KARAPACE_KAFKA_SCHEMA_READER_STRICT_MODE", "false"); + withEnv("KARAPACE_KAFKA_RETRIABLE_ERRORS_SILENCED", "true"); withExposedPorts(SCHEMA_REGISTRY_PORT); - withEnv("SCHEMA_REGISTRY_HOST_NAME", "localhost"); + withCommand("/bin/bash", "/opt/karapace/start.sh", "registry"); + + // When started, check any API to see if the service is ready, which also indicates that it is connected to the + // Kafka bootstrap server. 
+ waitingFor(Wait.forHttp("/_health") + .forPort(8081) + .withReadTimeout(Duration.ofMinutes(1)) + .forResponsePredicate(response -> response.contains("\"schema_registry_ready\":true"))); withCreateContainerCmdModifier( - cmd -> cmd.getHostConfig().withUlimits(List.of(new Ulimit("nofile", 30_000L, 30_000L)))); + cmd -> cmd.getHostConfig().withUlimits(new Ulimit[] { new Ulimit("nofile", 30_000L, 30_000L) })); } public String getSchemaRegistryUrl() { return String.format("http://%s:%s", getHost(), getMappedPort(SCHEMA_REGISTRY_PORT)); + } } From 4940cbce9468d332b21fff73424f144b152f7229 Mon Sep 17 00:00:00 2001 From: Ryan Skraba <ryan.skraba@aiven.io> Date: Thu, 14 Nov 2024 13:58:46 +0100 Subject: [PATCH 66/90] Use recent version of Karapace docker --- .../kafka/connect/s3/source/SchemaRegistryContainer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java index f18a37719..5e2e1201b 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java @@ -26,11 +26,11 @@ public final class SchemaRegistryContainer extends GenericContainer<SchemaRegist public static final int SCHEMA_REGISTRY_PORT = 8081; public SchemaRegistryContainer(final String bootstrapServer) { - this("3.7.1", bootstrapServer); + this("4.1.0", bootstrapServer); } public SchemaRegistryContainer(final String karapaceVersion, final String bootstrapServer) { - super("ghcr.io/aiven/karapace:" + karapaceVersion); + super("ghcr.io/aiven-open/karapace:" + karapaceVersion); withAccessToHost(true); withEnv("KARAPACE_ADVERTISED_HOSTNAME", "karapace-registry"); withEnv("KARAPACE_BOOTSTRAP_URI", bootstrapServer); From 49ec94ed01ac9544d5080dc56a0379a873a17a36 Mon Sep 17 00:00:00 2001 From: Anatolii Popov <anatolii.popov@aiven.io> Date: Wed, 13 Nov 2024 17:48:56 +0200 Subject: [PATCH 67/90] Flaky integration tests fix --- .github/workflows/main_push_workflow.yml | 2 +- .../kafka/connect/s3/source/IntegrationBase.java | 14 +++++++++++--- .../kafka/connect/s3/source/IntegrationTest.java | 5 +++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main_push_workflow.yml b/.github/workflows/main_push_workflow.yml index 8244ec088..393534842 100644 --- a/.github/workflows/main_push_workflow.yml +++ b/.github/workflows/main_push_workflow.yml @@ -34,4 +34,4 @@ jobs: run: ./gradlew build test - name: Build in Linux if: runner.os == 'Linux' - run: ./gradlew build check test integrationTest + run: ./gradlew build check test integrationTest -i diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index 9629b07c0..523bae1f2 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -28,6 +28,7 @@ import java.nio.file.Path; import java.time.Duration; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -116,9 +117,16 @@ static 
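The wait strategy above polls Karapace's `/_health` endpoint until the response reports the registry as ready. Assuming that endpoint behaves as the predicate expects, the same check can be reproduced manually with the JDK HTTP client; the class below is an illustrative probe, not part of the patch:

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

// Hypothetical manual readiness probe mirroring the container wait strategy.
public final class KarapaceReadinessProbe {
    private KarapaceReadinessProbe() {
    }

    public static boolean isReady(final String schemaRegistryUrl) throws Exception {
        final HttpClient client = HttpClient.newHttpClient();
        final HttpRequest request = HttpRequest.newBuilder(URI.create(schemaRegistryUrl + "/_health"))
                .GET()
                .build();
        final HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        return response.statusCode() == 200 && response.body().contains("\"schema_registry_ready\":true");
    }
}
```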
LocalStackContainer createS3Container() { .withServices(LocalStackContainer.Service.S3); } - static int getRandomPort() throws IOException { - try (ServerSocket socket = new ServerSocket(0)) { - return socket.getLocalPort(); + /** + * Finds 2 simultaneously free port for Kafka listeners + * + * @return list of 2 ports + * @throws IOException + * when port allocation failure happens + */ + static List<Integer> getKafkaListenerPorts() throws IOException { + try (ServerSocket socket = new ServerSocket(0); ServerSocket socket2 = new ServerSocket(0)) { + return Arrays.asList(socket.getLocalPort(), socket2.getLocalPort()); } catch (IOException e) { throw new IOException("Failed to allocate port for test", e); } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index a077f0d0a..80696e858 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -124,8 +124,9 @@ void setUp(final TestInfo testInfo) throws Exception { testBucketAccessor.createBucket(); connectRunner = new ConnectRunner(OFFSET_FLUSH_INTERVAL_MS); - final int localListenerPort = IntegrationBase.getRandomPort(); - final int containerListenerPort = IntegrationBase.getRandomPort(); + final List<Integer> ports = IntegrationBase.getKafkaListenerPorts(); + final int localListenerPort = ports.get(0); + final int containerListenerPort = ports.get(1); connectRunner.startConnectCluster(CONNECTOR_NAME, localListenerPort, containerListenerPort); adminClient = newAdminClient(connectRunner.getBootstrapServers()); From 8801bbaca8778a8198e04850f7da4766e97a9928 Mon Sep 17 00:00:00 2001 From: Murali Basani <muralidhar.basani@aiven.io> Date: Thu, 21 Nov 2024 11:36:56 +0100 Subject: [PATCH 68/90] Multi tasks distribution of s3 objs (#327) Partially fixes https://aiven.atlassian.net/browse/KCON-2 Currently when max.tasks is set to above 1, then each of those tasks are processing all objects in the bucket, which should not be the case. 
This pr does the below (bug fix for distributed mode) * based on the hash of the object key, assigns the objects to tasks * Updated integration tests with max tasks > 1 --- .../connect/s3/source/IntegrationTest.java | 45 ++++--- .../AivenKafkaConnectS3SourceConnector.java | 4 +- .../connect/s3/source/utils/FileReader.java | 8 ++ .../s3/source/utils/FileReaderTest.java | 122 ++++++++++-------- 4 files changed, 105 insertions(+), 74 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 80696e858..bab6d1587 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -41,6 +41,7 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.time.Duration; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.util.HashMap; @@ -67,6 +68,7 @@ import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumWriter; +import org.awaitility.Awaitility; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; @@ -154,7 +156,7 @@ void tearDown() { @Test void bytesTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName); + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 2); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); @@ -184,10 +186,10 @@ void bytesTest(final TestInfo testInfo) throws IOException { } @Test - void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) throws IOException { + void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) throws IOException, InterruptedException { final String testData = "AABBCCDDEE"; final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName); + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 3); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); connectorConfig.put(EXPECTED_MAX_MESSAGE_BYTES, "2"); // For above test data of 10 bytes length, with 2 bytes // each @@ -195,10 +197,7 @@ void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) throws IOException connectorConfig.put(MAX_POLL_RECORDS, "2"); // In 3 polls all the 5 records should be processed connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); - - final Set<String> offsetKeys = new HashSet<>(); - - offsetKeys.add(writeToS3(topicName, testData.getBytes(StandardCharsets.UTF_8), "00000")); + final String offsetKey = writeToS3(topicName, testData.getBytes(StandardCharsets.UTF_8), "00000"); // Poll messages from the Kafka topic and verify the consumed data final List<String> records = IntegrationBase.consumeMessages(topicName, 5, connectRunner.getBootstrapServers()); @@ -206,12 +205,15 @@ void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) throws IOException // Verify that the correct data is read from the S3 bucket and pushed to Kafka 
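The assignment rule described in the commit message above is implemented further down in `FileReader#assignObjectToTask`; condensed into a standalone sketch, it is a modulo over the object key's hash. `Math.floorMod` keeps the result non-negative even for negative hash codes, so every key maps to exactly one task in `[0, maxTasks)`.

```java
// Condensed form of the hash-based object-to-task assignment added in this patch.
public final class TaskAssignment {
    private TaskAssignment() {
    }

    public static boolean isAssignedTo(final String objectKey, final int taskId, final int maxTasks) {
        return Math.floorMod(objectKey.hashCode(), maxTasks) == taskId;
    }
}
```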
assertThat(records).containsExactly("AA", "BB", "CC", "DD", "EE"); - // Verify offset positions - verifyOffsetPositions(offsetKeys, 1); + Awaitility.await().atMost(Duration.ofMinutes(2)).untilAsserted(() -> { + final Map<String, Object> offsetRecs = IntegrationBase.consumeOffsetStorageMessages( + "connect-offset-topic-" + CONNECTOR_NAME, 1, connectRunner.getBootstrapServers()); + assertThat(offsetRecs).containsExactly(entry(offsetKey, 5)); + }); } @Test - void avroTest(final TestInfo testInfo) throws IOException { + void avroTest(final TestInfo testInfo) throws IOException, InterruptedException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getAvroConfig(topicName, InputFormat.AVRO); @@ -249,8 +251,11 @@ void avroTest(final TestInfo testInfo) throws IOException { .contains(entry(1, "Hello, Kafka Connect S3 Source! object 1"), entry(2, "Hello, Kafka Connect S3 Source! object 2")); - // Verify offset positions - verifyOffsetPositions(offsetKeys, 5); + Thread.sleep(10_000); + + final Map<String, Object> offsetRecs = IntegrationBase.consumeOffsetStorageMessages( + "connect-offset-topic-" + CONNECTOR_NAME, 5, connectRunner.getBootstrapServers()); + assertThat(offsetRecs).containsOnlyKeys(offsetKeys).values().containsOnly(100); } @Test @@ -283,7 +288,7 @@ void parquetTest(final TestInfo testInfo) throws IOException { } private Map<String, String> getAvroConfig(final String topicName, final InputFormat parquet) { - final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName); + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 4); connectorConfig.put(INPUT_FORMAT_KEY, parquet.getValue()); connectorConfig.put(SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); connectorConfig.put(VALUE_CONVERTER_KEY, "io.confluent.connect.avro.AvroConverter"); @@ -295,7 +300,7 @@ private Map<String, String> getAvroConfig(final String topicName, final InputFor @Test void jsonTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName); + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.JSONL.getValue()); connectorConfig.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.json.JsonConverter"); @@ -348,8 +353,8 @@ private static String writeToS3(final String topicName, final byte[] testDataByt return OBJECT_KEY + SEPARATOR + objectKey; } - private Map<String, String> getConfig(final String connectorName, final String topics) { - final Map<String, String> config = new HashMap<>(basicConnectorConfig(connectorName)); + private Map<String, String> getConfig(final String connectorName, final String topics, final int maxTasks) { + final Map<String, String> config = new HashMap<>(basicConnectorConfig(connectorName, maxTasks)); config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); config.put(AWS_ACCESS_KEY_ID_CONFIG, S3_ACCESS_KEY_ID); config.put(AWS_SECRET_ACCESS_KEY_CONFIG, S3_SECRET_ACCESS_KEY); @@ -361,19 +366,21 @@ private Map<String, String> getConfig(final String connectorName, final String t return config; } - private Map<String, String> basicConnectorConfig(final String connectorName) { + private Map<String, String> basicConnectorConfig(final String connectorName, final int maxTasks) { final Map<String, String> config = new HashMap<>(); config.put("name", 
connectorName); config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); config.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.converters.ByteArrayConverter"); - config.put("tasks.max", "1"); + config.put("tasks.max", String.valueOf(maxTasks)); return config; } - private void verifyOffsetPositions(final Set<String> offsetKeys, final int messagesCount) throws ConnectException { + private Map<String, Object> verifyOffsetPositions(final Set<String> offsetKeys, final int messagesCount) + throws ConnectException { final Map<String, Object> offsetRecs = IntegrationBase.consumeOffsetStorageMessages( "connect-offset-topic-" + CONNECTOR_NAME, messagesCount, connectRunner.getBootstrapServers()); assertThat(offsetRecs.keySet()).hasSize(messagesCount).isSubsetOf(offsetKeys); + return offsetRecs; } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java index 65b25235e..ca0d10a14 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java @@ -17,6 +17,7 @@ package io.aiven.kafka.connect.s3.source; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; @@ -60,7 +61,8 @@ public Class<? extends Task> taskClass() { public List<Map<String, String>> taskConfigs(final int maxTasks) { final var taskProps = new ArrayList<Map<String, String>>(); for (int i = 0; i < maxTasks; i++) { - final var props = Map.copyOf(configProperties); + final var props = new HashMap<>(configProperties); // NOPMD + props.put("task.id", String.valueOf(i)); taskProps.add(props); } return taskProps; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index 561f7df21..afdf89fd6 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -62,6 +62,7 @@ Iterator<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) { .flatMap(response -> response.getObjectSummaries() .stream() .filter(objectSummary -> objectSummary.getSize() > 0) + .filter(objectSummary -> assignObjectToTask(objectSummary.getKey())) .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey()))); return s3ObjectStream.iterator(); } @@ -69,4 +70,11 @@ Iterator<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) { public void addFailedObjectKeys(final String objectKey) { this.failedObjectKeys.add(objectKey); } + + private boolean assignObjectToTask(final String objectKey) { + final int maxTasks = Integer.parseInt(s3SourceConfig.originals().get("tasks.max").toString()); + final int taskId = Integer.parseInt(s3SourceConfig.originals().get("task.id").toString()) % maxTasks; + final int taskAssignment = Math.floorMod(objectKey.hashCode(), maxTasks); + return taskAssignment == taskId; + } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java index 190cb33c9..cc0887b67 100644 --- 
a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java @@ -16,11 +16,7 @@ package io.aiven.kafka.connect.s3.source.utils; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -32,82 +28,90 @@ import java.util.List; import java.util.Map; -import io.aiven.kafka.connect.s3.source.AivenKafkaConnectS3SourceConnector; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.input.InputFormat; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.ListObjectsV2Request; import com.amazonaws.services.s3.model.ListObjectsV2Result; import com.amazonaws.services.s3.model.S3ObjectSummary; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.mockito.Mock; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; class FileReaderTest { private static final String TEST_BUCKET = "test-bucket"; - @Mock private AmazonS3 s3Client; - @Mock - private OffsetManager offsetManager; - private FileReader fileReader; - private Map<String, String> properties; - - @BeforeEach - public void setUp() { - properties = new HashMap<>(); - setBasicProperties(); - final S3SourceConfig s3SourceConfig = new S3SourceConfig(properties); - offsetManager = mock(OffsetManager.class); - fileReader = new FileReader(s3SourceConfig, TEST_BUCKET, Collections.emptySet()); - s3Client = mock(AmazonS3.class); + private static Map<String, String> getConfigMap(final int maxTasks, final int taskId) { + final Map<String, String> configMap = new HashMap<>(); + configMap.put("tasks.max", String.valueOf(maxTasks)); + configMap.put("task.id", String.valueOf(taskId)); + return configMap; } - @Test - void testFetchObjectSummariesWithNoObjects() throws IOException { + @ParameterizedTest + @CsvSource({ "3, 1" }) + void testFetchObjectSummariesWithNoObjects(final int maxTasks, final int taskId) { + initializeWithTaskConfigs(maxTasks, taskId); final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result(Collections.emptyList(), null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); - when(offsetManager.getOffsets()).thenReturn(new HashMap<>()); final Iterator<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); - assertFalse(summaries.hasNext()); + assertThat(summaries).isExhausted(); } - @Test - void testFetchObjectSummariesWithOneNonZeroByteObject() throws IOException { - final S3ObjectSummary objectSummary = createObjectSummary(1, "key1"); - final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result( - Collections.singletonList(objectSummary), null); - when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); - when(offsetManager.getOffsets()).thenReturn(new HashMap<>()); + @ParameterizedTest + @CsvSource({ "1, 0" }) + void testFetchObjectSummariesWithOneObjectWithBasicConfig(final int maxTasks, final int 
taskId) { + final String objectKey = "any-key"; - final Iterator<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); + initializeWithTaskConfigs(maxTasks, taskId); + final Iterator<S3ObjectSummary> summaries = getS3ObjectSummaryIterator(objectKey); + assertThat(summaries).hasNext(); + assertThat(summaries.next().getSize()).isEqualTo(1); + } - assertTrue(summaries.hasNext()); + @ParameterizedTest + @CsvSource({ "4, 2, key1", "4, 3, key2", "4, 0, key3", "4, 1, key4" }) + void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdAssigned(final int maxTasks, final int taskId, + final String objectKey) { + initializeWithTaskConfigs(maxTasks, taskId); + final Iterator<S3ObjectSummary> summaries = getS3ObjectSummaryIterator(objectKey); + assertThat(summaries).hasNext(); assertThat(summaries.next().getSize()).isEqualTo(1); } - @Test - void testFetchObjectSummariesWithZeroByteObject() throws IOException { - final S3ObjectSummary zeroByteObject = createObjectSummary(0, "key1"); - final S3ObjectSummary nonZeroByteObject = createObjectSummary(1, "key2"); - final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result( - List.of(zeroByteObject, nonZeroByteObject), null); + @ParameterizedTest + @CsvSource({ "4, 1, key1", "4, 3, key1", "4, 0, key1", "4, 1, key2", "4, 2, key2", "4, 0, key2", "4, 1, key3", + "4, 2, key3", "4, 3, key3", "4, 0, key4", "4, 2, key4", "4, 3, key4" }) + void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdUnassigned(final int maxTasks, final int taskId, + final String objectKey) { + initializeWithTaskConfigs(maxTasks, taskId); + final Iterator<S3ObjectSummary> summaries = getS3ObjectSummaryIterator(objectKey); + assertThat(summaries).isExhausted(); + } + + @ParameterizedTest + @CsvSource({ "4, 3", "4, 0" }) + void testFetchObjectSummariesWithZeroByteObject(final int maxTasks, final int taskId) { + initializeWithTaskConfigs(maxTasks, taskId); + final ListObjectsV2Result listObjectsV2Result = getListObjectsV2Result(); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); - when(offsetManager.getOffsets()).thenReturn(new HashMap<>()); final Iterator<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); - assertTrue(summaries.hasNext()); + // assigned 1 object to taskid + assertThat(summaries).hasNext(); assertThat(summaries.next().getSize()).isEqualTo(1); + assertThat(summaries).isExhausted(); } @Test void testFetchObjectSummariesWithPagination() throws IOException { + initializeWithTaskConfigs(4, 3); final S3ObjectSummary object1 = createObjectSummary(1, "key1"); final S3ObjectSummary object2 = createObjectSummary(2, "key2"); final List<S3ObjectSummary> firstBatch = List.of(object1); @@ -117,12 +121,10 @@ void testFetchObjectSummariesWithPagination() throws IOException { final ListObjectsV2Result secondResult = createListObjectsV2Result(secondBatch, null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstResult).thenReturn(secondResult); - when(offsetManager.getOffsets()).thenReturn(new HashMap<>()); final Iterator<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); assertThat(summaries.next()).isNotNull(); - assertThat(summaries.next()).isNotNull(); } private ListObjectsV2Result createListObjectsV2Result(final List<S3ObjectSummary> summaries, @@ -141,14 +143,26 @@ private S3ObjectSummary createObjectSummary(final long sizeOfObject, final Strin return summary; } - private void setBasicProperties() { - 
properties.put(S3SourceConfig.INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); - properties.put("name", "test_source_connector"); - properties.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); - properties.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); - properties.put("tasks.max", "1"); - properties.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); - properties.put(TARGET_TOPIC_PARTITIONS, "0,1"); - properties.put(TARGET_TOPICS, "testtopic"); + private Iterator<S3ObjectSummary> getS3ObjectSummaryIterator(final String objectKey) { + final S3ObjectSummary objectSummary = createObjectSummary(1, objectKey); + final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result( + Collections.singletonList(objectSummary), null); + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); + + return fileReader.fetchObjectSummaries(s3Client); + } + + public void initializeWithTaskConfigs(final int maxTasks, final int taskId) { + final Map<String, String> configMap = getConfigMap(maxTasks, taskId); + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configMap); + fileReader = new FileReader(s3SourceConfig, TEST_BUCKET, Collections.emptySet()); + s3Client = mock(AmazonS3.class); + } + + private ListObjectsV2Result getListObjectsV2Result() { + final S3ObjectSummary zeroByteObject = createObjectSummary(0, "key1"); + final S3ObjectSummary nonZeroByteObject1 = createObjectSummary(1, "key2"); + final S3ObjectSummary nonZeroByteObject2 = createObjectSummary(1, "key3"); + return createListObjectsV2Result(List.of(zeroByteObject, nonZeroByteObject1, nonZeroByteObject2), null); } } From a31b0caa473cd088e9d7f1e4ec9c6fced342140f Mon Sep 17 00:00:00 2001 From: Murali Basani <muralidhar.basani@aiven.io> Date: Fri, 22 Nov 2024 16:29:21 +0100 Subject: [PATCH 69/90] [KCON35] : Improvement : Read files with stream instead of loading it all (#351) Currently the transformers load the files and get a list of records. This could cause performance issues for large files. * With Stream/StreamSupport, only when next() is called from iterator, a record is transformed. 
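The commit message above describes the switch from eagerly materialised record lists to lazily evaluated streams. Below is a generic sketch of the Stream/StreamSupport pattern it refers to; it is not the connector's transformer code and the names are illustrative. A record is only read when a terminal operation pulls the next element, and the underlying reader is released through `Stream.onClose`.

```java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.function.Consumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

// Generic lazy line stream: one record is read per tryAdvance call.
public final class LazyLineStream {
    private LazyLineStream() {
    }

    public static Stream<String> lines(final InputStream inputStream) {
        final BufferedReader reader = new BufferedReader(
                new InputStreamReader(inputStream, StandardCharsets.UTF_8));
        final Spliterator<String> spliterator = new Spliterators.AbstractSpliterator<String>(Long.MAX_VALUE,
                Spliterator.ORDERED | Spliterator.NONNULL) {
            @Override
            public boolean tryAdvance(final Consumer<? super String> action) {
                try {
                    final String line = reader.readLine();
                    if (line == null) {
                        return false; // end of input
                    }
                    action.accept(line);
                    return true;
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
            }
        };
        // The caller closes the stream, which in turn releases the reader.
        return StreamSupport.stream(spliterator, false).onClose(() -> {
            try {
                reader.close();
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        });
    }
}
```

A caller would typically consume it in a try-with-resources block, for example `try (Stream<String> lines = LazyLineStream.lines(in)) { lines.forEach(System.out::println); }`, so the reader is closed once the stream is done.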
--- s3-source-connector/build.gradle.kts | 1 - .../connect/s3/source/IntegrationTest.java | 55 ++++------- .../s3/source/input/AvroTransformer.java | 26 +++++- .../s3/source/input/ByteArrayTransformer.java | 49 +++++----- .../s3/source/input/JsonTransformer.java | 93 +++++++++++++------ .../s3/source/input/ParquetTransformer.java | 71 +++++++++----- .../connect/s3/source/input/Transformer.java | 7 +- .../s3/source/utils/SourceRecordIterator.java | 49 +++++----- .../input/ByteArrayTransformerTest.java | 48 ++++------ .../s3/source/input/JsonTransformerTest.java | 54 +++++++++-- .../source/input/ParquetTransformerTest.java | 49 +++++++++- .../utils/SourceRecordIteratorTest.java | 5 +- 12 files changed, 327 insertions(+), 180 deletions(-) diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts index ad2c69d2a..943dbc75c 100644 --- a/s3-source-connector/build.gradle.kts +++ b/s3-source-connector/build.gradle.kts @@ -117,7 +117,6 @@ dependencies { exclude(group = "org.apache.commons", module = "commons-math3") exclude(group = "org.apache.httpcomponents", module = "httpclient") exclude(group = "commons-codec", module = "commons-codec") - exclude(group = "commons-io", module = "commons-io") exclude(group = "commons-net", module = "commons-net") exclude(group = "org.eclipse.jetty") exclude(group = "org.eclipse.jetty.websocket") diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index bab6d1587..eb0e86003 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -22,9 +22,7 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_ENDPOINT_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.EXPECTED_MAX_MESSAGE_BYTES; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.INPUT_FORMAT_KEY; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; @@ -41,7 +39,6 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; -import java.time.Duration; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.util.HashMap; @@ -68,7 +65,6 @@ import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumWriter; -import org.awaitility.Awaitility; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; @@ -185,33 +181,6 @@ void bytesTest(final TestInfo testInfo) throws IOException { verifyOffsetPositions(offsetKeys, 4); } - @Test - void bytesTestBasedOnMaxMessageBytes(final TestInfo testInfo) throws IOException, InterruptedException { - final String testData = "AABBCCDDEE"; - final var topicName = 
IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 3); - connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); - connectorConfig.put(EXPECTED_MAX_MESSAGE_BYTES, "2"); // For above test data of 10 bytes length, with 2 bytes - // each - // in source record, we expect 5 records. - connectorConfig.put(MAX_POLL_RECORDS, "2"); // In 3 polls all the 5 records should be processed - - connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); - final String offsetKey = writeToS3(topicName, testData.getBytes(StandardCharsets.UTF_8), "00000"); - - // Poll messages from the Kafka topic and verify the consumed data - final List<String> records = IntegrationBase.consumeMessages(topicName, 5, connectRunner.getBootstrapServers()); - - // Verify that the correct data is read from the S3 bucket and pushed to Kafka - assertThat(records).containsExactly("AA", "BB", "CC", "DD", "EE"); - - Awaitility.await().atMost(Duration.ofMinutes(2)).untilAsserted(() -> { - final Map<String, Object> offsetRecs = IntegrationBase.consumeOffsetStorageMessages( - "connect-offset-topic-" + CONNECTOR_NAME, 1, connectRunner.getBootstrapServers()); - assertThat(offsetRecs).containsExactly(entry(offsetKey, 5)); - }); - } - @Test void avroTest(final TestInfo testInfo) throws IOException, InterruptedException { final var topicName = IntegrationBase.topicName(testInfo); @@ -227,16 +196,19 @@ void avroTest(final TestInfo testInfo) throws IOException, InterruptedException final Schema schema = parser.parse(schemaJson); final byte[] outputStream1 = getAvroRecord(schema, 1, 100); - final byte[] outputStream2 = getAvroRecord(schema, 2, 100); + final byte[] outputStream2 = getAvroRecord(schema, 101, 100); + final byte[] outputStream3 = getAvroRecord(schema, 201, 100); + final byte[] outputStream4 = getAvroRecord(schema, 301, 100); + final byte[] outputStream5 = getAvroRecord(schema, 401, 100); final Set<String> offsetKeys = new HashSet<>(); offsetKeys.add(writeToS3(topicName, outputStream1, "00001")); offsetKeys.add(writeToS3(topicName, outputStream2, "00001")); - offsetKeys.add(writeToS3(topicName, outputStream1, "00002")); - offsetKeys.add(writeToS3(topicName, outputStream2, "00002")); - offsetKeys.add(writeToS3(topicName, outputStream2, "00002")); + offsetKeys.add(writeToS3(topicName, outputStream3, "00002")); + offsetKeys.add(writeToS3(topicName, outputStream4, "00002")); + offsetKeys.add(writeToS3(topicName, outputStream5, "00002")); assertThat(testBucketAccessor.listObjects()).hasSize(5); @@ -249,7 +221,12 @@ void avroTest(final TestInfo testInfo) throws IOException, InterruptedException assertThat(records).hasSize(500) .map(record -> entry(record.get("id"), String.valueOf(record.get("message")))) .contains(entry(1, "Hello, Kafka Connect S3 Source! object 1"), - entry(2, "Hello, Kafka Connect S3 Source! object 2")); + entry(2, "Hello, Kafka Connect S3 Source! object 2"), + entry(100, "Hello, Kafka Connect S3 Source! object 100"), + entry(200, "Hello, Kafka Connect S3 Source! object 200"), + entry(300, "Hello, Kafka Connect S3 Source! object 300"), + entry(400, "Hello, Kafka Connect S3 Source! object 400"), + entry(500, "Hello, Kafka Connect S3 Source! 
object 500")); Thread.sleep(10_000); @@ -327,17 +304,17 @@ void jsonTest(final TestInfo testInfo) throws IOException { verifyOffsetPositions(offsetKeys, 1); } - private static byte[] getAvroRecord(final Schema schema, final int messageId, final int noOfAvroRecs) - throws IOException { + private static byte[] getAvroRecord(final Schema schema, int messageId, final int noOfAvroRecs) throws IOException { final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema); try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter); ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) { dataFileWriter.create(schema, outputStream); for (int i = 0; i < noOfAvroRecs; i++) { final GenericRecord avroRecord = new GenericData.Record(schema); // NOPMD - avroRecord.put("message", "Hello, Kafka Connect S3 Source! object " + i); + avroRecord.put("message", "Hello, Kafka Connect S3 Source! object " + messageId); avroRecord.put("id", messageId); dataFileWriter.append(avroRecord); + messageId++; // NOPMD } dataFileWriter.flush(); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java index a781f6bd1..dd2516692 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java @@ -24,15 +24,21 @@ import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import com.amazonaws.util.IOUtils; import org.apache.avro.file.DataFileReader; +import org.apache.avro.file.DataFileStream; import org.apache.avro.file.SeekableByteArrayInput; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumReader; +import org.apache.commons.io.function.IOSupplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,10 +52,10 @@ public void configureValueConverter(final Map<String, String> config, final S3So } @Override - public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition, - final S3SourceConfig s3SourceConfig) { + public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, + final int topicPartition, final S3SourceConfig s3SourceConfig) { final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); - return readAvroRecords(inputStream, datumReader); + return readAvroRecordsAsStream(inputStreamIOSupplier, datumReader); } @Override @@ -58,6 +64,20 @@ public byte[] getValueBytes(final Object record, final String topic, final S3Sou s3SourceConfig); } + private Stream<Object> readAvroRecordsAsStream(final IOSupplier<InputStream> inputStreamIOSupplier, + final DatumReader<GenericRecord> datumReader) { + try (DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inputStreamIOSupplier.get(), + datumReader)) { + // Wrap DataFileStream in a Stream using a Spliterator for lazy processing + return StreamSupport.stream( + Spliterators.spliteratorUnknownSize(dataFileStream, Spliterator.ORDERED | Spliterator.NONNULL), + false); + } catch (IOException e) { + LOGGER.error("Error in 
DataFileStream: {}", e.getMessage(), e); + return Stream.empty(); // Return an empty stream if initialization fails + } + } + List<Object> readAvroRecords(final InputStream content, final DatumReader<GenericRecord> datumReader) { final List<Object> records = new ArrayList<>(); try (SeekableByteArrayInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java index bc53e6330..8e36cab8c 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java @@ -16,16 +16,17 @@ package io.aiven.kafka.connect.s3.source.input; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.EXPECTED_MAX_MESSAGE_BYTES; - import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; import java.util.Map; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import org.apache.commons.io.function.IOSupplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,29 +38,31 @@ public void configureValueConverter(final Map<String, String> config, final S3So // For byte array transformations, ByteArrayConverter is the converter which is the default config. } - @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") @Override - public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition, - final S3SourceConfig s3SourceConfig) { + public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, + final int topicPartition, final S3SourceConfig s3SourceConfig) { - final int maxMessageBytesSize = s3SourceConfig.getInt(EXPECTED_MAX_MESSAGE_BYTES); - final byte[] buffer = new byte[maxMessageBytesSize]; - int bytesRead; + // Create a Stream that processes each chunk lazily + return StreamSupport.stream(new Spliterators.AbstractSpliterator<>(Long.MAX_VALUE, Spliterator.ORDERED) { + final byte[] buffer = new byte[4096]; - final List<Object> chunks = new ArrayList<>(); - try { - bytesRead = inputStream.read(buffer); - while (bytesRead != -1) { - final byte[] chunk = new byte[bytesRead]; - System.arraycopy(buffer, 0, chunk, 0, bytesRead); - chunks.add(chunk); - bytesRead = inputStream.read(buffer); + @Override + public boolean tryAdvance(final java.util.function.Consumer<? 
super Object> action) { + try (InputStream inputStream = inputStreamIOSupplier.get()) { + final int bytesRead = inputStream.read(buffer); + if (bytesRead == -1) { + return false; + } + final byte[] chunk = new byte[bytesRead]; + System.arraycopy(buffer, 0, chunk, 0, bytesRead); + action.accept(chunk); + return true; + } catch (IOException e) { + LOGGER.error("Error trying to advance byte stream: {}", e.getMessage(), e); + return false; + } } - } catch (IOException e) { - LOGGER.error("Error reading from input stream: {}", e.getMessage(), e); - } - - return chunks; + }, false); } @Override diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java index 5cda04f1a..80827fd8a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java @@ -23,15 +23,18 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.List; import java.util.Map; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.io.function.IOSupplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,30 +50,9 @@ public void configureValueConverter(final Map<String, String> config, final S3So } @Override - public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition, - final S3SourceConfig s3SourceConfig) { - final List<Object> jsonNodeList = new ArrayList<>(); - JsonNode jsonNode; - try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { - String line = reader.readLine(); - while (line != null) { - line = line.trim(); - if (!line.isEmpty()) { - try { - // Parse each line as a separate JSON object - jsonNode = objectMapper.readTree(line.trim()); // Parse the current line into a JsonNode - jsonNodeList.add(jsonNode); // Add parsed JSON object to the list - } catch (IOException e) { - LOGGER.error("Error parsing JSON record from S3 input stream: {}", e.getMessage(), e); - } - } - - line = reader.readLine(); - } - } catch (IOException e) { - LOGGER.error("Error reading S3 object stream: {}", e.getMessage()); - } - return jsonNodeList; + public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, + final int topicPartition, final S3SourceConfig s3SourceConfig) { + return readJsonRecordsAsStream(inputStreamIOSupplier); } @Override @@ -82,4 +64,63 @@ public byte[] getValueBytes(final Object record, final String topic, final S3Sou return new byte[0]; } } + + private Stream<Object> readJsonRecordsAsStream(final IOSupplier<InputStream> inputStreamIOSupplier) { + // Use a Stream that lazily processes each line as a JSON object + CustomSpliterator customSpliteratorParam; + try { + customSpliteratorParam = new CustomSpliterator(inputStreamIOSupplier); + } catch (IOException e) { + LOGGER.error("Error creating Json transformer CustomSpliterator: {}", e.getMessage(), e); + return 
Stream.empty(); + } + return StreamSupport.stream(customSpliteratorParam, false).onClose(() -> { + try { + customSpliteratorParam.reader.close(); // Ensure the reader is closed after streaming + } catch (IOException e) { + LOGGER.error("Error closing BufferedReader: {}", e.getMessage(), e); + } + }); + } + + /* + * This CustomSpliterator class is created so that BufferedReader instantiation is not closed before the all the + * records from stream is closed. With this now, we have a onclose method declared in parent declaration. + */ + final class CustomSpliterator extends Spliterators.AbstractSpliterator<Object> { + BufferedReader reader; + String line; + CustomSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier) throws IOException { + super(Long.MAX_VALUE, Spliterator.ORDERED | Spliterator.NONNULL); + reader = new BufferedReader(new InputStreamReader(inputStreamIOSupplier.get(), StandardCharsets.UTF_8)); + } + + @Override + public boolean tryAdvance(final java.util.function.Consumer<? super Object> action) { + try { + if (line == null) { + line = reader.readLine(); + } + while (line != null) { + line = line.trim(); + if (!line.isEmpty()) { + try { + final JsonNode jsonNode = objectMapper.readTree(line); // Parse the JSON + // line + action.accept(jsonNode); // Provide the parsed JSON node to the stream + } catch (IOException e) { + LOGGER.error("Error parsing JSON record: {}", e.getMessage(), e); + } + line = null; // NOPMD + return true; + } + line = reader.readLine(); + } + return false; // End of file + } catch (IOException e) { + LOGGER.error("Error reading S3 object stream: {}", e.getMessage(), e); + return false; + } + } + } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformer.java index 39fec83de..48b0abd33 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformer.java @@ -25,15 +25,18 @@ import java.nio.file.Files; import java.nio.file.Path; import java.time.Instant; -import java.util.ArrayList; import java.util.Collections; -import java.util.List; import java.util.Map; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import org.apache.avro.generic.GenericRecord; import org.apache.commons.compress.utils.IOUtils; +import org.apache.commons.io.function.IOSupplier; import org.apache.parquet.avro.AvroParquetReader; import org.apache.parquet.io.InputFile; import org.apache.parquet.io.LocalInputFile; @@ -50,9 +53,9 @@ public void configureValueConverter(final Map<String, String> config, final S3So } @Override - public List<Object> getRecords(final InputStream inputStream, final String topic, final int topicPartition, - final S3SourceConfig s3SourceConfig) { - return getParquetRecords(inputStream, topic, topicPartition); + public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, + final int topicPartition, final S3SourceConfig s3SourceConfig) { + return getParquetStreamRecords(inputStreamIOSupplier, topic, topicPartition); } @Override @@ -61,35 +64,59 @@ public byte[] getValueBytes(final Object record, final String topic, final S3Sou s3SourceConfig); } - private List<Object> 
getParquetRecords(final InputStream inputStream, final String topic, - final int topicPartition) { + private Stream<Object> getParquetStreamRecords(final IOSupplier<InputStream> inputStreamIOSupplier, + final String topic, final int topicPartition) { final String timestamp = String.valueOf(Instant.now().toEpochMilli()); File parquetFile; - final List<Object> records = new ArrayList<>(); + try { + // Create a temporary file for the Parquet data parquetFile = File.createTempFile(topic + "_" + topicPartition + "_" + timestamp, ".parquet"); } catch (IOException e) { - LOGGER.error("Error in reading s3 object stream {}", e.getMessage(), e); - return records; + LOGGER.error("Error creating temp file for Parquet data: {}", e.getMessage(), e); + return Stream.empty(); } - try (OutputStream outputStream = Files.newOutputStream(parquetFile.toPath())) { - IOUtils.copy(inputStream, outputStream); + try (OutputStream outputStream = Files.newOutputStream(parquetFile.toPath()); + InputStream inputStream = inputStreamIOSupplier.get();) { + IOUtils.copy(inputStream, outputStream); // Copy input stream to temporary file + final InputFile inputFile = new LocalInputFile(parquetFile.toPath()); - try (var parquetReader = AvroParquetReader.<GenericRecord>builder(inputFile).build()) { - GenericRecord record; - record = parquetReader.read(); - while (record != null) { - records.add(record); - record = parquetReader.read(); + final var parquetReader = AvroParquetReader.<GenericRecord>builder(inputFile).build(); + + return StreamSupport.stream(new Spliterators.AbstractSpliterator<Object>(Long.MAX_VALUE, + Spliterator.ORDERED | Spliterator.NONNULL) { + @Override + public boolean tryAdvance(final java.util.function.Consumer<? super Object> action) { + try { + final GenericRecord record = parquetReader.read(); + if (record != null) { + action.accept(record); // Pass record to the stream + return true; + } else { + parquetReader.close(); // Close reader at end of file + deleteTmpFile(parquetFile.toPath()); + return false; + } + } catch (IOException | RuntimeException e) { // NOPMD + LOGGER.error("Error reading Parquet record: {}", e.getMessage(), e); + deleteTmpFile(parquetFile.toPath()); + return false; + } } - } + }, false).onClose(() -> { + try { + parquetReader.close(); // Ensure reader is closed when the stream is closed + } catch (IOException e) { + LOGGER.error("Error closing Parquet reader: {}", e.getMessage(), e); + } + deleteTmpFile(parquetFile.toPath()); + }); } catch (IOException | RuntimeException e) { // NOPMD - LOGGER.error("Error in reading s3 object stream {}", e.getMessage(), e); - } finally { + LOGGER.error("Error processing Parquet data: {}", e.getMessage(), e); deleteTmpFile(parquetFile.toPath()); + return Stream.empty(); } - return records; } static void deleteTmpFile(final Path parquetFile) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/Transformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/Transformer.java index 70fe28d96..616cfdb77 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/Transformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/Transformer.java @@ -17,16 +17,19 @@ package io.aiven.kafka.connect.s3.source.input; import java.io.InputStream; -import java.util.List; import java.util.Map; +import java.util.stream.Stream; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import org.apache.commons.io.function.IOSupplier; + public 
interface Transformer { void configureValueConverter(Map<String, String> config, S3SourceConfig s3SourceConfig); - List<Object> getRecords(InputStream inputStream, String topic, int topicPartition, S3SourceConfig s3SourceConfig); + Stream<Object> getRecords(IOSupplier<InputStream> inputStreamIOSupplier, String topic, int topicPartition, + S3SourceConfig s3SourceConfig); byte[] getValueBytes(Object record, String topic, S3SourceConfig s3SourceConfig); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 8c1fcb77d..3a6c40812 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -19,7 +19,6 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; import java.io.IOException; -import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; @@ -28,6 +27,7 @@ import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Stream; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.input.Transformer; @@ -35,7 +35,6 @@ import com.amazonaws.AmazonClientException; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.S3Object; -import com.amazonaws.services.s3.model.S3ObjectInputStream; import com.amazonaws.services.s3.model.S3ObjectSummary; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -95,8 +94,7 @@ private void nextS3Object() { } private Iterator<S3SourceRecord> createIteratorForCurrentFile() throws IOException { - try (S3Object s3Object = s3Client.getObject(bucketName, currentObjectKey); - S3ObjectInputStream inputStream = s3Object.getObjectContent()) { + try (S3Object s3Object = s3Client.getObject(bucketName, currentObjectKey);) { final Matcher fileMatcher = FILE_DEFAULT_PATTERN.matcher(currentObjectKey); String topicName; @@ -107,7 +105,6 @@ private Iterator<S3SourceRecord> createIteratorForCurrentFile() throws IOExcepti defaultPartitionId = Integer.parseInt(fileMatcher.group(PATTERN_PARTITION_KEY)); } else { LOGGER.error("File naming doesn't match to any topic. 
{}", currentObjectKey); - inputStream.abort(); s3Object.close(); return Collections.emptyIterator(); } @@ -118,13 +115,13 @@ private Iterator<S3SourceRecord> createIteratorForCurrentFile() throws IOExcepti final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(topicName, defaultPartitionId, bucketName); - return getObjectIterator(inputStream, finalTopic, defaultPartitionId, defaultStartOffsetId, transformer, + return getObjectIterator(s3Object, finalTopic, defaultPartitionId, defaultStartOffsetId, transformer, partitionMap); } } @SuppressWarnings("PMD.CognitiveComplexity") - private Iterator<S3SourceRecord> getObjectIterator(final InputStream valueInputStream, final String topic, + private Iterator<S3SourceRecord> getObjectIterator(final S3Object s3Object, final String topic, final int topicPartition, final long startOffset, final Transformer transformer, final Map<String, Object> partitionMap) { return new Iterator<>() { @@ -136,24 +133,34 @@ private List<S3SourceRecord> readNext() { int numOfProcessedRecs = 1; boolean checkOffsetMap = true; - for (final Object record : transformer.getRecords(valueInputStream, topic, topicPartition, - s3SourceConfig)) { - if (offsetManager.shouldSkipRecord(partitionMap, currentObjectKey, numOfProcessedRecs) - && checkOffsetMap) { + try (Stream<Object> recordStream = transformer.getRecords(s3Object::getObjectContent, topic, + topicPartition, s3SourceConfig)) { + final Iterator<Object> recordIterator = recordStream.iterator(); + while (recordIterator.hasNext()) { + final Object record = recordIterator.next(); + + // Check if the record should be skipped based on the offset + if (offsetManager.shouldSkipRecord(partitionMap, currentObjectKey, numOfProcessedRecs) + && checkOffsetMap) { + numOfProcessedRecs++; + continue; + } + + final byte[] valueBytes = transformer.getValueBytes(record, topic, s3SourceConfig); + checkOffsetMap = false; + + sourceRecords.add(getSourceRecord(keyBytes, valueBytes, topic, topicPartition, offsetManager, + startOffset, partitionMap)); + numOfProcessedRecs++; - continue; - } - final byte[] valueBytes = transformer.getValueBytes(record, topic, s3SourceConfig); - checkOffsetMap = false; - sourceRecords.add(getSourceRecord(keyBytes, valueBytes, topic, topicPartition, offsetManager, - startOffset, partitionMap)); - if (sourceRecords.size() >= s3SourceConfig.getInt(MAX_POLL_RECORDS)) { - break; + // Break if we have reached the max records per poll + if (sourceRecords.size() >= s3SourceConfig.getInt(MAX_POLL_RECORDS)) { + break; + } } - - numOfProcessedRecs++; } + return sourceRecords; } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java index db743748f..2486cfadd 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java @@ -16,18 +16,17 @@ package io.aiven.kafka.connect.s3.source.input; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.EXPECTED_MAX_MESSAGE_BYTES; -import static org.junit.jupiter.api.Assertions.assertArrayEquals; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.mockito.Mockito.when; +import static org.assertj.core.api.Assertions.assertThat; import java.io.ByteArrayInputStream; -import java.io.IOException; import 
java.io.InputStream; import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import org.apache.commons.io.function.IOSupplier; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -37,6 +36,7 @@ @ExtendWith(MockitoExtension.class) final class ByteArrayTransformerTest { + public static final String TEST_TOPIC = "test-topic"; private ByteArrayTransformer byteArrayTransformer; @Mock @@ -51,45 +51,33 @@ void setUp() { void testGetRecordsSingleChunk() { final byte[] data = { 1, 2, 3, 4, 5 }; final InputStream inputStream = new ByteArrayInputStream(data); + final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; - when(s3SourceConfig.getInt(EXPECTED_MAX_MESSAGE_BYTES)).thenReturn(10_000); // Larger than data size + final Stream<Object> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, + s3SourceConfig); - final List<Object> records = byteArrayTransformer.getRecords(inputStream, "test-topic", 0, s3SourceConfig); - - assertEquals(1, records.size()); - assertArrayEquals(data, (byte[]) records.get(0)); - } - - @Test - void testGetRecordsMultipleChunks() { - final byte[] data = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; - final InputStream inputStream = new ByteArrayInputStream(data); - - when(s3SourceConfig.getInt(EXPECTED_MAX_MESSAGE_BYTES)).thenReturn(5); // Smaller than data size - - final List<Object> records = byteArrayTransformer.getRecords(inputStream, "test-topic", 0, s3SourceConfig); - - assertEquals(2, records.size()); - assertArrayEquals(new byte[] { 1, 2, 3, 4, 5 }, (byte[]) records.get(0)); - assertArrayEquals(new byte[] { 6, 7, 8, 9, 10 }, (byte[]) records.get(1)); + final List<Object> recs = records.collect(Collectors.toList()); + assertThat(recs).hasSize(1); + assertThat((byte[]) recs.get(0)).isEqualTo(data); } @Test - void testGetRecordsEmptyInputStream() throws IOException { + void testGetRecordsEmptyInputStream() { final InputStream inputStream = new ByteArrayInputStream(new byte[] {}); - when(s3SourceConfig.getInt(EXPECTED_MAX_MESSAGE_BYTES)).thenReturn(5); + final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; - final List<Object> records = byteArrayTransformer.getRecords(inputStream, "test-topic", 0, s3SourceConfig); + final Stream<Object> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, + s3SourceConfig); - assertEquals(0, records.size()); + assertThat(records).hasSize(0); } @Test void testGetValueBytes() { final byte[] record = { 1, 2, 3 }; - final byte[] result = byteArrayTransformer.getValueBytes(record, "test-topic", s3SourceConfig); + final byte[] result = byteArrayTransformer.getValueBytes(record, TEST_TOPIC, s3SourceConfig); - assertArrayEquals(record, result); + assertThat(result).containsExactlyInAnyOrder(record); } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java index e24711f36..bdf4780d1 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java @@ -20,31 +20,39 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.assertEquals; import 
static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.HashMap; -import java.util.List; import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.io.function.IOSupplier; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; @ExtendWith(MockitoExtension.class) final class JsonTransformerTest { + public static final String TESTTOPIC = "testtopic"; JsonTransformer jsonTransformer; S3SourceConfig s3SourceConfig; + @Mock + private IOSupplier<InputStream> inputStreamIOSupplierMock; + @BeforeEach void setUp() { jsonTransformer = new JsonTransformer(); @@ -63,32 +71,64 @@ void testConfigureValueConverter() { void testHandleValueDataWithValidJson() { final InputStream validJsonInputStream = new ByteArrayInputStream( "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); - final List<Object> jsonNodes = jsonTransformer.getRecords(validJsonInputStream, "testtopic", 1, s3SourceConfig); + final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; + final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + s3SourceConfig); - assertThat(jsonNodes.size()).isEqualTo(1); + assertThat(jsonNodes.collect(Collectors.toList())).hasSize(1); } @Test void testHandleValueDataWithInvalidJson() { final InputStream invalidJsonInputStream = new ByteArrayInputStream( "invalid-json".getBytes(StandardCharsets.UTF_8)); + final IOSupplier<InputStream> inputStreamIOSupplier = () -> invalidJsonInputStream; - final List<Object> jsonNodes = jsonTransformer.getRecords(invalidJsonInputStream, "testtopic", 1, + final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, s3SourceConfig); - assertThat(jsonNodes.size()).isEqualTo(0); + assertThat(jsonNodes.collect(Collectors.toList())).hasSize(0); } @Test void testSerializeJsonDataValid() throws IOException { final InputStream validJsonInputStream = new ByteArrayInputStream( "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); - final List<Object> jsonNodes = jsonTransformer.getRecords(validJsonInputStream, "testtopic", 1, s3SourceConfig); - final byte[] serializedData = jsonTransformer.getValueBytes(jsonNodes.get(0), "testtopic", s3SourceConfig); + final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; + final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + s3SourceConfig); + final byte[] serializedData = jsonTransformer.getValueBytes(jsonNodes.findFirst().get(), TESTTOPIC, + s3SourceConfig); final ObjectMapper objectMapper = new ObjectMapper(); final JsonNode expectedData = objectMapper.readTree(serializedData); assertThat(expectedData.get("key").asText()).isEqualTo("value"); } + + @Test + void testGetRecordsWithIOException() throws IOException { + when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException")); + final Stream<Object> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null); + + 
assertThat(resultStream).isEmpty(); + } + + @Test + void testCustomSpliteratorStreamProcessing() throws IOException { + final String jsonContent = "{\"key\":\"value\"}\n{\"key2\":\"value2\"}"; + final InputStream inputStream = new ByteArrayInputStream(jsonContent.getBytes(StandardCharsets.UTF_8)); + final IOSupplier<InputStream> supplier = () -> inputStream; + + final JsonTransformer.CustomSpliterator spliterator = jsonTransformer.new CustomSpliterator(supplier); + assertThat(spliterator.tryAdvance(jsonNode -> assertThat(jsonNode).isNotNull())).isTrue(); + } + + @Test + void testCustomSpliteratorWithIOExceptionDuringInitialization() throws IOException { + when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException during initialization")); + final Stream<Object> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null); + + assertThat(resultStream).isEmpty(); + } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java index 69d7ac493..08f462595 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java @@ -17,24 +17,32 @@ package io.aiven.kafka.connect.s3.source.input; import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; import java.io.ByteArrayInputStream; +import java.io.File; import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; import com.amazonaws.util.IOUtils; import org.apache.avro.generic.GenericRecord; +import org.apache.commons.io.function.IOSupplier; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; @ExtendWith(MockitoExtension.class) @@ -50,11 +58,13 @@ public void setUp() { void testHandleValueDataWithZeroBytes() { final byte[] mockParquetData = new byte[0]; final InputStream inputStream = new ByteArrayInputStream(mockParquetData); + final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); final String topic = "test-topic"; final int topicPartition = 0; - final List<Object> recs = parquetTransformer.getRecords(inputStream, topic, topicPartition, s3SourceConfig); + final Stream<Object> recs = parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, + s3SourceConfig); assertThat(recs).isEmpty(); } @@ -63,12 +73,15 @@ void testHandleValueDataWithZeroBytes() { void testGetRecordsWithValidData() throws Exception { final byte[] mockParquetData = generateMockParquetData(); final InputStream inputStream = new ByteArrayInputStream(mockParquetData); + final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; final S3SourceConfig s3SourceConfig = 
mock(S3SourceConfig.class); final String topic = "test-topic"; final int topicPartition = 0; - final List<Object> records = parquetTransformer.getRecords(inputStream, topic, topicPartition, s3SourceConfig); + final List<Object> records = parquetTransformer + .getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig) + .collect(Collectors.toList()); assertThat(records).isNotEmpty(); assertThat(records).extracting(record -> ((GenericRecord) record).get("name").toString()) @@ -80,12 +93,15 @@ void testGetRecordsWithValidData() throws Exception { void testGetRecordsWithInvalidData() { final byte[] invalidData = "invalid data".getBytes(StandardCharsets.UTF_8); final InputStream inputStream = new ByteArrayInputStream(invalidData); + final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; + final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); final String topic = "test-topic"; final int topicPartition = 0; - final List<Object> records = parquetTransformer.getRecords(inputStream, topic, topicPartition, s3SourceConfig); + final Stream<Object> records = parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, + s3SourceConfig); assertThat(records).isEmpty(); } @@ -102,4 +118,31 @@ private byte[] generateMockParquetData() throws IOException { final Path path = ContentUtils.getTmpFilePath("name"); return IOUtils.toByteArray(Files.newInputStream(path)); } + + @Test + void testIOExceptionCreatingTempFile() { + try (var mockStatic = Mockito.mockStatic(File.class)) { + mockStatic.when(() -> File.createTempFile(anyString(), anyString())) + .thenThrow(new IOException("Test IOException for temp file")); + + final IOSupplier<InputStream> inputStreamSupplier = mock(IOSupplier.class); + final Stream<Object> resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", 1, + null); + + assertThat(resultStream).isEmpty(); + } + } + + @Test + void testIOExceptionDuringDataCopy() throws IOException { + try (InputStream inputStreamMock = mock(InputStream.class)) { + when(inputStreamMock.read(any(byte[].class))).thenThrow(new IOException("Test IOException during copy")); + + final IOSupplier<InputStream> inputStreamSupplier = () -> inputStreamMock; + final Stream<Object> resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", 1, + null); + + assertThat(resultStream).isEmpty(); + } + } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index a2fb31d9f..e6ba44756 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -30,6 +30,7 @@ import java.nio.charset.StandardCharsets; import java.util.Collections; import java.util.List; +import java.util.stream.Stream; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.input.Transformer; @@ -78,8 +79,7 @@ void testIteratorProcessesS3Objects() throws Exception { when(mockS3Client.getObject(anyString(), anyString())).thenReturn(mockS3Object); when(mockS3Object.getObjectContent()).thenReturn(mockInputStream); - when(mockTransformer.getRecords(any(), anyString(), anyInt(), any())) - .thenReturn(Collections.singletonList(new Object())); + when(mockTransformer.getRecords(any(), anyString(), 
anyInt(), any())).thenReturn(Stream.of(new Object())); final String outStr = "this is a test"; when(mockTransformer.getValueBytes(any(), anyString(), any())) @@ -102,7 +102,6 @@ void testIteratorProcessesS3Objects() throws Exception { assertTrue(iterator.hasNext()); assertNotNull(iterator.next()); } - } private ListObjectsV2Result mockListObjectsResult(final List<S3ObjectSummary> summaries) { From 4afb06113899f10ef57834b493fc91288cbcb7b8 Mon Sep 17 00:00:00 2001 From: Ryan Skraba <ryan.skraba@aiven.io> Date: Mon, 25 Nov 2024 16:03:41 +0100 Subject: [PATCH 70/90] Use AssertJ for tests --- .../s3/source/input/JsonTransformerTest.java | 9 ++++----- .../s3/source/utils/RecordProcessorTest.java | 7 +++---- .../s3/source/utils/SourceRecordIteratorTest.java | 13 +++++-------- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java index bdf4780d1..0abf61c29 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java @@ -18,7 +18,6 @@ import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMAS_ENABLE; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.jupiter.api.Assertions.assertEquals; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -28,7 +27,6 @@ import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; -import java.util.stream.Collectors; import java.util.stream.Stream; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; @@ -64,7 +62,8 @@ void testConfigureValueConverter() { final Map<String, String> config = new HashMap<>(); jsonTransformer.configureValueConverter(config, s3SourceConfig); - assertEquals("false", config.get(SCHEMAS_ENABLE), "SCHEMAS_ENABLE should be set to false"); + assertThat(config).as("%s should be set to false", SCHEMAS_ENABLE) + .containsEntry(SCHEMAS_ENABLE, Boolean.FALSE.toString()); } @Test @@ -75,7 +74,7 @@ void testHandleValueDataWithValidJson() { final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, s3SourceConfig); - assertThat(jsonNodes.collect(Collectors.toList())).hasSize(1); + assertThat(jsonNodes).hasSize(1); } @Test @@ -87,7 +86,7 @@ void testHandleValueDataWithInvalidJson() { final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, s3SourceConfig); - assertThat(jsonNodes.collect(Collectors.toList())).hasSize(0); + assertThat(jsonNodes).isEmpty(); } @Test diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java index cc7d765c6..f6aea18d5 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -17,7 +17,6 @@ package io.aiven.kafka.connect.s3.source.utils; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyString; import static 
org.mockito.Mockito.mock; @@ -90,7 +89,7 @@ void testProcessRecordsNoRecords() { transformer, fileReader, offsetManager ); - assertTrue(processedRecords.isEmpty(), "Processed records should be empty when there are no records."); + assertThat(processedRecords).as("Processed records should be empty when there are no records.").isEmpty(); } @Test @@ -112,7 +111,7 @@ void testProcessRecordsWithRecords() throws ConnectException { transformer, fileReader, offsetManager ); - assertThat(results.size()).isEqualTo(1); + assertThat(results).hasSize(1); verify(sourceRecordIterator, times(1)).next(); } @@ -132,7 +131,7 @@ void testProcessRecordsConnectorStopped() { transformer, fileReader, offsetManager ); - assertTrue(processedRecords.isEmpty(), "Processed records should be empty when connector is stopped."); + assertThat(processedRecords).as("Processed records should be empty when connector is stopped.").isEmpty(); verify(sourceRecordIterator, never()).next(); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index e6ba44756..bb4ca8ead 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -16,10 +16,7 @@ package io.aiven.kafka.connect.s3.source.utils; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.any; import static org.mockito.Mockito.anyInt; import static org.mockito.Mockito.anyString; @@ -91,16 +88,16 @@ void testIteratorProcessesS3Objects() throws Exception { SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockS3Client, "test-bucket", mockOffsetManager, mockTransformer, mockFileReader); - assertFalse(iterator.hasNext()); - assertNull(iterator.next()); + assertThat(iterator.hasNext()).isFalse(); + assertThat(iterator.next()).isNull(); when(mockFileReader.fetchObjectSummaries(any())).thenReturn(mockObjectSummaries.listIterator()); iterator = new SourceRecordIterator(mockConfig, mockS3Client, "test-bucket", mockOffsetManager, mockTransformer, mockFileReader); - assertTrue(iterator.hasNext()); - assertNotNull(iterator.next()); + assertThat(iterator.hasNext()).isTrue(); + assertThat(iterator.next()).isNotNull(); } } From 413b1fe400cbca89f4eddc1fe6858f9f17f34732 Mon Sep 17 00:00:00 2001 From: Anatoly Popov <anatolii.popov@aiven.io> Date: Tue, 26 Nov 2024 15:28:03 +0200 Subject: [PATCH 71/90] Integration tests improvements (#354) * Migrating tests to Awaitility instead of plain Thread.sleep * Some refactoring to unify message consumption logic in tests where possible. 
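A rough sketch of the pattern this change adopts (illustrative only — the helper name, timeouts and expected count below are placeholders, not values taken verbatim from this patch): rather than sleeping a fixed amount of time and polling once, the tests poll repeatedly until the expected records arrive or a deadline is hit.

    import static org.assertj.core.api.Assertions.assertThat;
    import static org.awaitility.Awaitility.await;

    import java.time.Duration;
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    import java.util.Properties;

    import org.apache.kafka.clients.consumer.ConsumerRecord;
    import org.apache.kafka.clients.consumer.KafkaConsumer;

    final class AwaitilityPollingSketch {
        // Hypothetical helper: keep polling the topic until expectedCount values have been seen,
        // failing if that does not happen within the deadline (no fixed Thread.sleep involved).
        static <K, V> List<V> consume(final Properties consumerProps, final String topic, final int expectedCount) {
            try (KafkaConsumer<K, V> consumer = new KafkaConsumer<>(consumerProps)) {
                consumer.subscribe(Collections.singletonList(topic));
                final List<V> values = new ArrayList<>();
                await().atMost(Duration.ofMinutes(2)).pollInterval(Duration.ofSeconds(1)).untilAsserted(() -> {
                    for (final ConsumerRecord<K, V> record : consumer.poll(Duration.ofMillis(500L))) {
                        values.add(record.value());
                    }
                    assertThat(values).hasSize(expectedCount);
                });
                return values;
            }
        }
    }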
--- .../connect/s3/source/IntegrationBase.java | 142 +++++++----------- .../connect/s3/source/IntegrationTest.java | 113 ++++++++------ 2 files changed, 114 insertions(+), 141 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index 523bae1f2..8312673cb 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -21,9 +21,7 @@ import java.io.File; import java.io.IOException; -import java.net.ConnectException; import java.net.ServerSocket; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.time.Duration; @@ -45,6 +43,7 @@ import org.apache.kafka.clients.consumer.ConsumerRecords; import org.apache.kafka.clients.consumer.KafkaConsumer; import org.apache.kafka.common.serialization.ByteArrayDeserializer; +import org.apache.kafka.common.serialization.Deserializer; import org.apache.kafka.common.serialization.StringDeserializer; import org.apache.kafka.connect.json.JsonDeserializer; @@ -132,113 +131,74 @@ static List<Integer> getKafkaListenerPorts() throws IOException { } } - static List<String> consumeMessages(final String topic, final int expectedMessageCount, - final String bootstrapServers) { - final Properties props = new Properties(); - props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); - props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group"); - props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); - props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); - props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); - - try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) { - consumer.subscribe(Collections.singletonList(topic)); - final List<String> messages = new ArrayList<>(); - - // Poll messages from the topic - while (messages.size() < expectedMessageCount) { - final ConsumerRecords<byte[], byte[]> records = consumer.poll(5L); - for (final ConsumerRecord<byte[], byte[]> record : records) { - messages.add(new String(record.value(), StandardCharsets.UTF_8)); - } - } - - return messages; - } + static List<String> consumeByteMessages(final String topic, final int expectedMessageCount, + String bootstrapServers) { + final Properties consumerProperties = getConsumerProperties(bootstrapServers, ByteArrayDeserializer.class, + ByteArrayDeserializer.class); + final List<byte[]> objects = consumeMessages(topic, expectedMessageCount, consumerProperties); + return objects.stream().map(String::new).collect(Collectors.toList()); } static List<GenericRecord> consumeAvroMessages(final String topic, final int expectedMessageCount, final String bootstrapServers, final String schemaRegistryUrl) { - final Properties props = new Properties(); - props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); - props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group-avro"); - // Assuming string key - props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); - // Avro deserializer for values - props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, KafkaAvroDeserializer.class.getName()); - 
props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); - props.put("schema.registry.url", schemaRegistryUrl); // URL of the schema registry - props.put("specific.avro.reader", "false"); // Use GenericRecord instead of specific Avro classes - - try (KafkaConsumer<String, GenericRecord> consumer = new KafkaConsumer<>(props)) { - consumer.subscribe(Collections.singletonList(topic)); - final List<GenericRecord> recordsList = new ArrayList<>(); - - // Poll messages from the topic - while (recordsList.size() < expectedMessageCount) { - final ConsumerRecords<String, GenericRecord> records = consumer.poll(500L); - for (final ConsumerRecord<String, GenericRecord> record : records) { - recordsList.add(record.value()); - } - } - - return recordsList; - } + final Properties consumerProperties = getConsumerProperties(bootstrapServers, StringDeserializer.class, + KafkaAvroDeserializer.class, schemaRegistryUrl); + return consumeMessages(topic, expectedMessageCount, consumerProperties); } static List<JsonNode> consumeJsonMessages(final String topic, final int expectedMessageCount, final String bootstrapServers) { - final Properties props = new Properties(); - props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); - props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group-json"); - // Assuming string key - props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); - // Json deserializer for values - props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, JsonDeserializer.class.getName()); - props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + final Properties consumerProperties = getConsumerProperties(bootstrapServers, StringDeserializer.class, + JsonDeserializer.class); + return consumeMessages(topic, expectedMessageCount, consumerProperties); + } - try (KafkaConsumer<String, JsonNode> consumer = new KafkaConsumer<>(props)) { + static <K, V> List<V> consumeMessages(final String topic, final int expectedMessageCount, + final Properties consumerProperties) { + try (KafkaConsumer<K, V> consumer = new KafkaConsumer<>(consumerProperties)) { consumer.subscribe(Collections.singletonList(topic)); - final List<JsonNode> recordsList = new ArrayList<>(); - // Poll messages from the topic - while (recordsList.size() < expectedMessageCount) { - final ConsumerRecords<String, JsonNode> records = consumer.poll(500L); - for (final ConsumerRecord<String, JsonNode> record : records) { - recordsList.add(record.value()); // Add the GenericRecord to the list + final List<V> recordValues = new ArrayList<>(); + await().atMost(Duration.ofMinutes(2)).pollInterval(Duration.ofSeconds(1)).untilAsserted(() -> { + final ConsumerRecords<K, V> records = consumer.poll(Duration.ofMillis(500L)); + for (final ConsumerRecord<K, V> record : records) { + recordValues.add(record.value()); } - } + assertThat(recordValues).hasSize(expectedMessageCount); + }); + return recordValues; + } + } - return recordsList; + static Map<String, Object> consumeOffsetMessages(KafkaConsumer<byte[], byte[]> consumer) throws IOException { + // Poll messages from the topic + final Map<String, Object> messages = new HashMap<>(); + final ConsumerRecords<byte[], byte[]> records = consumer.poll(Duration.ofSeconds(1)); + for (final ConsumerRecord<byte[], byte[]> record : records) { + Map<String, Object> offsetRec = OBJECT_MAPPER.readValue(record.value(), new TypeReference<>() { // NOPMD + }); + messages.putAll(offsetRec); } + return messages; + } + + static <K, V> Properties 
getConsumerProperties(String bootstrapServers, + Class<? extends Deserializer<K>> keyDeserializer, Class<? extends Deserializer<V>> valueDeserializer, + String schemaRegistryUrl) { + final Properties props = getConsumerProperties(bootstrapServers, keyDeserializer, valueDeserializer); + props.put("specific.avro.reader", "false"); // Use GenericRecord instead of specific Avro classes + props.put("schema.registry.url", schemaRegistryUrl); // URL of the schema registry + return props; } - static Map<String, Object> consumeOffsetStorageMessages(final String topic, final int expectedMessageCount, - final String bootstrapServer) throws ConnectException { + static <K, V> Properties getConsumerProperties(String bootstrapServers, + Class<? extends Deserializer<K>> keyDeserializer, Class<? extends Deserializer<V>> valueDeserializer) { final Properties props = new Properties(); - props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServer); + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group"); - props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); - props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName()); + props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, keyDeserializer.getName()); + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, valueDeserializer.getName()); props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); - - try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) { - consumer.subscribe(Collections.singletonList(topic)); - - // Poll messages from the topic - final Map<String, Object> messages = new HashMap<>(); - while (messages.size() < expectedMessageCount) { - final ConsumerRecords<byte[], byte[]> records = consumer.poll(Duration.ofMillis(5L)); - for (final ConsumerRecord<byte[], byte[]> record : records) { - Map<String, Object> offsetRec = OBJECT_MAPPER.readValue(record.value(), new TypeReference<>() { // NOPMD - }); - messages.putAll(offsetRec); - } - } - return messages; - - } catch (IOException e) { - throw new ConnectException("Error while consuming messages " + e.getMessage()); - } + return props; } } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index eb0e86003..84c494e66 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -31,25 +31,32 @@ import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; import static java.util.Map.entry; import static org.assertj.core.api.Assertions.assertThat; +import static org.awaitility.Awaitility.await; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.net.ConnectException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.time.Duration; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.Set; +import java.util.function.Function; import 
java.util.stream.Collectors; import java.util.stream.IntStream; import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; import io.aiven.kafka.connect.s3.source.input.InputFormat; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; @@ -150,7 +157,7 @@ void tearDown() { } @Test - void bytesTest(final TestInfo testInfo) throws IOException { + void bytesTest(final TestInfo testInfo) { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 2); @@ -160,7 +167,7 @@ void bytesTest(final TestInfo testInfo) throws IOException { final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; final String testData2 = "Hello, Kafka Connect S3 Source! object 2"; - final Set<String> offsetKeys = new HashSet<>(); + final List<String> offsetKeys = new ArrayList<>(); // write 2 objects to s3 offsetKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000")); @@ -172,17 +179,21 @@ void bytesTest(final TestInfo testInfo) throws IOException { assertThat(testBucketAccessor.listObjects()).hasSize(5); // Poll messages from the Kafka topic and verify the consumed data - final List<String> records = IntegrationBase.consumeMessages(topicName, 4, connectRunner.getBootstrapServers()); + final List<String> records = IntegrationBase.consumeByteMessages(topicName, 4, + connectRunner.getBootstrapServers()); // Verify that the correct data is read from the S3 bucket and pushed to Kafka assertThat(records).containsOnly(testData1, testData2); // Verify offset positions - verifyOffsetPositions(offsetKeys, 4); + final Map<String, Object> expectedOffsetRecords = offsetKeys.subList(0, offsetKeys.size() - 1) + .stream() + .collect(Collectors.toMap(Function.identity(), s -> 1)); + verifyOffsetPositions(expectedOffsetRecords, connectRunner.getBootstrapServers()); } @Test - void avroTest(final TestInfo testInfo) throws IOException, InterruptedException { + void avroTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getAvroConfig(topicName, InputFormat.AVRO); @@ -195,11 +206,11 @@ void avroTest(final TestInfo testInfo) throws IOException, InterruptedException final Schema.Parser parser = new Schema.Parser(); final Schema schema = parser.parse(schemaJson); - final byte[] outputStream1 = getAvroRecord(schema, 1, 100); - final byte[] outputStream2 = getAvroRecord(schema, 101, 100); - final byte[] outputStream3 = getAvroRecord(schema, 201, 100); - final byte[] outputStream4 = getAvroRecord(schema, 301, 100); - final byte[] outputStream5 = getAvroRecord(schema, 401, 100); + final byte[] outputStream1 = generateNextAvroMessagesStartingFromId(1, 100, schema); + final byte[] outputStream2 = generateNextAvroMessagesStartingFromId(101, 100, schema); + final byte[] outputStream3 = generateNextAvroMessagesStartingFromId(201, 100, schema); + final byte[] outputStream4 = generateNextAvroMessagesStartingFromId(301, 100, schema); + final byte[] outputStream5 = generateNextAvroMessagesStartingFromId(401, 100, schema); final Set<String> offsetKeys = new HashSet<>(); @@ -218,8 +229,7 @@ void avroTest(final TestInfo testInfo) throws IOException, InterruptedException // deserializes Avro // Verify that the correct data is read from the S3 bucket and pushed to Kafka - assertThat(records).hasSize(500) - 
.map(record -> entry(record.get("id"), String.valueOf(record.get("message")))) + assertThat(records).map(record -> entry(record.get("id"), String.valueOf(record.get("message")))) .contains(entry(1, "Hello, Kafka Connect S3 Source! object 1"), entry(2, "Hello, Kafka Connect S3 Source! object 2"), entry(100, "Hello, Kafka Connect S3 Source! object 100"), @@ -228,11 +238,8 @@ void avroTest(final TestInfo testInfo) throws IOException, InterruptedException entry(400, "Hello, Kafka Connect S3 Source! object 400"), entry(500, "Hello, Kafka Connect S3 Source! object 500")); - Thread.sleep(10_000); - - final Map<String, Object> offsetRecs = IntegrationBase.consumeOffsetStorageMessages( - "connect-offset-topic-" + CONNECTOR_NAME, 5, connectRunner.getBootstrapServers()); - assertThat(offsetRecs).containsOnlyKeys(offsetKeys).values().containsOnly(100); + verifyOffsetPositions(offsetKeys.stream().collect(Collectors.toMap(Function.identity(), s -> 100)), + connectRunner.getBootstrapServers()); } @Test @@ -264,9 +271,9 @@ void parquetTest(final TestInfo testInfo) throws IOException { .containsExactlyInAnyOrderElementsOf(expectedRecordNames); } - private Map<String, String> getAvroConfig(final String topicName, final InputFormat parquet) { + private Map<String, String> getAvroConfig(final String topicName, final InputFormat inputFormat) { final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 4); - connectorConfig.put(INPUT_FORMAT_KEY, parquet.getValue()); + connectorConfig.put(INPUT_FORMAT_KEY, inputFormat.getValue()); connectorConfig.put(SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); connectorConfig.put(VALUE_CONVERTER_KEY, "io.confluent.connect.avro.AvroConverter"); connectorConfig.put(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); @@ -275,7 +282,7 @@ private Map<String, String> getAvroConfig(final String topicName, final InputFor } @Test - void jsonTest(final TestInfo testInfo) throws IOException { + void jsonTest(final TestInfo testInfo) { final var topicName = IntegrationBase.topicName(testInfo); final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.JSONL.getValue()); @@ -289,32 +296,33 @@ void jsonTest(final TestInfo testInfo) throws IOException { jsonBuilder.append(jsonContent).append("\n"); // NOPMD } final byte[] jsonBytes = jsonBuilder.toString().getBytes(StandardCharsets.UTF_8); - final Set<String> offsetKeys = new HashSet<>(); - offsetKeys.add(writeToS3(topicName, jsonBytes, "00001")); + final String offsetKey = writeToS3(topicName, jsonBytes, "00001"); + // Poll Json messages from the Kafka topic and deserialize them final List<JsonNode> records = IntegrationBase.consumeJsonMessages(topicName, 500, connectRunner.getBootstrapServers()); - assertThat(records).hasSize(500); - assertThat(records).extracting(record -> record.get("payload").get("message").asText()).contains(testMessage); - assertThat(records).extracting(record -> record.get("payload").get("id").asText()).contains("1"); + assertThat(records).map(jsonNode -> jsonNode.get("payload")).anySatisfy(jsonNode -> { + assertThat(jsonNode.get("message").asText()).contains(testMessage); + assertThat(jsonNode.get("id").asText()).contains("1"); + }); // Verify offset positions - verifyOffsetPositions(offsetKeys, 1); + verifyOffsetPositions(Map.of(offsetKey, 500), connectRunner.getBootstrapServers()); } - private static byte[] getAvroRecord(final Schema schema, int messageId, final int 
noOfAvroRecs) throws IOException { + private static byte[] generateNextAvroMessagesStartingFromId(final int messageId, final int noOfAvroRecs, + final Schema schema) throws IOException { final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema); try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter); ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) { dataFileWriter.create(schema, outputStream); - for (int i = 0; i < noOfAvroRecs; i++) { + for (int i = messageId; i < messageId + noOfAvroRecs; i++) { final GenericRecord avroRecord = new GenericData.Record(schema); // NOPMD - avroRecord.put("message", "Hello, Kafka Connect S3 Source! object " + messageId); - avroRecord.put("id", messageId); + avroRecord.put("message", "Hello, Kafka Connect S3 Source! object " + i); + avroRecord.put("id", i); dataFileWriter.append(avroRecord); - messageId++; // NOPMD } dataFileWriter.flush(); @@ -331,7 +339,17 @@ private static String writeToS3(final String topicName, final byte[] testDataByt } private Map<String, String> getConfig(final String connectorName, final String topics, final int maxTasks) { - final Map<String, String> config = new HashMap<>(basicConnectorConfig(connectorName, maxTasks)); + final Map<String, String> config = new HashMap<>(basicS3ConnectorConfig()); + config.put("name", connectorName); + config.put(TARGET_TOPICS, topics); + config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + config.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.converters.ByteArrayConverter"); + config.put("tasks.max", String.valueOf(maxTasks)); + return config; + } + + private static Map<String, String> basicS3ConnectorConfig() { + final Map<String, String> config = new HashMap<>(); config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); config.put(AWS_ACCESS_KEY_ID_CONFIG, S3_ACCESS_KEY_ID); config.put(AWS_SECRET_ACCESS_KEY_CONFIG, S3_SECRET_ACCESS_KEY); @@ -339,25 +357,20 @@ private Map<String, String> getConfig(final String connectorName, final String t config.put(AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET_NAME); config.put(AWS_S3_PREFIX_CONFIG, s3Prefix); config.put(TARGET_TOPIC_PARTITIONS, "0,1"); - config.put(TARGET_TOPICS, topics); return config; } - private Map<String, String> basicConnectorConfig(final String connectorName, final int maxTasks) { - final Map<String, String> config = new HashMap<>(); - config.put("name", connectorName); - config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); - config.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.converters.ByteArrayConverter"); - config.put("tasks.max", String.valueOf(maxTasks)); - return config; - } - - private Map<String, Object> verifyOffsetPositions(final Set<String> offsetKeys, final int messagesCount) - throws ConnectException { - final Map<String, Object> offsetRecs = IntegrationBase.consumeOffsetStorageMessages( - "connect-offset-topic-" + CONNECTOR_NAME, messagesCount, connectRunner.getBootstrapServers()); - - assertThat(offsetRecs.keySet()).hasSize(messagesCount).isSubsetOf(offsetKeys); - return offsetRecs; + static void verifyOffsetPositions(final Map<String, Object> expectedRecords, final String bootstrapServers) { + final Properties consumerProperties = IntegrationBase.getConsumerProperties(bootstrapServers, + ByteArrayDeserializer.class, ByteArrayDeserializer.class); + + final Map<String, Object> offsetRecs = new HashMap<>(); + try (KafkaConsumer<byte[], byte[]> consumer = new 
KafkaConsumer<>(consumerProperties)) { + consumer.subscribe(Collections.singletonList("connect-offset-topic-" + CONNECTOR_NAME)); + await().atMost(Duration.ofMinutes(1)).pollInterval(Duration.ofSeconds(1)).untilAsserted(() -> { + offsetRecs.putAll(IntegrationBase.consumeOffsetMessages(consumer)); + assertThat(offsetRecs).containsExactlyInAnyOrderEntriesOf(expectedRecords); + }); + } } } From d8263066909009a4553db178fc18dbd82b8c534f Mon Sep 17 00:00:00 2001 From: Murali Basani <muralidhar.basani@aiven.io> Date: Thu, 28 Nov 2024 16:50:02 +0100 Subject: [PATCH 72/90] Read large avro files [KCON-64] (#359) Current implementation cannot handle large Avro files, due to the initialisation of stream in try resources within transformer. - In this custom splitter - The tryAdvance method reads one record at a time and processes it. - Updated integration test with large number of avro records in one object --- .../connect/s3/source/IntegrationBase.java | 2 +- .../connect/s3/source/IntegrationTest.java | 30 +++++---- .../s3/source/input/AvroTransformer.java | 63 ++++++++++++++++--- 3 files changed, 75 insertions(+), 20 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index 8312673cb..9ce09172b 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -159,7 +159,7 @@ static <K, V> List<V> consumeMessages(final String topic, final int expectedMess consumer.subscribe(Collections.singletonList(topic)); final List<V> recordValues = new ArrayList<>(); - await().atMost(Duration.ofMinutes(2)).pollInterval(Duration.ofSeconds(1)).untilAsserted(() -> { + await().atMost(Duration.ofMinutes(5)).pollInterval(Duration.ofSeconds(5)).untilAsserted(() -> { final ConsumerRecords<K, V> records = consumer.poll(Duration.ofMillis(500L)); for (final ConsumerRecord<K, V> record : records) { recordValues.add(record.value()); diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 84c494e66..05514430d 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -206,11 +206,17 @@ void avroTest(final TestInfo testInfo) throws IOException { final Schema.Parser parser = new Schema.Parser(); final Schema schema = parser.parse(schemaJson); - final byte[] outputStream1 = generateNextAvroMessagesStartingFromId(1, 100, schema); - final byte[] outputStream2 = generateNextAvroMessagesStartingFromId(101, 100, schema); - final byte[] outputStream3 = generateNextAvroMessagesStartingFromId(201, 100, schema); - final byte[] outputStream4 = generateNextAvroMessagesStartingFromId(301, 100, schema); - final byte[] outputStream5 = generateNextAvroMessagesStartingFromId(401, 100, schema); + final int numOfRecsFactor = 5000; + + final byte[] outputStream1 = generateNextAvroMessagesStartingFromId(1, numOfRecsFactor, schema); + final byte[] outputStream2 = generateNextAvroMessagesStartingFromId(numOfRecsFactor + 1, numOfRecsFactor, + schema); + final byte[] outputStream3 = 
generateNextAvroMessagesStartingFromId(2 * numOfRecsFactor + 1, numOfRecsFactor, + schema); + final byte[] outputStream4 = generateNextAvroMessagesStartingFromId(3 * numOfRecsFactor + 1, numOfRecsFactor, + schema); + final byte[] outputStream5 = generateNextAvroMessagesStartingFromId(4 * numOfRecsFactor + 1, numOfRecsFactor, + schema); final Set<String> offsetKeys = new HashSet<>(); @@ -224,7 +230,7 @@ void avroTest(final TestInfo testInfo) throws IOException { assertThat(testBucketAccessor.listObjects()).hasSize(5); // Poll Avro messages from the Kafka topic and deserialize them - final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 500, + final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, numOfRecsFactor * 5, connectRunner.getBootstrapServers(), schemaRegistry.getSchemaRegistryUrl()); // Ensure this method // deserializes Avro @@ -232,13 +238,13 @@ void avroTest(final TestInfo testInfo) throws IOException { assertThat(records).map(record -> entry(record.get("id"), String.valueOf(record.get("message")))) .contains(entry(1, "Hello, Kafka Connect S3 Source! object 1"), entry(2, "Hello, Kafka Connect S3 Source! object 2"), - entry(100, "Hello, Kafka Connect S3 Source! object 100"), - entry(200, "Hello, Kafka Connect S3 Source! object 200"), - entry(300, "Hello, Kafka Connect S3 Source! object 300"), - entry(400, "Hello, Kafka Connect S3 Source! object 400"), - entry(500, "Hello, Kafka Connect S3 Source! object 500")); + entry(numOfRecsFactor, "Hello, Kafka Connect S3 Source! object " + numOfRecsFactor), + entry(2 * numOfRecsFactor, "Hello, Kafka Connect S3 Source! object " + (2 * numOfRecsFactor)), + entry(3 * numOfRecsFactor, "Hello, Kafka Connect S3 Source! object " + (3 * numOfRecsFactor)), + entry(4 * numOfRecsFactor, "Hello, Kafka Connect S3 Source! object " + (4 * numOfRecsFactor)), + entry(5 * numOfRecsFactor, "Hello, Kafka Connect S3 Source! 
object " + (5 * numOfRecsFactor))); - verifyOffsetPositions(offsetKeys.stream().collect(Collectors.toMap(Function.identity(), s -> 100)), + verifyOffsetPositions(offsetKeys.stream().collect(Collectors.toMap(Function.identity(), s -> numOfRecsFactor)), connectRunner.getBootstrapServers()); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java index dd2516692..223c8f61e 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java @@ -25,7 +25,7 @@ import java.util.List; import java.util.Map; import java.util.Spliterator; -import java.util.Spliterators; +import java.util.function.Consumer; import java.util.stream.Stream; import java.util.stream.StreamSupport; @@ -66,18 +66,67 @@ public byte[] getValueBytes(final Object record, final String topic, final S3Sou private Stream<Object> readAvroRecordsAsStream(final IOSupplier<InputStream> inputStreamIOSupplier, final DatumReader<GenericRecord> datumReader) { - try (DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inputStreamIOSupplier.get(), - datumReader)) { - // Wrap DataFileStream in a Stream using a Spliterator for lazy processing - return StreamSupport.stream( - Spliterators.spliteratorUnknownSize(dataFileStream, Spliterator.ORDERED | Spliterator.NONNULL), - false); + InputStream inputStream; // NOPMD CloseResource: being closed in try resources iterator + DataFileStream<GenericRecord> dataFileStream; // NOPMD CloseResource: being closed in try resources iterator + try { + // Open input stream from S3 + inputStream = inputStreamIOSupplier.get(); + + // Ensure the DataFileStream is initialized correctly with the open stream + dataFileStream = new DataFileStream<>(inputStream, datumReader); + + // Wrap DataFileStream in a Stream using a custom Spliterator for lazy processing + return StreamSupport.stream(new AvroRecordSpliterator<>(dataFileStream), false).onClose(() -> { + try { + dataFileStream.close(); // Ensure the reader is closed after streaming + } catch (IOException e) { + LOGGER.error("Error closing BufferedReader: {}", e.getMessage(), e); + } + }); } catch (IOException e) { LOGGER.error("Error in DataFileStream: {}", e.getMessage(), e); return Stream.empty(); // Return an empty stream if initialization fails } } + private static class AvroRecordSpliterator<T> implements Spliterator<T> { + private final DataFileStream<GenericRecord> dataFileStream; + + public AvroRecordSpliterator(final DataFileStream<GenericRecord> dataFileStream) { + this.dataFileStream = dataFileStream; + } + + @Override + public boolean tryAdvance(final Consumer<? 
super T> action) { + try { + if (dataFileStream.hasNext()) { + final GenericRecord record = dataFileStream.next(); + action.accept((T) record); + return true; + } + } catch (Exception e) { // NOPMD AvoidCatchingGenericException + LOGGER.error("Error while reading Avro record: {}", e.getMessage(), e); + return false; + } + return false; + } + + @Override + public Spliterator<T> trySplit() { + return null; // Can't split the data stream as DataFileStream is sequential + } + + @Override + public long estimateSize() { + return Long.MAX_VALUE; // We don't know the size upfront + } + + @Override + public int characteristics() { + return Spliterator.ORDERED | Spliterator.NONNULL; + } + } + List<Object> readAvroRecords(final InputStream content, final DatumReader<GenericRecord> datumReader) { final List<Object> records = new ArrayList<>(); try (SeekableByteArrayInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { From d2b8e55b08fab86d3bb89677d3e91a15c2f8d80a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aindri=C3=BA=20Lavelle?= <121855584+aindriu-aiven@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:39:30 +0000 Subject: [PATCH 73/90] Move source configuration and transformers into common config (#360) Move all the source common config so it can be re-used by other source connectors. * Adds two config fragments which are logically groupings for source connector configuration * Parquet is moved with library changes to use the same parquet version as the sink connector. --------- Signed-off-by: Aindriu Lavelle <aindriu.lavelle@aiven.io> --- commons/build.gradle.kts | 5 +- .../common/config/SchemaRegistryFragment.java | 76 ++++ .../common/config/SourceCommonConfig.java | 42 ++ .../common/config/SourceConfigFragment.java | 80 ++++ .../common}/source/input/AvroTransformer.java | 18 +- .../source/input/ByteArrayTransformer.java | 10 +- .../common}/source/input/InputFormat.java | 2 +- .../common}/source/input/JsonTransformer.java | 12 +- .../source/input/ParquetTransformer.java | 19 +- .../source/input/TransformationUtils.java | 18 +- .../common}/source/input/Transformer.java | 10 +- .../source/input/TransformerFactory.java | 13 +- .../source/input/parquet/LocalInputFile.java | 103 +++++ .../source/input/AvroTransformerTest.java | 12 +- .../input/ByteArrayTransformerTest.java | 12 +- .../common/source/input/ContentUtils.java | 99 +++++ .../source/input/JsonTransformerTest.java | 20 +- .../source/input/ParquetTransformerTest.java | 13 +- .../connect/config/s3/S3ConfigFragment.java | 10 +- .../connect/config/s3/S3SourceBaseConfig.java | 113 ------ s3-source-connector/build.gradle.kts | 19 +- .../connect/s3/source/IntegrationTest.java | 26 +- .../kafka/connect/s3/source/S3SourceTask.java | 8 +- .../s3/source/config/AwsAccessSecret.java | 43 -- .../config/AwsCredentialProviderFactory.java | 61 --- .../source/config/AwsStsEndpointConfig.java | 43 -- .../connect/s3/source/config/AwsStsRole.java | 62 --- .../s3/source/config/S3ClientFactory.java | 4 +- .../s3/source/config/S3SourceConfig.java | 369 +++--------------- .../s3/source/config/S3SourceConfigDef.java | 4 +- .../connect/s3/source/utils/FileReader.java | 2 +- .../s3/source/utils/OffsetManager.java | 6 +- .../s3/source/utils/RecordProcessor.java | 4 +- .../s3/source/utils/SourceRecordIterator.java | 6 +- .../connect/s3/source/S3SourceTaskTest.java | 34 +- .../s3/source/config/S3SourceConfigTest.java | 34 +- .../s3/source/testutils/ContentUtils.java | 72 ++-- .../s3/source/utils/FileReaderTest.java | 2 + 
.../s3/source/utils/OffsetManagerTest.java | 7 +- .../s3/source/utils/RecordProcessorTest.java | 8 +- .../utils/SourceRecordIteratorTest.java | 2 +- settings.gradle.kts | 1 + 42 files changed, 678 insertions(+), 826 deletions(-) create mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/config/SchemaRegistryFragment.java create mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java rename {s3-source-connector/src/main/java/io/aiven/kafka/connect/s3 => commons/src/main/java/io/aiven/kafka/connect/common}/source/input/AvroTransformer.java (91%) rename {s3-source-connector/src/main/java/io/aiven/kafka/connect/s3 => commons/src/main/java/io/aiven/kafka/connect/common}/source/input/ByteArrayTransformer.java (89%) rename {s3-source-connector/src/main/java/io/aiven/kafka/connect/s3 => commons/src/main/java/io/aiven/kafka/connect/common}/source/input/InputFormat.java (94%) rename {s3-source-connector/src/main/java/io/aiven/kafka/connect/s3 => commons/src/main/java/io/aiven/kafka/connect/common}/source/input/JsonTransformer.java (92%) rename {s3-source-connector/src/main/java/io/aiven/kafka/connect/s3 => commons/src/main/java/io/aiven/kafka/connect/common}/source/input/ParquetTransformer.java (89%) rename {s3-source-connector/src/main/java/io/aiven/kafka/connect/s3 => commons/src/main/java/io/aiven/kafka/connect/common}/source/input/TransformationUtils.java (75%) rename {s3-source-connector/src/main/java/io/aiven/kafka/connect/s3 => commons/src/main/java/io/aiven/kafka/connect/common}/source/input/Transformer.java (72%) rename {s3-source-connector/src/main/java/io/aiven/kafka/connect/s3 => commons/src/main/java/io/aiven/kafka/connect/common}/source/input/TransformerFactory.java (65%) create mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/input/parquet/LocalInputFile.java rename {s3-source-connector/src/test/java/io/aiven/kafka/connect/s3 => commons/src/test/java/io/aiven/kafka/connect/common}/source/input/AvroTransformerTest.java (91%) rename {s3-source-connector/src/test/java/io/aiven/kafka/connect/s3 => commons/src/test/java/io/aiven/kafka/connect/common}/source/input/ByteArrayTransformerTest.java (89%) create mode 100644 commons/src/test/java/io/aiven/kafka/connect/common/source/input/ContentUtils.java rename {s3-source-connector/src/test/java/io/aiven/kafka/connect/s3 => commons/src/test/java/io/aiven/kafka/connect/common}/source/input/JsonTransformerTest.java (90%) rename {s3-source-connector/src/test/java/io/aiven/kafka/connect/s3 => commons/src/test/java/io/aiven/kafka/connect/common}/source/input/ParquetTransformerTest.java (92%) delete mode 100644 s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3SourceBaseConfig.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsAccessSecret.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsCredentialProviderFactory.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsEndpointConfig.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsRole.java diff --git a/commons/build.gradle.kts b/commons/build.gradle.kts index 9bdc06b78..232404466 100644 --- a/commons/build.gradle.kts +++ b/commons/build.gradle.kts @@ -27,7 +27,7 @@ dependencies { implementation(confluent.kafka.connect.avro.data) { exclude(group = "org.apache.kafka", module = "kafka-clients") } - + 
implementation("commons-io:commons-io:2.18.0") implementation(tools.spotbugs.annotations) implementation(compressionlibs.snappy) implementation(compressionlibs.zstd.jni) @@ -41,6 +41,7 @@ dependencies { exclude(group = "org.slf4j", module = "slf4j-api") exclude(group = "org.apache.avro", module = "avro") } + implementation(apache.hadoop.common) { exclude(group = "org.apache.hadoop.thirdparty", module = "hadoop-shaded-protobuf_3_7") exclude(group = "com.google.guava", module = "guava") @@ -87,10 +88,10 @@ dependencies { testImplementation(testinglibs.mockito.core) testImplementation(testinglibs.assertj.core) testImplementation(testFixtures(project(":commons"))) - testImplementation(testinglibs.woodstox.stax2.api) testImplementation(apache.hadoop.mapreduce.client.core) testImplementation(confluent.kafka.connect.avro.converter) + testImplementation("org.mockito:mockito-junit-jupiter:5.14.2") testRuntimeOnly(testinglibs.junit.jupiter.engine) testRuntimeOnly(logginglibs.logback.classic) diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SchemaRegistryFragment.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SchemaRegistryFragment.java new file mode 100644 index 000000000..8ea7b7f95 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SchemaRegistryFragment.java @@ -0,0 +1,76 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.config; + +import java.util.Locale; + +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.common.config.ConfigDef; + +import io.aiven.kafka.connect.common.source.input.InputFormat; + +public final class SchemaRegistryFragment extends ConfigFragment { + private static final String SCHEMAREGISTRY_GROUP = "Schema registry group"; + public static final String SCHEMA_REGISTRY_URL = "schema.registry.url"; + public static final String VALUE_CONVERTER_SCHEMA_REGISTRY_URL = "value.converter.schema.registry.url"; + public static final String AVRO_VALUE_SERIALIZER = "value.serializer"; + public static final String INPUT_FORMAT_KEY = "input.format"; + public static final String SCHEMAS_ENABLE = "schemas.enable"; + + /** + * Construct the ConfigFragment.. + * + * @param cfg + * the configuration that this fragment is associated with. 
+ */ + public SchemaRegistryFragment(final AbstractConfig cfg) { + super(cfg); + } + + public static ConfigDef update(final ConfigDef configDef) { + int srCounter = 0; + configDef.define(SCHEMA_REGISTRY_URL, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", SCHEMAREGISTRY_GROUP, srCounter++, + ConfigDef.Width.NONE, SCHEMA_REGISTRY_URL); + configDef.define(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, ConfigDef.Type.STRING, null, + new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", + SCHEMAREGISTRY_GROUP, srCounter++, ConfigDef.Width.NONE, VALUE_CONVERTER_SCHEMA_REGISTRY_URL); + configDef.define(INPUT_FORMAT_KEY, ConfigDef.Type.STRING, InputFormat.BYTES.getValue(), + new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, + "Input format of messages read from source avro/json/parquet/bytes", SCHEMAREGISTRY_GROUP, srCounter++, // NOPMD + ConfigDef.Width.NONE, INPUT_FORMAT_KEY); + + configDef.define(AVRO_VALUE_SERIALIZER, ConfigDef.Type.CLASS, null, ConfigDef.Importance.MEDIUM, + "Avro value serializer", SCHEMAREGISTRY_GROUP, srCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, AVRO_VALUE_SERIALIZER); + return configDef; + } + + public InputFormat getInputFormat() { + return InputFormat.valueOf(cfg.getString(INPUT_FORMAT_KEY).toUpperCase(Locale.ROOT)); + } + + public String getSchemaRegistryUrl() { + return cfg.getString(SCHEMA_REGISTRY_URL); + } + + public Class<?> getAvroValueSerializer() { + return cfg.getClass(AVRO_VALUE_SERIALIZER); + } + +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java index e363d7c9a..44575e5e0 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java @@ -20,8 +20,50 @@ import org.apache.kafka.common.config.ConfigDef; +import io.aiven.kafka.connect.common.source.input.InputFormat; + public class SourceCommonConfig extends CommonConfig { + + private final SchemaRegistryFragment schemaRegistryFragment; + private final SourceConfigFragment sourceConfigFragment; + private final FileNameFragment fileNameFragment; + private final OutputFormatFragment outputFormatFragment; + public SourceCommonConfig(ConfigDef definition, Map<?, ?> originals) {// NOPMD super(definition, originals); + // Construct Fragments + schemaRegistryFragment = new SchemaRegistryFragment(this); + sourceConfigFragment = new SourceConfigFragment(this); + fileNameFragment = new FileNameFragment(this); + outputFormatFragment = new OutputFormatFragment(this); + + validate(); // NOPMD ConstructorCallsOverridableMethod + } + + private void validate() { + schemaRegistryFragment.validate(); + sourceConfigFragment.validate(); + fileNameFragment.validate(); + outputFormatFragment.validate(); + } + + public InputFormat getInputFormat() { + return schemaRegistryFragment.getInputFormat(); } + + public String getSchemaRegistryUrl() { + return schemaRegistryFragment.getSchemaRegistryUrl(); + } + + public String getTargetTopics() { + return sourceConfigFragment.getTargetTopics(); + } + public String getTargetTopicPartitions() { + return sourceConfigFragment.getTargetTopicPartitions(); + } + + public int getMaxPollRecords() { + return sourceConfigFragment.getMaxPollRecords(); + } + } diff --git 
a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java new file mode 100644 index 000000000..568610da7 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java @@ -0,0 +1,80 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.config; + +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.common.config.ConfigDef; + +public final class SourceConfigFragment extends ConfigFragment { + private static final String GROUP_OTHER = "OTHER_CFG"; + public static final String MAX_POLL_RECORDS = "max.poll.records"; + public static final String EXPECTED_MAX_MESSAGE_BYTES = "expected.max.message.bytes"; + private static final String GROUP_OFFSET_TOPIC = "OFFSET_TOPIC"; + public static final String TARGET_TOPIC_PARTITIONS = "topic.partitions"; + public static final String TARGET_TOPICS = "topics"; + + /** + * Construct the ConfigFragment.. + * + * @param cfg + * the configuration that this fragment is associated with. + */ + public SourceConfigFragment(final AbstractConfig cfg) { + super(cfg); + } + + public static ConfigDef update(final ConfigDef configDef) { + int sourcePollingConfigCounter = 0; + + configDef.define(MAX_POLL_RECORDS, ConfigDef.Type.INT, 500, ConfigDef.Range.atLeast(1), + ConfigDef.Importance.MEDIUM, "Max poll records", GROUP_OTHER, sourcePollingConfigCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, MAX_POLL_RECORDS); + configDef.define(EXPECTED_MAX_MESSAGE_BYTES, ConfigDef.Type.INT, 1_048_588, ConfigDef.Importance.MEDIUM, + "The largest record batch size allowed by Kafka config max.message.bytes", GROUP_OTHER, + sourcePollingConfigCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, EXPECTED_MAX_MESSAGE_BYTES); + + // Offset Storage config group includes target topics + int offsetStorageGroupCounter = 0; + configDef.define(TARGET_TOPIC_PARTITIONS, ConfigDef.Type.STRING, "0", new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "eg : 0,1", GROUP_OFFSET_TOPIC, offsetStorageGroupCounter++, + ConfigDef.Width.NONE, TARGET_TOPIC_PARTITIONS); + configDef.define(TARGET_TOPICS, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "eg : connect-storage-offsets", GROUP_OFFSET_TOPIC, + offsetStorageGroupCounter++, ConfigDef.Width.NONE, TARGET_TOPICS); // NOPMD + return configDef; + } + + public String getTargetTopics() { + return cfg.getString(TARGET_TOPICS); + } + + public String getTargetTopicPartitions() { + return cfg.getString(TARGET_TOPIC_PARTITIONS); + } + + public int getMaxPollRecords() { + return cfg.getInt(MAX_POLL_RECORDS); + } + + public int getExpectedMaxMessageBytes() { + return cfg.getInt(EXPECTED_MAX_MESSAGE_BYTES); + } + +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java 
b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java similarity index 91% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java rename to commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java index 223c8f61e..8869acb52 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/AvroTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java @@ -14,9 +14,9 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.input; +package io.aiven.kafka.connect.common.source.input; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; import java.io.IOException; import java.io.InputStream; @@ -29,15 +29,15 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import org.apache.kafka.common.config.AbstractConfig; -import com.amazonaws.util.IOUtils; import org.apache.avro.file.DataFileReader; import org.apache.avro.file.DataFileStream; import org.apache.avro.file.SeekableByteArrayInput; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumReader; +import org.apache.commons.io.IOUtils; import org.apache.commons.io.function.IOSupplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,21 +47,21 @@ public class AvroTransformer implements Transformer { private static final Logger LOGGER = LoggerFactory.getLogger(AvroTransformer.class); @Override - public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { - config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + public void configureValueConverter(final Map<String, String> config, final AbstractConfig sourceConfig) { + config.put(SCHEMA_REGISTRY_URL, sourceConfig.getString(SCHEMA_REGISTRY_URL)); } @Override public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, - final int topicPartition, final S3SourceConfig s3SourceConfig) { + final int topicPartition, final AbstractConfig sourceConfig) { final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); return readAvroRecordsAsStream(inputStreamIOSupplier, datumReader); } @Override - public byte[] getValueBytes(final Object record, final String topic, final S3SourceConfig s3SourceConfig) { + public byte[] getValueBytes(final Object record, final String topic, final AbstractConfig sourceConfig) { return TransformationUtils.serializeAvroRecordToBytes(Collections.singletonList((GenericRecord) record), topic, - s3SourceConfig); + sourceConfig); } private Stream<Object> readAvroRecordsAsStream(final IOSupplier<InputStream> inputStreamIOSupplier, diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java similarity index 89% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java rename to commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java index 8e36cab8c..644a3f719 100644 --- 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.input; +package io.aiven.kafka.connect.common.source.input; import java.io.IOException; import java.io.InputStream; @@ -24,7 +24,7 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import org.apache.kafka.common.config.AbstractConfig; import org.apache.commons.io.function.IOSupplier; import org.slf4j.Logger; @@ -34,13 +34,13 @@ public class ByteArrayTransformer implements Transformer { private static final Logger LOGGER = LoggerFactory.getLogger(ByteArrayTransformer.class); @Override - public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { + public void configureValueConverter(final Map<String, String> config, final AbstractConfig sourceConfig) { // For byte array transformations, ByteArrayConverter is the converter which is the default config. } @Override public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, - final int topicPartition, final S3SourceConfig s3SourceConfig) { + final int topicPartition, final AbstractConfig sourceConfig) { // Create a Stream that processes each chunk lazily return StreamSupport.stream(new Spliterators.AbstractSpliterator<>(Long.MAX_VALUE, Spliterator.ORDERED) { @@ -66,7 +66,7 @@ public boolean tryAdvance(final java.util.function.Consumer<? super Object> acti } @Override - public byte[] getValueBytes(final Object record, final String topic, final S3SourceConfig s3SourceConfig) { + public byte[] getValueBytes(final Object record, final String topic, final AbstractConfig sourceConfig) { return (byte[]) record; } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/InputFormat.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/InputFormat.java similarity index 94% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/InputFormat.java rename to commons/src/main/java/io/aiven/kafka/connect/common/source/input/InputFormat.java index 12334ba7a..8234e2c03 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/InputFormat.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/InputFormat.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.input; +package io.aiven.kafka.connect.common.source.input; import java.util.Locale; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java similarity index 92% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java rename to commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java index 80827fd8a..acaa6884a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/JsonTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.s3.source.input; +package io.aiven.kafka.connect.common.source.input; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMAS_ENABLE; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMAS_ENABLE; import java.io.BufferedReader; import java.io.IOException; @@ -29,7 +29,7 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import org.apache.kafka.common.config.AbstractConfig; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; @@ -45,18 +45,18 @@ public class JsonTransformer implements Transformer { final ObjectMapper objectMapper = new ObjectMapper(); @Override - public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { + public void configureValueConverter(final Map<String, String> config, final AbstractConfig sourceConfig) { config.put(SCHEMAS_ENABLE, "false"); } @Override public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, - final int topicPartition, final S3SourceConfig s3SourceConfig) { + final int topicPartition, final AbstractConfig sourceConfig) { return readJsonRecordsAsStream(inputStreamIOSupplier); } @Override - public byte[] getValueBytes(final Object record, final String topic, final S3SourceConfig s3SourceConfig) { + public byte[] getValueBytes(final Object record, final String topic, final AbstractConfig sourceConfig) { try { return objectMapper.writeValueAsBytes(record); } catch (JsonProcessingException e) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java similarity index 89% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformer.java rename to commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java index 48b0abd33..24f44e1bf 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.s3.source.input; +package io.aiven.kafka.connect.common.source.input; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; import java.io.File; import java.io.IOException; @@ -32,14 +32,15 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import org.apache.kafka.common.config.AbstractConfig; + +import io.aiven.kafka.connect.common.source.input.parquet.LocalInputFile; import org.apache.avro.generic.GenericRecord; import org.apache.commons.compress.utils.IOUtils; import org.apache.commons.io.function.IOSupplier; import org.apache.parquet.avro.AvroParquetReader; import org.apache.parquet.io.InputFile; -import org.apache.parquet.io.LocalInputFile; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -48,20 +49,20 @@ public class ParquetTransformer implements Transformer { private static final Logger LOGGER = LoggerFactory.getLogger(ParquetTransformer.class); @Override - public void configureValueConverter(final Map<String, String> config, final S3SourceConfig s3SourceConfig) { - config.put(SCHEMA_REGISTRY_URL, s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + public void configureValueConverter(final Map<String, String> config, final AbstractConfig sourceConfig) { + config.put(SCHEMA_REGISTRY_URL, sourceConfig.getString(SCHEMA_REGISTRY_URL)); } @Override public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, - final int topicPartition, final S3SourceConfig s3SourceConfig) { + final int topicPartition, final AbstractConfig sourceConfig) { return getParquetStreamRecords(inputStreamIOSupplier, topic, topicPartition); } @Override - public byte[] getValueBytes(final Object record, final String topic, final S3SourceConfig s3SourceConfig) { + public byte[] getValueBytes(final Object record, final String topic, final AbstractConfig sourceConfig) { return TransformationUtils.serializeAvroRecordToBytes(Collections.singletonList((GenericRecord) record), topic, - s3SourceConfig); + sourceConfig); } private Stream<Object> getParquetStreamRecords(final IOSupplier<InputStream> inputStreamIOSupplier, diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformationUtils.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformationUtils.java similarity index 75% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformationUtils.java rename to commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformationUtils.java index 9c6e31f9d..9f81d4406 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformationUtils.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformationUtils.java @@ -14,10 +14,9 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.s3.source.input; +package io.aiven.kafka.connect.common.source.input; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -26,7 +25,9 @@ import java.util.List; import java.util.Map; -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import org.apache.kafka.common.config.AbstractConfig; + +import io.aiven.kafka.connect.common.config.SchemaRegistryFragment; import io.confluent.kafka.serializers.KafkaAvroSerializer; import org.apache.avro.generic.GenericRecord; @@ -40,12 +41,13 @@ private TransformationUtils() { // hidden } - static byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String topic, - final S3SourceConfig s3SourceConfig) { + public static byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String topic, + final AbstractConfig sourceConfig) { + final SchemaRegistryFragment registryFragment = new SchemaRegistryFragment(sourceConfig); final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, - s3SourceConfig.getString(SCHEMA_REGISTRY_URL)); + registryFragment.getSchemaRegistryUrl()); - try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) s3SourceConfig.getClass(VALUE_SERIALIZER) + try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) registryFragment.getAvroValueSerializer() .getDeclaredConstructor() .newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { avroSerializer.configure(config, false); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/Transformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java similarity index 72% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/Transformer.java rename to commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java index 616cfdb77..8867ed6d9 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/Transformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java @@ -14,22 +14,22 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.s3.source.input; +package io.aiven.kafka.connect.common.source.input; import java.io.InputStream; import java.util.Map; import java.util.stream.Stream; -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import org.apache.kafka.common.config.AbstractConfig; import org.apache.commons.io.function.IOSupplier; public interface Transformer { - void configureValueConverter(Map<String, String> config, S3SourceConfig s3SourceConfig); + void configureValueConverter(Map<String, String> config, AbstractConfig sourceConfig); Stream<Object> getRecords(IOSupplier<InputStream> inputStreamIOSupplier, String topic, int topicPartition, - S3SourceConfig s3SourceConfig); + AbstractConfig sourceConfig); - byte[] getValueBytes(Object record, String topic, S3SourceConfig s3SourceConfig); + byte[] getValueBytes(Object record, String topic, AbstractConfig sourceConfig); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java similarity index 65% rename from s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java rename to commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java index 4033e734a..f868d7328 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/input/TransformerFactory.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java @@ -14,19 +14,20 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.input; +package io.aiven.kafka.connect.common.source.input; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.INPUT_FORMAT_KEY; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.common.config.SchemaRegistryFragment; +import io.aiven.kafka.connect.common.config.SourceCommonConfig; public final class TransformerFactory { private TransformerFactory() { // hidden } - public static Transformer getTransformer(final S3SourceConfig s3SourceConfig) { - final InputFormat inputFormatEnum = s3SourceConfig.getInputFormat(); + public static Transformer getTransformer(final SourceCommonConfig sourceConfig) { + final InputFormat inputFormatEnum = new SchemaRegistryFragment(sourceConfig).getInputFormat(); switch (inputFormatEnum) { case AVRO : return new AvroTransformer(); @@ -38,7 +39,7 @@ public static Transformer getTransformer(final S3SourceConfig s3SourceConfig) { return new ByteArrayTransformer(); default : throw new IllegalArgumentException( - "Unknown output format " + s3SourceConfig.getString(INPUT_FORMAT_KEY)); + "Unknown input format in configuration: " + sourceConfig.getString(INPUT_FORMAT_KEY)); } } } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/parquet/LocalInputFile.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/parquet/LocalInputFile.java new file mode 100644 index 000000000..bb1081ab2 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/parquet/LocalInputFile.java @@ -0,0 +1,103 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.input.parquet; + +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.file.Path; + +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.SeekableInputStream; + +/** + * {@code LocalInputFile} is an implementation needed by Parquet to read from local data files using + * {@link SeekableInputStream} instances. + */ +public class LocalInputFile implements InputFile { + + private final Path path; + private long length = -1; + + public LocalInputFile(final Path file) { + path = file; + } + + @Override + public long getLength() throws IOException { + if (length == -1) { + try (RandomAccessFile file = new RandomAccessFile(path.toFile(), "r")) { + length = file.length(); + } + } + return length; + } + + @Override + public SeekableInputStream newStream() throws IOException { + + return new SeekableInputStream() { + + private final RandomAccessFile randomAccessFile = new RandomAccessFile(path.toFile(), "r"); + + @Override + public int read() throws IOException { + return randomAccessFile.read(); + } + + @Override + public long getPos() throws IOException { + return randomAccessFile.getFilePointer(); + } + + @Override + public void seek(final long newPos) throws IOException { + randomAccessFile.seek(newPos); + } + + @Override + public void readFully(final byte[] bytes) throws IOException { + randomAccessFile.readFully(bytes); + } + + @Override + public void readFully(final byte[] bytes, final int start, final int len) throws IOException { + randomAccessFile.readFully(bytes, start, len); + } + + @Override + public int read(final ByteBuffer buf) throws IOException { + final byte[] buffer = new byte[buf.remaining()]; + final int code = read(buffer); + buf.put(buffer, buf.position() + buf.arrayOffset(), buf.remaining()); + return code; + } + + @Override + public void readFully(final ByteBuffer buf) throws IOException { + final byte[] buffer = new byte[buf.remaining()]; + readFully(buffer); + buf.put(buffer, buf.position() + buf.arrayOffset(), buf.remaining()); + } + + @Override + public void close() throws IOException { + randomAccessFile.close(); + } + }; + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/AvroTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java similarity index 91% rename from s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/AvroTransformerTest.java rename to commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java index 39b689736..a0dc3d5d9 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/AvroTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.s3.source.input; +package io.aiven.kafka.connect.common.source.input; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.when; @@ -29,7 +29,7 @@ import java.util.List; import java.util.Map; -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.common.config.SourceCommonConfig; import org.apache.avro.Schema; import org.apache.avro.file.DataFileWriter; @@ -49,7 +49,7 @@ final class AvroTransformerTest { @Mock - private S3SourceConfig s3SourceConfig; + private SourceCommonConfig sourceCommonConfig; private AvroTransformer avroTransformer; private Map<String, String> config; @@ -63,8 +63,8 @@ void setUp() { @Test void testConfigureValueConverter() { final String value = "http://localhost:8081"; - when(s3SourceConfig.getString(SCHEMA_REGISTRY_URL)).thenReturn(value); - avroTransformer.configureValueConverter(config, s3SourceConfig); + when(sourceCommonConfig.getString(SCHEMA_REGISTRY_URL)).thenReturn(value); + avroTransformer.configureValueConverter(config, sourceCommonConfig); assertThat(config.get(SCHEMA_REGISTRY_URL)).isEqualTo("http://localhost:8081") .describedAs("The schema registry URL should be correctly set in the config."); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java similarity index 89% rename from s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java rename to commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java index 2486cfadd..81aaf7b79 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ByteArrayTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package io.aiven.kafka.connect.s3.source.input; +package io.aiven.kafka.connect.common.source.input; import static org.assertj.core.api.Assertions.assertThat; @@ -24,7 +24,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.common.config.SourceCommonConfig; import org.apache.commons.io.function.IOSupplier; import org.junit.jupiter.api.BeforeEach; @@ -40,7 +40,7 @@ final class ByteArrayTransformerTest { private ByteArrayTransformer byteArrayTransformer; @Mock - private S3SourceConfig s3SourceConfig; + private SourceCommonConfig sourceCommonConfig; @BeforeEach void setUp() { @@ -54,7 +54,7 @@ void testGetRecordsSingleChunk() { final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; final Stream<Object> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, - s3SourceConfig); + sourceCommonConfig); final List<Object> recs = records.collect(Collectors.toList()); assertThat(recs).hasSize(1); @@ -68,7 +68,7 @@ void testGetRecordsEmptyInputStream() { final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; final Stream<Object> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, - s3SourceConfig); + sourceCommonConfig); assertThat(records).hasSize(0); } @@ -76,7 +76,7 @@ void testGetRecordsEmptyInputStream() { @Test void testGetValueBytes() { final byte[] record = { 1, 2, 3 }; - final byte[] result = byteArrayTransformer.getValueBytes(record, TEST_TOPIC, s3SourceConfig); + final byte[] result = byteArrayTransformer.getValueBytes(record, TEST_TOPIC, sourceCommonConfig); assertThat(result).containsExactlyInAnyOrder(record); } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ContentUtils.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ContentUtils.java new file mode 100644 index 000000000..4b82f0a63 --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ContentUtils.java @@ -0,0 +1,99 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.source.input; + +import static org.apache.kafka.connect.data.Schema.INT32_SCHEMA; +import static org.apache.kafka.connect.data.Schema.STRING_SCHEMA; + +import java.io.IOException; +import java.net.ConnectException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.kafka.common.record.TimestampType; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.sink.SinkRecord; + +import io.aiven.kafka.connect.common.config.OutputField; +import io.aiven.kafka.connect.common.config.OutputFieldEncodingType; +import io.aiven.kafka.connect.common.config.OutputFieldType; +import io.aiven.kafka.connect.common.output.parquet.ParquetOutputWriter; + +public final class ContentUtils { + private ContentUtils() { + } + public static Path getTmpFilePath(final String name1) throws IOException { + final String tmpFile = "users.parquet"; + final Path parquetFileDir = Files.createTempDirectory("parquet_tests"); + final String parquetFilePath = parquetFileDir.toAbsolutePath() + "/" + tmpFile; + + writeParquetFile(parquetFilePath, name1); + return Paths.get(parquetFilePath); + } + + public static void writeParquetFile(final String tempFilePath, final String name1) throws IOException { + // Define the Avro schema + final Schema schema = SchemaBuilder.struct() + .field("name", STRING_SCHEMA) + .field("age", INT32_SCHEMA) + .field("email", STRING_SCHEMA) + .build(); + // Write the Parquet file + try { + writeParquetFile(tempFilePath, schema, name1, 100); + } catch (IOException e) { + throw new ConnectException("Error writing parquet file"); + } + } + + @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") + private static void writeParquetFile(final String outputPath, final Schema schema, final String name1, + final int numOfRecords) throws IOException { + + final List<Struct> allParquetRecords = new ArrayList<>(); + // Write records to the Parquet file + for (int i = 0; i < numOfRecords; i++) { + allParquetRecords + .add(new Struct(schema).put("name", name1 + i).put("age", 30).put("email", name1 + "@test")); + } + + // Create a Parquet writer + final Path outputFilePath = Paths.get(outputPath); + try (var outputStream = Files.newOutputStream(outputFilePath.toAbsolutePath()); + var parquetWriter = new ParquetOutputWriter( + List.of(new OutputField(OutputFieldType.VALUE, OutputFieldEncodingType.NONE)), outputStream, + Collections.emptyMap(), false)) { + int counter = 0; + final var sinkRecords = new ArrayList<SinkRecord>(); + for (final var r : allParquetRecords) { + final var sinkRecord = new SinkRecord( // NOPMD AvoidInstantiatingObjectsInLoops + "some-topic", 1, STRING_SCHEMA, "some-key-" + counter, schema, r, 100L, 1000L + counter, + TimestampType.CREATE_TIME, null); + sinkRecords.add(sinkRecord); + counter++; + } + parquetWriter.writeRecords(sinkRecords); + } + + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java similarity index 90% rename from s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java rename to commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java index 
0abf61c29..cdec0ace4 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/JsonTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java @@ -14,9 +14,9 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.input; +package io.aiven.kafka.connect.common.source.input; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMAS_ENABLE; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMAS_ENABLE; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -29,7 +29,7 @@ import java.util.Map; import java.util.stream.Stream; -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.common.config.SourceCommonConfig; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; @@ -46,7 +46,7 @@ final class JsonTransformerTest { public static final String TESTTOPIC = "testtopic"; JsonTransformer jsonTransformer; - S3SourceConfig s3SourceConfig; + SourceCommonConfig sourceCommonConfig; @Mock private IOSupplier<InputStream> inputStreamIOSupplierMock; @@ -54,14 +54,14 @@ final class JsonTransformerTest { @BeforeEach void setUp() { jsonTransformer = new JsonTransformer(); - s3SourceConfig = mock(S3SourceConfig.class); + sourceCommonConfig = mock(SourceCommonConfig.class); } @Test void testConfigureValueConverter() { final Map<String, String> config = new HashMap<>(); - jsonTransformer.configureValueConverter(config, s3SourceConfig); + jsonTransformer.configureValueConverter(config, sourceCommonConfig); assertThat(config).as("%s should be set to false", SCHEMAS_ENABLE) .containsEntry(SCHEMAS_ENABLE, Boolean.FALSE.toString()); } @@ -72,7 +72,7 @@ void testHandleValueDataWithValidJson() { "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, - s3SourceConfig); + sourceCommonConfig); assertThat(jsonNodes).hasSize(1); } @@ -84,7 +84,7 @@ void testHandleValueDataWithInvalidJson() { final IOSupplier<InputStream> inputStreamIOSupplier = () -> invalidJsonInputStream; final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, - s3SourceConfig); + sourceCommonConfig); assertThat(jsonNodes).isEmpty(); } @@ -95,9 +95,9 @@ void testSerializeJsonDataValid() throws IOException { "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, - s3SourceConfig); + sourceCommonConfig); final byte[] serializedData = jsonTransformer.getValueBytes(jsonNodes.findFirst().get(), TESTTOPIC, - s3SourceConfig); + sourceCommonConfig); final ObjectMapper objectMapper = new ObjectMapper(); final JsonNode expectedData = objectMapper.readTree(serializedData); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java similarity index 92% rename from s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java rename to 
commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java index 08f462595..e247adbc0 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/input/ParquetTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.s3.source.input; +package io.aiven.kafka.connect.common.source.input; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.any; @@ -33,11 +33,10 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; +import io.aiven.kafka.connect.common.config.SourceCommonConfig; -import com.amazonaws.util.IOUtils; import org.apache.avro.generic.GenericRecord; +import org.apache.commons.io.IOUtils; import org.apache.commons.io.function.IOSupplier; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -59,7 +58,7 @@ void testHandleValueDataWithZeroBytes() { final byte[] mockParquetData = new byte[0]; final InputStream inputStream = new ByteArrayInputStream(mockParquetData); final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; - final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); + final SourceCommonConfig s3SourceConfig = mock(SourceCommonConfig.class); final String topic = "test-topic"; final int topicPartition = 0; @@ -74,7 +73,7 @@ void testGetRecordsWithValidData() throws Exception { final byte[] mockParquetData = generateMockParquetData(); final InputStream inputStream = new ByteArrayInputStream(mockParquetData); final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; - final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); + final SourceCommonConfig s3SourceConfig = mock(SourceCommonConfig.class); final String topic = "test-topic"; final int topicPartition = 0; @@ -95,7 +94,7 @@ void testGetRecordsWithInvalidData() { final InputStream inputStream = new ByteArrayInputStream(invalidData); final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; - final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); + final SourceCommonConfig s3SourceConfig = mock(SourceCommonConfig.class); final String topic = "test-topic"; final int topicPartition = 0; diff --git a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java b/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java index 4371f4658..8b38c8f4c 100644 --- a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java +++ b/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java @@ -110,6 +110,8 @@ public final class S3ConfigFragment extends ConfigFragment { public static final String AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_CONFIG = "aws.s3.backoff.max.delay.ms"; public static final String AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG = "aws.s3.backoff.max.retries"; + public static final String FETCH_PAGE_SIZE = "aws.s3.fetch.page.size"; + private static final String GROUP_AWS = "AWS"; private static final String GROUP_AWS_STS = "AWS STS"; @@ -211,9 +213,13 @@ static void addAwsConfigGroup(final ConfigDef configDef) { awsGroupCounter++, ConfigDef.Width.NONE, AWS_S3_ENDPOINT_CONFIG); configDef.define(AWS_S3_REGION_CONFIG, ConfigDef.Type.STRING, null, new 
AwsRegionValidator(), - ConfigDef.Importance.MEDIUM, "AWS S3 Region, e.g. us-east-1", GROUP_AWS, awsGroupCounter++, // NOPMD - // UnusedAssignment + ConfigDef.Importance.MEDIUM, "AWS S3 Region, e.g. us-east-1", GROUP_AWS, awsGroupCounter++, ConfigDef.Width.NONE, AWS_S3_REGION_CONFIG); + + configDef.define(FETCH_PAGE_SIZE, ConfigDef.Type.INT, 10, ConfigDef.Range.atLeast(1), + ConfigDef.Importance.MEDIUM, "AWS S3 Fetch page size", GROUP_AWS, awsGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, FETCH_PAGE_SIZE); } static void addAwsStsConfigGroup(final ConfigDef configDef) { diff --git a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3SourceBaseConfig.java b/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3SourceBaseConfig.java deleted file mode 100644 index f1db8eddc..000000000 --- a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3SourceBaseConfig.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.config.s3; - -import static io.aiven.kafka.connect.config.s3.S3CommonConfig.handleDeprecatedYyyyUppercase; - -import java.util.Map; - -import org.apache.kafka.common.config.ConfigDef; - -import io.aiven.kafka.connect.common.config.SourceCommonConfig; -import io.aiven.kafka.connect.iam.AwsStsEndpointConfig; -import io.aiven.kafka.connect.iam.AwsStsRole; - -import com.amazonaws.auth.AWSCredentialsProvider; -import com.amazonaws.auth.BasicAWSCredentials; -import com.amazonaws.client.builder.AwsClientBuilder; -import com.amazonaws.regions.Region; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -@SuppressWarnings({ "PMD.ExcessiveImports", "PMD.TooManyStaticImports" }) -public class S3SourceBaseConfig extends SourceCommonConfig { - public static final Logger LOGGER = LoggerFactory.getLogger(S3SourceBaseConfig.class); - private final S3ConfigFragment s3ConfigFragment; - protected S3SourceBaseConfig(ConfigDef definition, Map<String, String> originals) { // NOPMD UnusedAssignment - super(definition, handleDeprecatedYyyyUppercase(originals)); - s3ConfigFragment = new S3ConfigFragment(this); - validate(); - } - - private void validate() { - s3ConfigFragment.validate(); - } - - public AwsStsRole getStsRole() { - return s3ConfigFragment.getStsRole(); - } - - public boolean hasAwsStsRole() { - return s3ConfigFragment.hasAwsStsRole(); - } - - public boolean hasStsEndpointConfig() { - return s3ConfigFragment.hasStsEndpointConfig(); - } - - public AwsStsEndpointConfig getStsEndpointConfig() { - return s3ConfigFragment.getStsEndpointConfig(); - } - - public AwsClientBuilder.EndpointConfiguration getAwsEndpointConfiguration() { - return s3ConfigFragment.getAwsEndpointConfiguration(); - } - - public BasicAWSCredentials getAwsCredentials() { - return s3ConfigFragment.getAwsCredentials(); - } - - public String getAwsS3EndPoint() { - return s3ConfigFragment.getAwsS3EndPoint(); - } - - public Region getAwsS3Region() { - return 
s3ConfigFragment.getAwsS3Region(); - } - - public String getAwsS3BucketName() { - return s3ConfigFragment.getAwsS3BucketName(); - } - - public String getServerSideEncryptionAlgorithmName() { - return s3ConfigFragment.getServerSideEncryptionAlgorithmName(); - } - - public String getAwsS3Prefix() { - return s3ConfigFragment.getAwsS3Prefix(); - } - - public int getAwsS3PartSize() { - return s3ConfigFragment.getAwsS3PartSize(); - } - - public long getS3RetryBackoffDelayMs() { - return s3ConfigFragment.getS3RetryBackoffDelayMs(); - } - - public long getS3RetryBackoffMaxDelayMs() { - return s3ConfigFragment.getS3RetryBackoffMaxDelayMs(); - } - - public int getS3RetryBackoffMaxRetries() { - return s3ConfigFragment.getS3RetryBackoffMaxRetries(); - } - - public AWSCredentialsProvider getCustomCredentialsProvider() { - return s3ConfigFragment.getCustomCredentialsProvider(); - } - -} diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts index 943dbc75c..3530724e0 100644 --- a/s3-source-connector/build.gradle.kts +++ b/s3-source-connector/build.gradle.kts @@ -21,7 +21,6 @@ plugins { id("aiven-apache-kafka-connectors-all.java-conventions") } val amazonS3Version by extra("1.12.729") val amazonSTSVersion by extra("1.12.729") val s3mockVersion by extra("0.2.6") -val parquetVersion by extra("1.14.3") val testKafkaVersion by extra("3.7.1") val integrationTest: SourceSet = @@ -67,31 +66,17 @@ dependencies { compileOnly(apache.kafka.connect.runtime) implementation(project(":commons")) + implementation(project(":s3-commons")) implementation("com.amazonaws:aws-java-sdk-s3:$amazonS3Version") implementation("com.amazonaws:aws-java-sdk-sts:$amazonSTSVersion") - implementation("org.apache.parquet:parquet-hadoop:$parquetVersion") - testImplementation("org.apache.parquet:parquet-hadoop:$parquetVersion") - integrationTestImplementation("org.apache.parquet:parquet-hadoop:$parquetVersion") - - implementation("org.apache.parquet:parquet-avro:$parquetVersion") { - exclude(group = "org.xerial.snappy", module = "snappy-java") - exclude(group = "org.slf4j", module = "slf4j-api") - exclude(group = "org.apache.avro", module = "avro") - } - testImplementation("org.apache.parquet:parquet-avro:$parquetVersion") { - exclude(group = "org.xerial.snappy", module = "snappy-java") - exclude(group = "org.slf4j", module = "slf4j-api") - exclude(group = "org.apache.avro", module = "avro") - } - implementation(tools.spotbugs.annotations) implementation(logginglibs.slf4j) implementation(apache.avro) implementation(confluent.kafka.connect.avro.converter) { exclude(group = "org.apache.kafka", module = "kafka-clients") } - + integrationTestImplementation(apache.parquet.hadoop) testImplementation(compressionlibs.snappy) testImplementation(compressionlibs.zstd.jni) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 05514430d..3cd72f290 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -16,18 +16,18 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.AVRO_VALUE_SERIALIZER; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; +import static 
io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.VALUE_CONVERTER_SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_PREFIX_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG; import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_ACCESS_KEY_ID_CONFIG; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_ENDPOINT_CONFIG; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_PREFIX_CONFIG; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.INPUT_FORMAT_KEY; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_CONVERTER_SCHEMA_REGISTRY_URL; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.VALUE_SERIALIZER; import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; import static java.util.Map.entry; import static org.assertj.core.api.Assertions.assertThat; @@ -58,7 +58,7 @@ import org.apache.kafka.clients.consumer.KafkaConsumer; import org.apache.kafka.common.serialization.ByteArrayDeserializer; -import io.aiven.kafka.connect.s3.source.input.InputFormat; +import io.aiven.kafka.connect.common.source.input.InputFormat; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; @@ -283,7 +283,7 @@ private Map<String, String> getAvroConfig(final String topicName, final InputFor connectorConfig.put(SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); connectorConfig.put(VALUE_CONVERTER_KEY, "io.confluent.connect.avro.AvroConverter"); connectorConfig.put(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); - connectorConfig.put(VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); + connectorConfig.put(AVRO_VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); return connectorConfig; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 087fd0451..86a870bcd 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -16,8 +16,8 @@ package io.aiven.kafka.connect.s3.source; 
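The IntegrationTest and unit-test hunks above now reference the shared fragment constants instead of the old connector-local S3SourceConfig ones. A minimal sketch of a source configuration built from those keys, using only constants that appear elsewhere in this patch; the class and method names below are illustrative and not part of the change:

    import java.util.HashMap;
    import java.util.Map;

    import io.aiven.kafka.connect.common.config.SchemaRegistryFragment;
    import io.aiven.kafka.connect.common.config.SourceConfigFragment;
    import io.aiven.kafka.connect.common.source.input.InputFormat;
    import io.aiven.kafka.connect.config.s3.S3ConfigFragment;

    final class FragmentConfigSketch {
        private FragmentConfigSketch() {
        }

        // Hypothetical helper, not part of this patch: builds a source config map from the shared constants.
        static Map<String, String> sourceProperties(final String bucket, final String topics) {
            final Map<String, String> props = new HashMap<>();
            props.put(S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG, bucket);
            props.put(SchemaRegistryFragment.INPUT_FORMAT_KEY, InputFormat.BYTES.getValue());
            props.put(SourceConfigFragment.TARGET_TOPICS, topics);
            props.put(SourceConfigFragment.TARGET_TOPIC_PARTITIONS, "0,1");
            return props;
        }
    }
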
-import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.MAX_POLL_RECORDS; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG; import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; @@ -35,10 +35,10 @@ import org.apache.kafka.connect.source.SourceTask; import org.apache.kafka.connect.storage.Converter; +import io.aiven.kafka.connect.common.source.input.Transformer; +import io.aiven.kafka.connect.common.source.input.TransformerFactory; import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.input.Transformer; -import io.aiven.kafka.connect.s3.source.input.TransformerFactory; import io.aiven.kafka.connect.s3.source.utils.FileReader; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import io.aiven.kafka.connect.s3.source.utils.RecordProcessor; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsAccessSecret.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsAccessSecret.java deleted file mode 100644 index 503998fc8..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsAccessSecret.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.s3.source.config; - -import java.util.Objects; - -import org.apache.kafka.common.config.types.Password; - -final class AwsAccessSecret { - private final Password accessKeyId; - private final Password secretAccessKey; - - public AwsAccessSecret(final Password accessKeyId, final Password secretAccessKey) { - this.accessKeyId = accessKeyId; - this.secretAccessKey = secretAccessKey; - } - - public Password getAccessKeyId() { - return accessKeyId; - } - - public Password getSecretAccessKey() { - return secretAccessKey; - } - - public Boolean isValid() { - return Objects.nonNull(accessKeyId) && Objects.nonNull(secretAccessKey); - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsCredentialProviderFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsCredentialProviderFactory.java deleted file mode 100644 index d0fa8f55b..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsCredentialProviderFactory.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2021 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
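The S3SourceTask hunk above now takes MAX_POLL_RECORDS from SourceConfigFragment, and the RecordProcessor and SourceRecordIterator hunks further down read it through the typed getMaxPollRecords() accessor instead of getInt(MAX_POLL_RECORDS). A rough sketch of that bounded per-poll loop; the generic helper below is illustrative only and is not code from this patch:

    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.List;
    import java.util.concurrent.atomic.AtomicBoolean;

    final class PollLoopSketch {
        private PollLoopSketch() {
        }

        // Illustrative only: maxPollRecords would come from s3SourceConfig.getMaxPollRecords().
        static <T> List<T> pollBatch(final Iterator<T> records, final int maxPollRecords,
                final AtomicBoolean connectorStopped) {
            final List<T> batch = new ArrayList<>();
            while (records.hasNext() && batch.size() < maxPollRecords && !connectorStopped.get()) {
                // In the real task each element is transformed and converted into a SourceRecord here.
                batch.add(records.next());
            }
            return batch;
        }
    }
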
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.s3.source.config; - -import com.amazonaws.auth.AWSCredentialsProvider; -import com.amazonaws.auth.AWSStaticCredentialsProvider; -import com.amazonaws.auth.BasicAWSCredentials; -import com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider; -import com.amazonaws.client.builder.AwsClientBuilder; -import com.amazonaws.services.securitytoken.AWSSecurityTokenService; -import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClientBuilder; - -public class AwsCredentialProviderFactory { - public AWSCredentialsProvider getProvider(final S3SourceConfig config) { - if (config.hasAwsStsRole()) { - return getStsProvider(config); - } - final AwsAccessSecret awsCredentials = config.getAwsCredentials(); - if (!awsCredentials.isValid()) { - return config.getCustomCredentialsProvider(); - } - return new AWSStaticCredentialsProvider(new BasicAWSCredentials(awsCredentials.getAccessKeyId().value(), - awsCredentials.getSecretAccessKey().value())); - } - - private AWSCredentialsProvider getStsProvider(final S3SourceConfig config) { - final AwsStsRole awsstsRole = config.getStsRole(); - final AWSSecurityTokenService sts = securityTokenService(config); - return new STSAssumeRoleSessionCredentialsProvider.Builder(awsstsRole.getArn(), awsstsRole.getSessionName()) - .withStsClient(sts) - .withExternalId(awsstsRole.getExternalId()) - .withRoleSessionDurationSeconds(awsstsRole.getSessionDurationSeconds()) - .build(); - } - - private AWSSecurityTokenService securityTokenService(final S3SourceConfig config) { - if (config.hasStsEndpointConfig()) { - final AwsStsEndpointConfig endpointConfig = config.getStsEndpointConfig(); - final AwsClientBuilder.EndpointConfiguration stsConfig = new AwsClientBuilder.EndpointConfiguration( - endpointConfig.getServiceEndpoint(), endpointConfig.getSigningRegion()); - final AWSSecurityTokenServiceClientBuilder stsBuilder = AWSSecurityTokenServiceClientBuilder.standard(); - stsBuilder.setEndpointConfiguration(stsConfig); - return stsBuilder.build(); - } - return AWSSecurityTokenServiceClientBuilder.defaultClient(); - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsEndpointConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsEndpointConfig.java deleted file mode 100644 index 219db5114..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsEndpointConfig.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 2021 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
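The connector-local credential factory deleted above is consolidated into io.aiven.kafka.connect.iam.AwsCredentialProviderFactory, which the S3ClientFactory hunk below now calls with config.getS3ConfigFragment(). A condensed sketch of the selection order the deleted code implemented, assuming the shared factory keeps the same shape; the class name and the null check on the static keys below are assumptions:

    import io.aiven.kafka.connect.config.s3.S3ConfigFragment;

    import com.amazonaws.auth.AWSCredentialsProvider;
    import com.amazonaws.auth.AWSStaticCredentialsProvider;
    import com.amazonaws.auth.BasicAWSCredentials;

    final class CredentialSelectionSketch {
        private CredentialSelectionSketch() {
        }

        // Selection order taken from the deleted factory above: STS role first, then static keys,
        // then the configured provider chain.
        static AWSCredentialsProvider resolve(final S3ConfigFragment fragment) {
            if (fragment.hasAwsStsRole()) {
                return stsProvider(fragment);
            }
            final BasicAWSCredentials staticKeys = fragment.getAwsCredentials();
            if (staticKeys == null) {
                return fragment.getCustomCredentialsProvider();
            }
            return new AWSStaticCredentialsProvider(staticKeys);
        }

        private static AWSCredentialsProvider stsProvider(final S3ConfigFragment fragment) {
            // The deleted getStsProvider(...) above builds an STSAssumeRoleSessionCredentialsProvider;
            // omitted here to keep the sketch short.
            throw new UnsupportedOperationException("STS path omitted in this sketch");
        }
    }
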
- */ - -package io.aiven.kafka.connect.s3.source.config; - -import java.util.Objects; - -final class AwsStsEndpointConfig { - public static final String AWS_STS_GLOBAL_ENDPOINT = "https://sts.amazonaws.com"; - - private final String serviceEndpoint; - private final String signingRegion; - - public AwsStsEndpointConfig(final String serviceEndpoint, final String signingRegion) { - this.serviceEndpoint = serviceEndpoint; - this.signingRegion = signingRegion; - } - - public String getServiceEndpoint() { - return serviceEndpoint; - } - - public String getSigningRegion() { - return signingRegion; - } - - public Boolean isValid() { - return Objects.nonNull(signingRegion); - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsRole.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsRole.java deleted file mode 100644 index aa4adb6da..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/AwsStsRole.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright 2021 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.s3.source.config; - -import java.util.Objects; - -import com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider; - -final class AwsStsRole { - - // AssumeRole request limit details here: - // https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html - public static final int MIN_SESSION_DURATION = STSAssumeRoleSessionCredentialsProvider.DEFAULT_DURATION_SECONDS; - public static final int MAX_SESSION_DURATION = 43_200; - - private final String arn; - private final String externalId; - private final String sessionName; - private final int sessionDurationSeconds; - - public AwsStsRole(final String arn, final String externalId, final String sessionName, - final int sessionDurationSeconds) { - this.arn = arn; - this.externalId = externalId; - this.sessionName = sessionName; - this.sessionDurationSeconds = sessionDurationSeconds; - } - - public String getArn() { - return arn; - } - - public String getExternalId() { - return externalId; - } - - public String getSessionName() { - return sessionName; - } - - public int getSessionDurationSeconds() { - return sessionDurationSeconds; - } - - public Boolean isValid() { - return Objects.nonNull(arn) && Objects.nonNull(sessionName); - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java index a9edbbc61..346ec5825 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java @@ -18,6 +18,8 @@ import java.util.Objects; +import io.aiven.kafka.connect.iam.AwsCredentialProviderFactory; + import com.amazonaws.PredefinedClientConfigurations; import 
com.amazonaws.client.builder.AwsClientBuilder; import com.amazonaws.retry.PredefinedBackoffStrategies; @@ -39,7 +41,7 @@ public AmazonS3 createAmazonS3Client(final S3SourceConfig config) { Math.toIntExact(config.getS3RetryBackoffMaxDelayMs())), config.getS3RetryBackoffMaxRetries(), false)); final var s3ClientBuilder = AmazonS3ClientBuilder.standard() - .withCredentials(credentialFactory.getProvider(config)) + .withCredentials(credentialFactory.getProvider(config.getS3ConfigFragment())) .withClientConfiguration(clientConfig); if (Objects.isNull(awsEndpointConfig)) { s3ClientBuilder.withRegion(config.getAwsS3Region().getName()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 77241348e..68b9b2f98 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -16,370 +16,127 @@ package io.aiven.kafka.connect.s3.source.config; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Locale; +import static io.aiven.kafka.connect.config.s3.S3CommonConfig.handleDeprecatedYyyyUppercase; + import java.util.Map; -import java.util.Objects; -import java.util.regex.Pattern; -import java.util.stream.Collectors; -import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; -import org.apache.kafka.common.config.ConfigException; -import io.aiven.kafka.connect.common.config.validators.NonEmptyPassword; -import io.aiven.kafka.connect.common.config.validators.UrlValidator; -import io.aiven.kafka.connect.s3.source.input.InputFormat; +import io.aiven.kafka.connect.common.config.FileNameFragment; +import io.aiven.kafka.connect.common.config.OutputFieldType; +import io.aiven.kafka.connect.common.config.OutputFormatFragment; +import io.aiven.kafka.connect.common.config.SchemaRegistryFragment; +import io.aiven.kafka.connect.common.config.SourceCommonConfig; +import io.aiven.kafka.connect.common.config.SourceConfigFragment; +import io.aiven.kafka.connect.config.s3.S3ConfigFragment; +import io.aiven.kafka.connect.iam.AwsStsEndpointConfig; +import io.aiven.kafka.connect.iam.AwsStsRole; import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.BasicAWSCredentials; +import com.amazonaws.client.builder.AwsClientBuilder; import com.amazonaws.regions.Region; -import com.amazonaws.regions.RegionUtils; -import com.amazonaws.regions.Regions; -import com.amazonaws.services.s3.internal.BucketNameUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -@SuppressWarnings({ "PMD.TooManyMethods", "PMD.GodClass", "PMD.ExcessiveImports" }) -final public class S3SourceConfig extends AbstractConfig { +final public class S3SourceConfig extends SourceCommonConfig { public static final Logger LOGGER = LoggerFactory.getLogger(S3SourceConfig.class); - public static final String AWS_S3_PREFIX_CONFIG = "aws.s3.prefix"; - public static final String AWS_S3_RETRY_BACKOFF_DELAY_MS_CONFIG = "aws.s3.backoff.delay.ms"; - public static final String AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_CONFIG = "aws.s3.backoff.max.delay.ms"; - public static final String AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG = "aws.s3.backoff.max.retries"; - public static final String AWS_S3_REGION_CONFIG = "aws.s3.region"; - public static final String AWS_S3_ENDPOINT_CONFIG = 
"aws.s3.endpoint"; - public static final String AWS_STS_ROLE_ARN = "aws.sts.role.arn"; - public static final String AWS_STS_ROLE_EXTERNAL_ID = "aws.sts.role.external.id"; - public static final String AWS_STS_ROLE_SESSION_NAME = "aws.sts.role.session.name"; - public static final String AWS_STS_ROLE_SESSION_DURATION = "aws.sts.role.session.duration"; - public static final String AWS_STS_CONFIG_ENDPOINT = "aws.sts.config.endpoint"; - private static final String GROUP_AWS = "AWS"; - private static final String GROUP_AWS_STS = "AWS_STS"; - private static final String GROUP_OTHER = "OTHER_CFG"; - private static final String GROUP_OFFSET_TOPIC = "OFFSET_TOPIC"; - private static final String GROUP_S3_RETRY_BACKOFF_POLICY = "S3 retry backoff policy"; - - // Default values from AWS SDK, since they are hidden - public static final int AWS_S3_RETRY_BACKOFF_DELAY_MS_DEFAULT = 100; - public static final int AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_DEFAULT = 20_000; - public static final String SCHEMA_REGISTRY_URL = "schema.registry.url"; - public static final String VALUE_CONVERTER_SCHEMA_REGISTRY_URL = "value.converter.schema.registry.url"; - public static final String VALUE_SERIALIZER = "value.serializer"; - public static final String AWS_ACCESS_KEY_ID_CONFIG = "aws.access.key.id"; - public static final String AWS_SECRET_ACCESS_KEY_CONFIG = "aws.secret.access.key"; - public static final String AWS_CREDENTIALS_PROVIDER_CONFIG = "aws.credentials.provider"; - public static final String AWS_CREDENTIAL_PROVIDER_DEFAULT = "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"; - public static final String AWS_S3_BUCKET_NAME_CONFIG = "aws.s3.bucket.name"; - public static final String AWS_S3_SSE_ALGORITHM_CONFIG = "aws.s3.sse.algorithm"; - public static final String TARGET_TOPIC_PARTITIONS = "topic.partitions"; - public static final String TARGET_TOPICS = "topics"; - public static final String FETCH_PAGE_SIZE = "aws.s3.fetch.page.size"; - public static final String MAX_POLL_RECORDS = "max.poll.records"; - public static final String EXPECTED_MAX_MESSAGE_BYTES = "expected.max.message.bytes"; - public static final int S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT = 3; - public static final String INPUT_FORMAT_KEY = "input.format"; - public static final String SCHEMAS_ENABLE = "schemas.enable"; - + private final S3ConfigFragment s3ConfigFragment; public S3SourceConfig(final Map<String, String> properties) { - super(configDef(), preprocessProperties(properties)); + super(configDef(), handleDeprecatedYyyyUppercase(properties)); + s3ConfigFragment = new S3ConfigFragment(this); validate(); // NOPMD ConstructorCallsOverridableMethod getStsRole is called } - static Map<String, String> preprocessProperties(final Map<String, String> properties) { - // Add other preprocessings when needed here. Mind the order. 
- return handleDeprecatedYyyyUppercase(properties); - } - - private static Map<String, String> handleDeprecatedYyyyUppercase(final Map<String, String> properties) { - if (!properties.containsKey(AWS_S3_PREFIX_CONFIG)) { - return properties; - } - - final var result = new HashMap<>(properties); - for (final var prop : List.of(AWS_S3_PREFIX_CONFIG)) { - if (properties.containsKey(prop)) { - String template = properties.get(prop); - final String originalTemplate = template; - - final var unitYyyyPattern = Pattern.compile("\\{\\{\\s*timestamp\\s*:\\s*unit\\s*=\\s*YYYY\\s*}}"); - template = unitYyyyPattern.matcher(template) - .replaceAll(matchResult -> matchResult.group().replace("YYYY", "yyyy")); - - if (!template.equals(originalTemplate)) { - LOGGER.warn("{{timestamp:unit=YYYY}} is no longer supported, " - + "please use {{timestamp:unit=yyyy}} instead. " + "It was automatically replaced: {}", - template); - } - - result.put(prop, template); - } - } - return result; - } - public static ConfigDef configDef() { - final var configDef = new S3SourceConfigDef(); - addSchemaRegistryGroup(configDef); - addOffsetStorageConfig(configDef); - addAwsStsConfigGroup(configDef); - addAwsConfigGroup(configDef); - addDeprecatedConfiguration(configDef); - addS3RetryPolicies(configDef); - addOtherConfig(configDef); - return configDef; - } - - private static void addSchemaRegistryGroup(final ConfigDef configDef) { - int srCounter = 0; - configDef.define(SCHEMA_REGISTRY_URL, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", GROUP_OTHER, srCounter++, ConfigDef.Width.NONE, - SCHEMA_REGISTRY_URL); - configDef.define(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, ConfigDef.Type.STRING, null, - new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", GROUP_OTHER, - srCounter++, ConfigDef.Width.NONE, VALUE_CONVERTER_SCHEMA_REGISTRY_URL); - configDef.define(INPUT_FORMAT_KEY, ConfigDef.Type.STRING, InputFormat.BYTES.getValue(), - new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "Output format avro/json/parquet/bytes", - GROUP_OTHER, srCounter++, // NOPMD - ConfigDef.Width.NONE, INPUT_FORMAT_KEY); - - configDef.define(VALUE_SERIALIZER, ConfigDef.Type.CLASS, null, ConfigDef.Importance.MEDIUM, "Value serializer", - GROUP_OTHER, srCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, VALUE_SERIALIZER); - } - private static void addOtherConfig(final S3SourceConfigDef configDef) { - int awsOtherGroupCounter = 0; - configDef.define(FETCH_PAGE_SIZE, ConfigDef.Type.INT, 10, ConfigDef.Range.atLeast(1), - ConfigDef.Importance.MEDIUM, "Fetch page size", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, FETCH_PAGE_SIZE); - configDef.define(MAX_POLL_RECORDS, ConfigDef.Type.INT, 500, ConfigDef.Range.atLeast(1), - ConfigDef.Importance.MEDIUM, "Max poll records", GROUP_OTHER, awsOtherGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, MAX_POLL_RECORDS); - configDef.define(EXPECTED_MAX_MESSAGE_BYTES, ConfigDef.Type.INT, 1_048_588, ConfigDef.Importance.MEDIUM, - "The largest record batch size allowed by Kafka config max.message.bytes", GROUP_OTHER, - awsOtherGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, EXPECTED_MAX_MESSAGE_BYTES); - } + final var configDef = new S3SourceConfigDef(); + S3ConfigFragment.update(configDef); + SourceConfigFragment.update(configDef); + FileNameFragment.update(configDef); + SchemaRegistryFragment.update(configDef); + 
OutputFormatFragment.update(configDef, OutputFieldType.VALUE); - private static void addAwsStsConfigGroup(final ConfigDef configDef) { - int awsStsGroupCounter = 0; - configDef.define(AWS_STS_ROLE_ARN, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "AWS STS Role", GROUP_AWS_STS, awsStsGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, AWS_STS_ROLE_ARN); - - configDef.define(AWS_STS_ROLE_SESSION_NAME, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "AWS STS Session name", GROUP_AWS_STS, awsStsGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, AWS_STS_ROLE_SESSION_NAME); - - configDef.define(AWS_STS_ROLE_SESSION_DURATION, ConfigDef.Type.INT, 3600, - ConfigDef.Range.between(AwsStsRole.MIN_SESSION_DURATION, AwsStsRole.MAX_SESSION_DURATION), - ConfigDef.Importance.MEDIUM, "AWS STS Session duration", GROUP_AWS_STS, awsStsGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, AWS_STS_ROLE_SESSION_DURATION); - - configDef.define(AWS_STS_ROLE_EXTERNAL_ID, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "AWS STS External Id", GROUP_AWS_STS, awsStsGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, AWS_STS_ROLE_EXTERNAL_ID); - - configDef.define(AWS_STS_CONFIG_ENDPOINT, ConfigDef.Type.STRING, AwsStsEndpointConfig.AWS_STS_GLOBAL_ENDPOINT, - new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "AWS STS Config Endpoint", GROUP_AWS_STS, - awsStsGroupCounter++, // NOPMD UnusedAssignment - ConfigDef.Width.NONE, AWS_STS_CONFIG_ENDPOINT); + return configDef; } - private static void addS3RetryPolicies(final ConfigDef configDef) { - var retryPolicyGroupCounter = 0; - configDef.define(AWS_S3_RETRY_BACKOFF_DELAY_MS_CONFIG, ConfigDef.Type.LONG, - AWS_S3_RETRY_BACKOFF_DELAY_MS_DEFAULT, ConfigDef.Range.atLeast(1L), ConfigDef.Importance.MEDIUM, - "S3 default base sleep time for non-throttled exceptions in milliseconds. " + "Default is " - + AWS_S3_RETRY_BACKOFF_DELAY_MS_DEFAULT + ".", - GROUP_S3_RETRY_BACKOFF_POLICY, retryPolicyGroupCounter++, // NOPMD UnusedAssignment - ConfigDef.Width.NONE, AWS_S3_RETRY_BACKOFF_DELAY_MS_CONFIG); - configDef.define(AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_CONFIG, ConfigDef.Type.LONG, - AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_DEFAULT, ConfigDef.Range.atLeast(1L), ConfigDef.Importance.MEDIUM, - "S3 maximum back-off time before retrying a request in milliseconds. " + "Default is " - + AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_DEFAULT + ".", - GROUP_S3_RETRY_BACKOFF_POLICY, retryPolicyGroupCounter++, // NOPMD UnusedAssignment - ConfigDef.Width.NONE, AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_CONFIG); - configDef.define(AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG, ConfigDef.Type.INT, - S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT, ConfigDef.Range.between(1L, 30), ConfigDef.Importance.MEDIUM, - "Maximum retry limit " + "(if the value is greater than 30, " - + "there can be integer overflow issues during delay calculation). 
" + "Default is " - + S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT + ".", - GROUP_S3_RETRY_BACKOFF_POLICY, retryPolicyGroupCounter++, // NOPMD UnusedAssignment - ConfigDef.Width.NONE, AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG); - } + private void validate() { - private static void addOffsetStorageConfig(final ConfigDef configDef) { - configDef.define(TARGET_TOPIC_PARTITIONS, ConfigDef.Type.STRING, "0", new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "eg : 0,1", GROUP_OFFSET_TOPIC, 0, ConfigDef.Width.NONE, - TARGET_TOPIC_PARTITIONS); - configDef.define(TARGET_TOPICS, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, "eg : connect-storage-offsets", GROUP_OFFSET_TOPIC, 0, - ConfigDef.Width.NONE, TARGET_TOPICS); + // s3ConfigFragment is validated in this method as it is created here. + // Other Fragments created in the ConfigDef are validated in the parent classes their instances are created in. + // e.g. SourceConfigFragment, FileNameFragment, SchemaRegistryFragment and OutputFormatFragment are all + // validated in SourceCommonConfig. + s3ConfigFragment.validate(); } - private static void addDeprecatedConfiguration(final ConfigDef configDef) { - configDef.define(AWS_S3_PREFIX_CONFIG, ConfigDef.Type.STRING, "prefix", new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, - "[Deprecated] Use `file.name.template` instead. Prefix for stored objects, e.g. cluster-1/", GROUP_AWS, - 0, ConfigDef.Width.NONE, AWS_S3_PREFIX_CONFIG); + public AwsStsRole getStsRole() { + return s3ConfigFragment.getStsRole(); } - private static void addAwsConfigGroup(final ConfigDef configDef) { - int awsGroupCounter = 0; - - configDef.define(AWS_ACCESS_KEY_ID_CONFIG, ConfigDef.Type.PASSWORD, null, new NonEmptyPassword(), - ConfigDef.Importance.MEDIUM, "AWS Access Key ID", GROUP_AWS, awsGroupCounter++, ConfigDef.Width.NONE, - AWS_ACCESS_KEY_ID_CONFIG); - - configDef.define(AWS_SECRET_ACCESS_KEY_CONFIG, ConfigDef.Type.PASSWORD, null, new NonEmptyPassword(), - ConfigDef.Importance.MEDIUM, "AWS Secret Access Key", GROUP_AWS, awsGroupCounter++, - ConfigDef.Width.NONE, AWS_SECRET_ACCESS_KEY_CONFIG); - - configDef.define(AWS_CREDENTIALS_PROVIDER_CONFIG, ConfigDef.Type.CLASS, AWS_CREDENTIAL_PROVIDER_DEFAULT, - ConfigDef.Importance.MEDIUM, - "When you initialize a new " + "service client without supplying any arguments, " - + "the AWS SDK for Java attempts to find temporary " - + "credentials by using the default credential " + "provider chain implemented by the " - + "DefaultAWSCredentialsProviderChain class.", - - GROUP_AWS, awsGroupCounter++, ConfigDef.Width.NONE, AWS_CREDENTIALS_PROVIDER_CONFIG); - - configDef.define(AWS_S3_BUCKET_NAME_CONFIG, ConfigDef.Type.STRING, null, new BucketNameValidator(), - ConfigDef.Importance.MEDIUM, "AWS S3 Bucket name", GROUP_AWS, awsGroupCounter++, ConfigDef.Width.NONE, - AWS_S3_BUCKET_NAME_CONFIG); - - // AWS S3 Server Side Encryption Algorithm configuration - // Example values: 'AES256' for S3-managed keys, 'aws:kms' for AWS KMS-managed keys - configDef.define(AWS_S3_SSE_ALGORITHM_CONFIG, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, - "AWS S3 Server Side Encryption Algorithm. 
Example values: 'AES256', 'aws:kms'.", GROUP_AWS, - awsGroupCounter++, ConfigDef.Width.NONE, AWS_S3_SSE_ALGORITHM_CONFIG); - - configDef.define(AWS_S3_ENDPOINT_CONFIG, ConfigDef.Type.STRING, null, new UrlValidator(), - ConfigDef.Importance.LOW, "Explicit AWS S3 Endpoint Address, mainly for testing", GROUP_AWS, - awsGroupCounter++, ConfigDef.Width.NONE, AWS_S3_ENDPOINT_CONFIG); - - configDef.define(AWS_S3_REGION_CONFIG, ConfigDef.Type.STRING, null, new AwsRegionValidator(), - ConfigDef.Importance.MEDIUM, "AWS S3 Region, e.g. us-east-1", GROUP_AWS, awsGroupCounter++, // NOPMD - // UnusedAssignment - ConfigDef.Width.NONE, AWS_S3_REGION_CONFIG); + public boolean hasAwsStsRole() { + return s3ConfigFragment.hasAwsStsRole(); } - protected static class AwsRegionValidator implements ConfigDef.Validator { - private static final String SUPPORTED_AWS_REGIONS = Arrays.stream(Regions.values()) - .map(Regions::getName) - .collect(Collectors.joining(", ")); - - @Override - public void ensureValid(final String name, final Object value) { - if (Objects.nonNull(value)) { - final String valueStr = (String) value; - final Region region = RegionUtils.getRegion(valueStr); - if (!RegionUtils.getRegions().contains(region)) { - throw new ConfigException(name, valueStr, "supported values are: " + SUPPORTED_AWS_REGIONS); - } - } - } + public boolean hasStsEndpointConfig() { + return s3ConfigFragment.hasStsEndpointConfig(); } - private static class BucketNameValidator implements ConfigDef.Validator { - @Override - public void ensureValid(final String name, final Object value) { - try { - if (value != null) { - BucketNameUtils.validateBucketName((String) value); - } - } catch (final IllegalArgumentException e) { - throw new ConfigException("Illegal bucket name: " + e.getMessage()); - } - } + public AwsStsEndpointConfig getStsEndpointConfig() { + return s3ConfigFragment.getStsEndpointConfig(); } - private void validate() { - LOGGER.debug("Validating config."); + public AwsClientBuilder.EndpointConfiguration getAwsEndpointConfiguration() { + return s3ConfigFragment.getAwsEndpointConfiguration(); } - public long getS3RetryBackoffDelayMs() { - return getLong(AWS_S3_RETRY_BACKOFF_DELAY_MS_CONFIG); + public BasicAWSCredentials getAwsCredentials() { + return s3ConfigFragment.getAwsCredentials(); } - public long getS3RetryBackoffMaxDelayMs() { - return getLong(AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_CONFIG); + public String getAwsS3EndPoint() { + return s3ConfigFragment.getAwsS3EndPoint(); } - public int getS3RetryBackoffMaxRetries() { - return getInt(AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG); + public Region getAwsS3Region() { + return s3ConfigFragment.getAwsS3Region(); } public String getAwsS3BucketName() { - return getString(AWS_S3_BUCKET_NAME_CONFIG); - } - - public InputFormat getInputFormat() { - return InputFormat.valueOf(getString(INPUT_FORMAT_KEY).toUpperCase(Locale.ROOT)); + return s3ConfigFragment.getAwsS3BucketName(); } - Region getAwsS3Region() { - // we have priority of properties if old one not set or both old and new one set - // the new property value will be selected - if (Objects.nonNull(getString(AWS_S3_REGION_CONFIG))) { - return RegionUtils.getRegion(getString(AWS_S3_REGION_CONFIG)); - } else { - return RegionUtils.getRegion(Regions.US_EAST_1.getName()); - } + public String getServerSideEncryptionAlgorithmName() { + return s3ConfigFragment.getServerSideEncryptionAlgorithmName(); } - String getAwsS3EndPoint() { - return getString(AWS_S3_ENDPOINT_CONFIG); + public String getAwsS3Prefix() { + return 
s3ConfigFragment.getAwsS3Prefix(); } - boolean hasAwsStsRole() { - return getStsRole().isValid(); + public int getAwsS3PartSize() { + return s3ConfigFragment.getAwsS3PartSize(); } - AwsStsRole getStsRole() { - return new AwsStsRole(getString(AWS_STS_ROLE_ARN), getString(AWS_STS_ROLE_EXTERNAL_ID), - getString(AWS_STS_ROLE_SESSION_NAME), getInt(AWS_STS_ROLE_SESSION_DURATION)); - } - - boolean hasStsEndpointConfig() { - return getStsEndpointConfig().isValid(); - } - - AwsStsEndpointConfig getStsEndpointConfig() { - return new AwsStsEndpointConfig(getString(AWS_STS_CONFIG_ENDPOINT), getString(AWS_S3_REGION_CONFIG)); + public long getS3RetryBackoffDelayMs() { + return s3ConfigFragment.getS3RetryBackoffDelayMs(); } - AwsAccessSecret getAwsCredentials() { - return new AwsAccessSecret(getPassword(AWS_ACCESS_KEY_ID_CONFIG), getPassword(AWS_SECRET_ACCESS_KEY_CONFIG)); + public long getS3RetryBackoffMaxDelayMs() { + return s3ConfigFragment.getS3RetryBackoffMaxDelayMs(); } - AWSCredentialsProvider getCustomCredentialsProvider() { - return getConfiguredInstance(AWS_CREDENTIALS_PROVIDER_CONFIG, AWSCredentialsProvider.class); + public int getS3RetryBackoffMaxRetries() { + return s3ConfigFragment.getS3RetryBackoffMaxRetries(); } - String getTargetTopics() { - return getString(TARGET_TOPICS); + public AWSCredentialsProvider getCustomCredentialsProvider() { + return s3ConfigFragment.getCustomCredentialsProvider(); } - String getTargetTopicPartitions() { - return getString(TARGET_TOPIC_PARTITIONS); + public S3ConfigFragment getS3ConfigFragment() { + return s3ConfigFragment; } - String getSchemaRegistryUrl() { - return getString(SCHEMA_REGISTRY_URL); - } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java index 8153213a2..e823f94a9 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java @@ -16,6 +16,8 @@ package io.aiven.kafka.connect.s3.source.config; +import static io.aiven.kafka.connect.config.s3.S3CommonConfig.handleDeprecatedYyyyUppercase; + import java.util.List; import java.util.Map; @@ -25,6 +27,6 @@ public class S3SourceConfigDef extends ConfigDef { @Override public List<ConfigValue> validate(final Map<String, String> props) { - return super.validate(S3SourceConfig.preprocessProperties(props)); + return super.validate(handleDeprecatedYyyyUppercase(props)); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java index afdf89fd6..d211133d7 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java @@ -16,7 +16,7 @@ package io.aiven.kafka.connect.s3.source.utils; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.FETCH_PAGE_SIZE; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.FETCH_PAGE_SIZE; import java.util.HashSet; import java.util.Iterator; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index 2fc195f03..a49978f8d 100644 --- 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -42,7 +42,7 @@ public class OffsetManager { private final Map<Map<String, Object>, Map<String, Object>> offsets; public OffsetManager(final SourceTaskContext context, final S3SourceConfig s3SourceConfig) { - final String s3Bucket = s3SourceConfig.getString(S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG); + final String s3Bucket = s3SourceConfig.getAwsS3BucketName(); final Set<Integer> partitions = parsePartitions(s3SourceConfig); final Set<String> topics = parseTopics(s3SourceConfig); @@ -123,12 +123,12 @@ void updateCurrentOffsets(final Map<String, Object> partitionMap, final Map<Stri } private static Set<Integer> parsePartitions(final S3SourceConfig s3SourceConfig) { - final String partitionString = s3SourceConfig.getString(S3SourceConfig.TARGET_TOPIC_PARTITIONS); + final String partitionString = s3SourceConfig.getTargetTopicPartitions(); return Arrays.stream(partitionString.split(",")).map(Integer::parseInt).collect(Collectors.toSet()); } private static Set<String> parseTopics(final S3SourceConfig s3SourceConfig) { - final String topicString = s3SourceConfig.getString(S3SourceConfig.TARGET_TOPICS); + final String topicString = s3SourceConfig.getTargetTopics(); return Arrays.stream(topicString.split(",")).collect(Collectors.toSet()); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 40bf80bc4..f4386aefe 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -28,8 +28,8 @@ import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.storage.Converter; +import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.input.Transformer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -49,7 +49,7 @@ public static List<SourceRecord> processRecords(final Iterator<S3SourceRecord> s final OffsetManager offsetManager) { final Map<String, String> conversionConfig = new HashMap<>(); - final int maxPollRecords = s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS); + final int maxPollRecords = s3SourceConfig.getMaxPollRecords(); for (int i = 0; sourceRecordIterator.hasNext() && i < maxPollRecords && !connectorStopped.get(); i++) { final S3SourceRecord s3SourceRecord = sourceRecordIterator.next(); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 3a6c40812..5bb6bf2ff 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -16,8 +16,6 @@ package io.aiven.kafka.connect.s3.source.utils; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.MAX_POLL_RECORDS; - import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; @@ -29,8 +27,8 @@ import java.util.regex.Pattern; import 
java.util.stream.Stream; +import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.input.Transformer; import com.amazonaws.AmazonClientException; import com.amazonaws.services.s3.AmazonS3; @@ -155,7 +153,7 @@ private List<S3SourceRecord> readNext() { numOfProcessedRecs++; // Break if we have reached the max records per poll - if (sourceRecords.size() >= s3SourceConfig.getInt(MAX_POLL_RECORDS)) { + if (sourceRecords.size() >= s3SourceConfig.getMaxPollRecords()) { break; } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index c839a1269..b20b713fa 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -16,8 +16,9 @@ package io.aiven.kafka.connect.s3.source; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -35,10 +36,10 @@ import org.apache.kafka.connect.storage.Converter; import org.apache.kafka.connect.storage.OffsetStorageReader; -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.input.ByteArrayTransformer; -import io.aiven.kafka.connect.s3.source.input.InputFormat; -import io.aiven.kafka.connect.s3.source.input.Transformer; +import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; +import io.aiven.kafka.connect.common.source.input.InputFormat; +import io.aiven.kafka.connect.common.source.input.Transformer; +import io.aiven.kafka.connect.config.s3.S3ConfigFragment; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; @@ -85,19 +86,19 @@ public static void setUpClass() { s3Api = new S3Mock.Builder().withPort(s3Port).withInMemoryBackend().build(); s3Api.start(); - commonProperties = Map.of(S3SourceConfig.AWS_ACCESS_KEY_ID_CONFIG, "test_key_id", - S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG, "test_secret_key", - S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET, S3SourceConfig.AWS_S3_ENDPOINT_CONFIG, - "http://localhost:" + s3Port, S3SourceConfig.AWS_S3_REGION_CONFIG, "us-west-2"); + commonProperties = Map.of(S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG, "test_key_id", + S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG, "test_secret_key", + S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET, S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG, + "http://localhost:" + s3Port, S3ConfigFragment.AWS_S3_REGION_CONFIG, "us-west-2"); final AmazonS3ClientBuilder builder = AmazonS3ClientBuilder.standard(); final BasicAWSCredentials awsCreds = new BasicAWSCredentials( - commonProperties.get(S3SourceConfig.AWS_ACCESS_KEY_ID_CONFIG), - 
commonProperties.get(S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG)); + commonProperties.get(S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG), + commonProperties.get(S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG)); builder.withCredentials(new AWSStaticCredentialsProvider(awsCreds)); - builder.withEndpointConfiguration( - new AwsClientBuilder.EndpointConfiguration(commonProperties.get(S3SourceConfig.AWS_S3_ENDPOINT_CONFIG), - commonProperties.get(S3SourceConfig.AWS_S3_REGION_CONFIG))); + builder.withEndpointConfiguration(new AwsClientBuilder.EndpointConfiguration( + commonProperties.get(S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG), + commonProperties.get(S3ConfigFragment.AWS_S3_REGION_CONFIG))); builder.withPathStyleAccessEnabled(true); s3Client = builder.build(); @@ -194,7 +195,7 @@ private void startSourceTask(final S3SourceTask s3SourceTask) { } private void setBasicProperties() { - properties.put(S3SourceConfig.INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); + properties.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); properties.put("name", "test_source_connector"); properties.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); properties.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); @@ -202,5 +203,6 @@ private void setBasicProperties() { properties.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); properties.put(TARGET_TOPIC_PARTITIONS, "0,1"); properties.put(TARGET_TOPICS, "testtopic"); + } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java index 6e84687eb..edbe8dc98 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java @@ -16,14 +16,16 @@ package io.aiven.kafka.connect.s3.source.config; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.SCHEMA_REGISTRY_URL; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; import static org.assertj.core.api.Assertions.assertThat; import java.util.HashMap; -import io.aiven.kafka.connect.s3.source.input.InputFormat; +import io.aiven.kafka.connect.common.source.input.InputFormat; +import io.aiven.kafka.connect.config.s3.S3ConfigFragment; import com.amazonaws.regions.RegionUtils; import com.amazonaws.regions.Regions; @@ -35,15 +37,15 @@ void correctFullConfig() { final var props = new HashMap<String, String>(); // aws props - props.put(S3SourceConfig.AWS_ACCESS_KEY_ID_CONFIG, "AWS_ACCESS_KEY_ID"); - props.put(S3SourceConfig.AWS_SECRET_ACCESS_KEY_CONFIG, "AWS_SECRET_ACCESS_KEY"); - props.put(S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG, "the-bucket"); - props.put(S3SourceConfig.AWS_S3_ENDPOINT_CONFIG, "AWS_S3_ENDPOINT"); - props.put(S3SourceConfig.AWS_S3_PREFIX_CONFIG, "AWS_S3_PREFIX"); - props.put(S3SourceConfig.AWS_S3_REGION_CONFIG, 
Regions.US_EAST_1.getName()); + props.put(S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG, "AWS_ACCESS_KEY_ID"); + props.put(S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG, "AWS_SECRET_ACCESS_KEY"); + props.put(S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG, "the-bucket"); + props.put(S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG, "AWS_S3_ENDPOINT"); + props.put(S3ConfigFragment.AWS_S3_PREFIX_CONFIG, "AWS_S3_PREFIX"); + props.put(S3ConfigFragment.AWS_S3_REGION_CONFIG, Regions.US_EAST_1.getName()); // record, topic specific props - props.put(S3SourceConfig.INPUT_FORMAT_KEY, InputFormat.AVRO.getValue()); + props.put(INPUT_FORMAT_KEY, InputFormat.AVRO.getValue()); props.put(TARGET_TOPIC_PARTITIONS, "0,1"); props.put(TARGET_TOPICS, "testtopic"); props.put(SCHEMA_REGISTRY_URL, "localhost:8081"); @@ -51,8 +53,8 @@ void correctFullConfig() { final var conf = new S3SourceConfig(props); final var awsCredentials = conf.getAwsCredentials(); - assertThat(awsCredentials.getAccessKeyId().value()).isEqualTo("AWS_ACCESS_KEY_ID"); - assertThat(awsCredentials.getSecretAccessKey().value()).isEqualTo("AWS_SECRET_ACCESS_KEY"); + assertThat(awsCredentials.getAWSAccessKeyId()).isEqualTo("AWS_ACCESS_KEY_ID"); + assertThat(awsCredentials.getAWSSecretKey()).isEqualTo("AWS_SECRET_ACCESS_KEY"); assertThat(conf.getAwsS3BucketName()).isEqualTo("the-bucket"); assertThat(conf.getAwsS3EndPoint()).isEqualTo("AWS_S3_ENDPOINT"); assertThat(conf.getAwsS3Region()).isEqualTo(RegionUtils.getRegion("us-east-1")); @@ -62,9 +64,9 @@ void correctFullConfig() { assertThat(conf.getTargetTopicPartitions()).isEqualTo("0,1"); assertThat(conf.getSchemaRegistryUrl()).isEqualTo("localhost:8081"); - assertThat(conf.getS3RetryBackoffDelayMs()).isEqualTo(S3SourceConfig.AWS_S3_RETRY_BACKOFF_DELAY_MS_DEFAULT); + assertThat(conf.getS3RetryBackoffDelayMs()).isEqualTo(S3ConfigFragment.AWS_S3_RETRY_BACKOFF_DELAY_MS_DEFAULT); assertThat(conf.getS3RetryBackoffMaxDelayMs()) - .isEqualTo(S3SourceConfig.AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_DEFAULT); - assertThat(conf.getS3RetryBackoffMaxRetries()).isEqualTo(S3SourceConfig.S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT); + .isEqualTo(S3ConfigFragment.AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_DEFAULT); + assertThat(conf.getS3RetryBackoffMaxRetries()).isEqualTo(S3ConfigFragment.S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT); } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java index 99671781f..a2b4db378 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java @@ -16,19 +16,28 @@ package io.aiven.kafka.connect.s3.source.testutils; +import static org.apache.kafka.connect.data.Schema.INT32_SCHEMA; +import static org.apache.kafka.connect.data.Schema.STRING_SCHEMA; + import java.io.IOException; import java.net.ConnectException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.kafka.common.record.TimestampType; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.sink.SinkRecord; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericData; -import 
org.apache.parquet.avro.AvroParquetWriter; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; -import org.apache.parquet.io.LocalOutputFile; -import org.apache.parquet.io.OutputFile; +import io.aiven.kafka.connect.common.config.OutputField; +import io.aiven.kafka.connect.common.config.OutputFieldEncodingType; +import io.aiven.kafka.connect.common.config.OutputFieldType; +import io.aiven.kafka.connect.common.output.parquet.ParquetOutputWriter; public final class ContentUtils { private ContentUtils() { @@ -44,11 +53,11 @@ public static Path getTmpFilePath(final String name1) throws IOException { public static void writeParquetFile(final String tempFilePath, final String name1) throws IOException { // Define the Avro schema - final String schemaString = "{" + "\"type\":\"record\"," + "\"name\":\"User\"," + "\"fields\":[" - + "{\"name\":\"name\",\"type\":\"string\"}," + "{\"name\":\"age\",\"type\":\"int\"}," - + "{\"name\":\"email\",\"type\":\"string\"}" + "]" + "}"; - final Schema schema = new Schema.Parser().parse(schemaString); - + final Schema schema = SchemaBuilder.struct() + .field("name", STRING_SCHEMA) + .field("age", INT32_SCHEMA) + .field("email", STRING_SCHEMA) + .build(); // Write the Parquet file try { writeParquetFile(tempFilePath, schema, name1, 100); @@ -61,27 +70,30 @@ public static void writeParquetFile(final String tempFilePath, final String name private static void writeParquetFile(final String outputPath, final Schema schema, final String name1, final int numOfRecords) throws IOException { - // Create sample records - GenericData.Record user; + final List<Struct> allParquetRecords = new ArrayList<>(); - // Create a Parquet writer - final OutputFile outputFile = new LocalOutputFile(Paths.get(outputPath)); - try (ParquetWriter<GenericData.Record> writer = AvroParquetWriter.<GenericData.Record>builder(outputFile) - .withSchema(schema) - .withCompressionCodec(CompressionCodecName.SNAPPY) - .withRowGroupSize(100L * 1024L) - .withPageSize(1024 * 1024) - .build()) { - // Write records to the Parquet file - for (int i = 0; i < numOfRecords; i++) { - user = new GenericData.Record(schema); - user.put("name", name1 + i); - user.put("age", 30); - user.put("email", name1 + "@test"); + for (int i = 0; i < numOfRecords; i++) { + allParquetRecords + .add(new Struct(schema).put("name", name1 + i).put("age", 30).put("email", name1 + "@test")); + } - writer.write(user); + // Create a Parquet writer + final Path outputFilePath = Paths.get(outputPath); + try (var outputStream = Files.newOutputStream(outputFilePath.toAbsolutePath()); + var parquetWriter = new ParquetOutputWriter( + List.of(new OutputField(OutputFieldType.VALUE, OutputFieldEncodingType.NONE)), outputStream, + Collections.emptyMap(), false)) { + int counter = 0; + final var sinkRecords = new ArrayList<SinkRecord>(); + for (final var r : allParquetRecords) { + final var sinkRecord = new SinkRecord( // NOPMD AvoidInstantiatingObjectsInLoops + "some-topic", 1, STRING_SCHEMA, "some-key-" + counter, schema, r, 100L, 1000L + counter, + TimestampType.CREATE_TIME, null); + sinkRecords.add(sinkRecord); + counter++; } - + parquetWriter.writeRecords(sinkRecords); } + } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java index cc0887b67..39496b8d2 100644 --- 
a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java @@ -16,6 +16,7 @@ package io.aiven.kafka.connect.s3.source.utils; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; @@ -49,6 +50,7 @@ private static Map<String, String> getConfigMap(final int maxTasks, final int ta final Map<String, String> configMap = new HashMap<>(); configMap.put("tasks.max", String.valueOf(maxTasks)); configMap.put("task.id", String.valueOf(taskId)); + configMap.put(AWS_S3_BUCKET_NAME_CONFIG, "testbucket"); return configMap; } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java index 52d0bfd0e..1367d71f0 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java @@ -16,9 +16,9 @@ package io.aiven.kafka.connect.s3.source.utils; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPICS; -import static io.aiven.kafka.connect.s3.source.config.S3SourceConfig.TARGET_TOPIC_PARTITIONS; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; @@ -31,6 +31,7 @@ import org.apache.kafka.connect.source.SourceTaskContext; import org.apache.kafka.connect.storage.OffsetStorageReader; +import io.aiven.kafka.connect.config.s3.S3ConfigFragment; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import org.junit.jupiter.api.BeforeEach; @@ -139,7 +140,7 @@ void testIncrementAndUpdateOffsetMapNonExistingOffset() { } private void setBasicProperties() { - properties.put(S3SourceConfig.AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET); + properties.put(S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET); properties.put(TARGET_TOPIC_PARTITIONS, "0,1"); properties.put(TARGET_TOPICS, "topic1,topic2"); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java index f6aea18d5..a9ee18917 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -38,8 +38,8 @@ import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.storage.Converter; +import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.input.Transformer; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -75,7 +75,7 @@ void setUp() { @Test void testProcessRecordsNoRecords() { - 
when(s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS)).thenReturn(5); + when(s3SourceConfig.getMaxPollRecords()).thenReturn(5); when(sourceRecordIterator.hasNext()).thenReturn(false); final List<SourceRecord> results = new ArrayList<>(); @@ -94,7 +94,7 @@ void testProcessRecordsNoRecords() { @Test void testProcessRecordsWithRecords() throws ConnectException { - when(s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS)).thenReturn(5); + when(s3SourceConfig.getMaxPollRecords()).thenReturn(5); when(sourceRecordIterator.hasNext()).thenReturn(true, false); // One iteration with records final S3SourceRecord mockRecord = mock(S3SourceRecord.class); @@ -117,7 +117,7 @@ void testProcessRecordsWithRecords() throws ConnectException { @Test void testProcessRecordsConnectorStopped() { - when(s3SourceConfig.getInt(S3SourceConfig.MAX_POLL_RECORDS)).thenReturn(5); + when(s3SourceConfig.getMaxPollRecords()).thenReturn(5); connectorStopped.set(true); // Simulate connector stopped final List<SourceRecord> results = new ArrayList<>(); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index bb4ca8ead..4630432ac 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -29,8 +29,8 @@ import java.util.List; import java.util.stream.Stream; +import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.input.Transformer; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.ListObjectsV2Result; diff --git a/settings.gradle.kts b/settings.gradle.kts index 1f4c61c96..21aca87b9 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -40,6 +40,7 @@ dependencyResolutionManagement { "org.apache.hadoop:hadoop-mapreduce-client-core:$hadoopVersion") library("parquet-avro", "org.apache.parquet:parquet-avro:$parquetVersion") library("parquet-tools", "org.apache.parquet:parquet-tools:$parquetVersion") + library("parquet-hadoop", "org.apache.parquet:parquet-hadoop:$parquetVersion") } create("compressionlibs") { library("snappy", "org.xerial.snappy:snappy-java:$snappyVersion") From 17d69bb8cf1f4cea8a098aeef72cc413d7645e14 Mon Sep 17 00:00:00 2001 From: Aindriu Lavelle <aindriu.lavelle@aiven.io> Date: Thu, 5 Dec 2024 08:38:23 +0000 Subject: [PATCH 74/90] Update to add errors.tolerance configuration Signed-off-by: Aindriu Lavelle <aindriu.lavelle@aiven.io> --- .../common/config/SourceCommonConfig.java | 5 +++ .../common/config/SourceConfigFragment.java | 30 ++++++++++++- .../common/config/enums/ErrorsTolerance.java | 44 +++++++++++++++++++ .../s3/source/utils/RecordProcessor.java | 14 ++++-- .../s3/source/utils/RecordProcessorTest.java | 41 +++++++++++++++++ 5 files changed, 129 insertions(+), 5 deletions(-) create mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/config/enums/ErrorsTolerance.java diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java index 44575e5e0..7fb8cd9b2 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java +++ 
b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java @@ -20,6 +20,7 @@ import org.apache.kafka.common.config.ConfigDef; +import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; import io.aiven.kafka.connect.common.source.input.InputFormat; public class SourceCommonConfig extends CommonConfig { @@ -62,6 +63,10 @@ public String getTargetTopicPartitions() { return sourceConfigFragment.getTargetTopicPartitions(); } + public ErrorsTolerance getErrorsTolerance() { + return ErrorsTolerance.forName(sourceConfigFragment.getErrorsTolerance()); + } + public int getMaxPollRecords() { return sourceConfigFragment.getMaxPollRecords(); } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java index 568610da7..e0c669fcf 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java @@ -19,6 +19,10 @@ import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; +import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; + +import org.codehaus.plexus.util.StringUtils; + public final class SourceConfigFragment extends ConfigFragment { private static final String GROUP_OTHER = "OTHER_CFG"; public static final String MAX_POLL_RECORDS = "max.poll.records"; @@ -26,6 +30,7 @@ public final class SourceConfigFragment extends ConfigFragment { private static final String GROUP_OFFSET_TOPIC = "OFFSET_TOPIC"; public static final String TARGET_TOPIC_PARTITIONS = "topic.partitions"; public static final String TARGET_TOPICS = "topics"; + public static final String ERRORS_TOLERANCE = "errors.tolerance"; /** * Construct the ConfigFragment.. 
@@ -41,9 +46,14 @@ public static ConfigDef update(final ConfigDef configDef) { int sourcePollingConfigCounter = 0; configDef.define(MAX_POLL_RECORDS, ConfigDef.Type.INT, 500, ConfigDef.Range.atLeast(1), - ConfigDef.Importance.MEDIUM, "Max poll records", GROUP_OTHER, sourcePollingConfigCounter++, // NOPMD - // UnusedAssignment + ConfigDef.Importance.MEDIUM, "Max poll records", GROUP_OTHER, sourcePollingConfigCounter++, ConfigDef.Width.NONE, MAX_POLL_RECORDS); + // KIP-298 Error Handling in Connect + configDef.define(ERRORS_TOLERANCE, ConfigDef.Type.STRING, "none", new ErrorsToleranceValidator(), + ConfigDef.Importance.MEDIUM, + "Indicates to the connector what level of exceptions are allowed before the connector stops, supported values : none,all", + GROUP_OTHER, sourcePollingConfigCounter++, ConfigDef.Width.NONE, ERRORS_TOLERANCE); + configDef.define(EXPECTED_MAX_MESSAGE_BYTES, ConfigDef.Type.INT, 1_048_588, ConfigDef.Importance.MEDIUM, "The largest record batch size allowed by Kafka config max.message.bytes", GROUP_OTHER, sourcePollingConfigCounter++, // NOPMD @@ -58,6 +68,7 @@ public static ConfigDef update(final ConfigDef configDef) { configDef.define(TARGET_TOPICS, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "eg : connect-storage-offsets", GROUP_OFFSET_TOPIC, offsetStorageGroupCounter++, ConfigDef.Width.NONE, TARGET_TOPICS); // NOPMD + return configDef; } @@ -77,4 +88,19 @@ public int getExpectedMaxMessageBytes() { return cfg.getInt(EXPECTED_MAX_MESSAGE_BYTES); } + public String getErrorsTolerance() { + return cfg.getString(ERRORS_TOLERANCE); + } + + private static class ErrorsToleranceValidator implements ConfigDef.Validator { + @Override + public void ensureValid(final String name, final Object value) { + final String errorsTolerance = (String) value; + if (StringUtils.isBlank(errorsTolerance)) { + // This will throw an Exception if not a valid value. + ErrorsTolerance.forName(errorsTolerance); + } + } + } + } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/enums/ErrorsTolerance.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/enums/ErrorsTolerance.java new file mode 100644 index 000000000..ccfe58d29 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/enums/ErrorsTolerance.java @@ -0,0 +1,44 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.config.enums; + +import java.util.Arrays; +import java.util.Objects; + +import org.apache.kafka.common.config.ConfigException; + +public enum ErrorsTolerance { + + NONE("none"), ALL("all"); + + public final String name; + + ErrorsTolerance(final String name) { + this.name = name; + } + + public static ErrorsTolerance forName(final String name) { + Objects.requireNonNull(name, "name cannot be null"); + for (final ErrorsTolerance errorsTolerance : ErrorsTolerance.values()) { + if (errorsTolerance.name.equalsIgnoreCase(name)) { + return errorsTolerance; + } + } + throw new ConfigException(String.format("Unknown errors.tolerance type: %s, allowed values %s ", name, + Arrays.toString(ErrorsTolerance.values()))); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index 13104374b..b89cdfd81 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -24,10 +24,12 @@ import java.util.concurrent.atomic.AtomicBoolean; import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.storage.Converter; +import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; @@ -79,9 +81,15 @@ static SourceRecord createSourceRecord(final S3SourceRecord s3SourceRecord, fina s3SourceRecord.setOffsetMap(offsetManager.getOffsets().get(s3SourceRecord.getPartitionMap())); return s3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue); } catch (DataException e) { - LOGGER.error("Error in reading s3 object stream {}", e.getMessage(), e); - sourceClient.addFailedObjectKeys(s3SourceRecord.getObjectKey()); - throw e; + if (ErrorsTolerance.NONE.equals(s3SourceConfig.getErrorsTolerance())) { + throw new ConnectException("Data Exception caught during S3 record to source record transformation", e); + } else { + sourceClient.addFailedObjectKeys(s3SourceRecord.getObjectKey()); + LOGGER.warn( + "Data Exception caught during S3 record to source record transformation {} . 
errors.tolerance set to 'all', logging warning and continuing to process.", + e.getMessage(), e); + return null; + } } } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java index d304bc59b..11dae1dc0 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -17,6 +17,7 @@ package io.aiven.kafka.connect.s3.source.utils; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.mock; @@ -35,9 +36,11 @@ import java.util.concurrent.atomic.AtomicBoolean; import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.storage.Converter; +import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; @@ -151,4 +154,42 @@ void testCreateSourceRecords() { assertThat(sourceRecords).isNotNull(); } + + @Test + void errorToleranceOnNONE() { + final S3SourceRecord mockRecord = mock(S3SourceRecord.class); + when(mockRecord.getTopic()).thenReturn("test-topic"); + when(mockRecord.key()).thenReturn("mock-key".getBytes(StandardCharsets.UTF_8)); + when(mockRecord.value()).thenReturn("mock-value".getBytes(StandardCharsets.UTF_8)); + + when(valueConverter.toConnectData(anyString(), any())) + .thenReturn(new SchemaAndValue(null, "mock-value-converted")); + when(mockRecord.getSourceRecord(anyString(), any(), any())).thenThrow(new DataException("generic issue")); + + when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.NONE); + + assertThatThrownBy(() -> RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, + Optional.of(keyConverter), valueConverter, new HashMap<>(), transformer, sourceClient, offsetManager)) + .isInstanceOf(org.apache.kafka.connect.errors.ConnectException.class) + .hasMessage("Data Exception caught during S3 record to source record transformation"); + + } + + @Test + void errorToleranceOnALL() { + final S3SourceRecord mockRecord = mock(S3SourceRecord.class); + when(mockRecord.getTopic()).thenReturn("test-topic"); + when(mockRecord.key()).thenReturn("mock-key".getBytes(StandardCharsets.UTF_8)); + when(mockRecord.value()).thenReturn("mock-value".getBytes(StandardCharsets.UTF_8)); + + when(valueConverter.toConnectData(anyString(), any())) + .thenReturn(new SchemaAndValue(null, "mock-value-converted")); + when(mockRecord.getSourceRecord(anyString(), any(), any())).thenThrow(new DataException("generic issue")); + + when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.ALL); + + assertThat(RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, Optional.of(keyConverter), + valueConverter, new HashMap<>(), transformer, sourceClient, offsetManager)).isNull(); + + } } From 076e4242b60108f16699e9c4394a181b3b98dc1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aindri=C3=BA=20Lavelle?= <121855584+aindriu-aiven@users.noreply.github.com> Date: Fri, 6 Dec 2024 16:56:54 +0000 Subject: [PATCH 75/90] 
Split api layer from file reader (#365) The update makes updates to the SourceRecordIterator to remove the requirement for a S3Client and specific S3 knowledge from the iterator. The iterator will now also call for more files after the initial set of files has been processed. The only remaining work to be done is to remove the construction of the S3Object into an iterator from the SourceRecordIterator in a follow up PR which will allow it to be completely re-useable. --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: Aindriu Lavelle <aindriu.lavelle@aiven.io> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gcs-sink-connector/build.gradle.kts | 2 +- .../connect/config/s3/S3ConfigFragment.java | 5 + .../kafka/connect/s3/source/S3SourceTask.java | 26 +--- .../s3/source/utils/AWSV2SourceClient.java | 133 ++++++++++++++++++ .../connect/s3/source/utils/FileReader.java | 80 ----------- .../s3/source/utils/RecordProcessor.java | 10 +- .../s3/source/utils/SourceRecordIterator.java | 72 +++++----- ...erTest.java => AWSV2SourceClientTest.java} | 37 ++--- .../s3/source/utils/RecordProcessorTest.java | 10 +- .../utils/SourceRecordIteratorTest.java | 38 ++--- 10 files changed, 223 insertions(+), 190 deletions(-) create mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java delete mode 100644 s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java rename s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/{FileReaderTest.java => AWSV2SourceClientTest.java} (84%) diff --git a/gcs-sink-connector/build.gradle.kts b/gcs-sink-connector/build.gradle.kts index 2c33f4c67..4af195ba7 100644 --- a/gcs-sink-connector/build.gradle.kts +++ b/gcs-sink-connector/build.gradle.kts @@ -98,7 +98,7 @@ dependencies { testImplementation(apache.kafka.connect.api) testImplementation(apache.kafka.connect.runtime) testImplementation(apache.kafka.connect.json) - testImplementation("com.google.cloud:google-cloud-nio:0.127.26") + testImplementation("com.google.cloud:google-cloud-nio:0.127.27") testImplementation(compressionlibs.snappy) testImplementation(compressionlibs.zstd.jni) diff --git a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java b/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java index 8b38c8f4c..1e86638e1 100644 --- a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java +++ b/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java @@ -483,4 +483,9 @@ public int getS3RetryBackoffMaxRetries() { public AWSCredentialsProvider getCustomCredentialsProvider() { return cfg.getConfiguredInstance(AWS_CREDENTIALS_PROVIDER_CONFIG, AWSCredentialsProvider.class); } + + public int getFetchPageSize() { + return cfg.getInt(FETCH_PAGE_SIZE); + } + } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 86a870bcd..be3d89618 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -17,7 +17,6 @@ package io.aiven.kafka.connect.s3.source; import static io.aiven.kafka.connect.common.config.SourceConfigFragment.MAX_POLL_RECORDS; -import static 
io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG; import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; @@ -37,9 +36,8 @@ import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.common.source.input.TransformerFactory; -import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.utils.FileReader; +import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import io.aiven.kafka.connect.s3.source.utils.RecordProcessor; import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; @@ -79,15 +77,12 @@ public class S3SourceTask extends SourceTask { private Transformer transformer; - private String s3Bucket; - private boolean taskInitialized; private final AtomicBoolean connectorStopped = new AtomicBoolean(); - private final S3ClientFactory s3ClientFactory = new S3ClientFactory(); private final Object pollLock = new Object(); - private FileReader fileReader; + private AWSV2SourceClient awsv2SourceClient; private final Set<String> failedObjectKeys = new HashSet<>(); private final Set<String> inProcessObjectKeys = new HashSet<>(); @@ -108,11 +103,9 @@ public void start(final Map<String, String> props) { LOGGER.info("S3 Source task started."); s3SourceConfig = new S3SourceConfig(props); initializeConverters(); - initializeS3Client(); - this.s3Bucket = s3SourceConfig.getString(AWS_S3_BUCKET_NAME_CONFIG); this.transformer = TransformerFactory.getTransformer(s3SourceConfig); offsetManager = new OffsetManager(context, s3SourceConfig); - fileReader = new FileReader(s3SourceConfig, this.s3Bucket, failedObjectKeys); + awsv2SourceClient = new AWSV2SourceClient(s3SourceConfig, failedObjectKeys); prepareReaderFromOffsetStorageReader(); this.taskInitialized = true; } @@ -132,14 +125,9 @@ private void initializeConverters() { } } - private void initializeS3Client() { - this.s3Client = s3ClientFactory.createAmazonS3Client(s3SourceConfig); - LOGGER.debug("S3 client initialized"); - } - private void prepareReaderFromOffsetStorageReader() { - sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, s3Client, this.s3Bucket, offsetManager, - this.transformer, fileReader); + sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, this.transformer, + awsv2SourceClient); } @Override @@ -187,7 +175,7 @@ private List<SourceRecord> extractSourceRecords(final List<SourceRecord> results return results; } return RecordProcessor.processRecords(sourceRecordIterator, results, s3SourceConfig, keyConverter, - valueConverter, connectorStopped, this.transformer, fileReader, offsetManager); + valueConverter, connectorStopped, this.transformer, awsv2SourceClient, offsetManager); } private void waitForObjects() throws InterruptedException { @@ -208,7 +196,7 @@ public void stop() { } private void closeResources() { - s3Client.shutdown(); + awsv2SourceClient.shutdown(); } // below for visibility in tests diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java new file mode 100644 index 000000000..1689ec9fa --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java @@ -0,0 +1,133 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the 
Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.utils; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.Objects; +import java.util.Set; +import java.util.function.Predicate; +import java.util.stream.Stream; + +import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.ListObjectsV2Request; +import com.amazonaws.services.s3.model.S3Object; +import com.amazonaws.services.s3.model.S3ObjectSummary; +import org.codehaus.plexus.util.StringUtils; + +/** + * Called AWSV2SourceClient as this source client implements the V2 version of the aws client library. Handles all calls + * and authentication to AWS and returns useable objects to the SourceRecordIterator. + */ +public class AWSV2SourceClient { + + public static final int PAGE_SIZE_FACTOR = 2; + private final S3SourceConfig s3SourceConfig; + private final AmazonS3 s3Client; + private final String bucketName; + + private Predicate<S3ObjectSummary> filterPredicate = summary -> summary.getSize() > 0; + private final Set<String> failedObjectKeys; + + /** + * @param s3SourceConfig + * configuration for Source connector + * @param failedObjectKeys + * all objectKeys which have already been tried but have been unable to process. + */ + public AWSV2SourceClient(final S3SourceConfig s3SourceConfig, final Set<String> failedObjectKeys) { + this.s3SourceConfig = s3SourceConfig; + final S3ClientFactory s3ClientFactory = new S3ClientFactory(); + this.s3Client = s3ClientFactory.createAmazonS3Client(s3SourceConfig); + this.bucketName = s3SourceConfig.getAwsS3BucketName(); + this.failedObjectKeys = new HashSet<>(failedObjectKeys); + } + + /** + * Valid for testing + * + * @param s3Client + * amazonS3Client + * @param s3SourceConfig + * configuration for Source connector + * @param failedObjectKeys + * all objectKeys which have already been tried but have been unable to process. + */ + AWSV2SourceClient(final AmazonS3 s3Client, final S3SourceConfig s3SourceConfig, + final Set<String> failedObjectKeys) { + this.s3SourceConfig = s3SourceConfig; + this.s3Client = s3Client; + this.bucketName = s3SourceConfig.getAwsS3BucketName(); + this.failedObjectKeys = new HashSet<>(failedObjectKeys); + } + + public Iterator<String> getListOfObjectKeys(final String startToken) { + final ListObjectsV2Request request = new ListObjectsV2Request().withBucketName(bucketName) + .withMaxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR); + + if (StringUtils.isNotBlank(startToken)) { + request.withStartAfter(startToken); + } + + final Stream<String> s3ObjectKeyStream = Stream + .iterate(s3Client.listObjectsV2(request), Objects::nonNull, response -> { + // This is called every time next() is called on the iterator. 
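+ // A truncated response means more pages remain, so the next page is fetched with the continuation token;
+ // returning null here terminates the Stream.iterate sequence (its hasNext predicate is Objects::nonNull).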
+ if (response.isTruncated()) { + return s3Client.listObjectsV2(new ListObjectsV2Request().withBucketName(bucketName) + .withMaxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR) + .withContinuationToken(response.getNextContinuationToken())); + } else { + return null; + } + + }) + .flatMap(response -> response.getObjectSummaries() + .stream() + .filter(filterPredicate) + .filter(objectSummary -> assignObjectToTask(objectSummary.getKey())) + .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey()))) + .map(S3ObjectSummary::getKey); + return s3ObjectKeyStream.iterator(); + } + + public S3Object getObject(final String objectKey) { + return s3Client.getObject(bucketName, objectKey); + } + + public void addFailedObjectKeys(final String objectKey) { + this.failedObjectKeys.add(objectKey); + } + + public void setFilterPredicate(final Predicate<S3ObjectSummary> predicate) { + filterPredicate = predicate; + } + + private boolean assignObjectToTask(final String objectKey) { + final int maxTasks = Integer.parseInt(s3SourceConfig.originals().get("tasks.max").toString()); + final int taskId = Integer.parseInt(s3SourceConfig.originals().get("task.id").toString()) % maxTasks; + final int taskAssignment = Math.floorMod(objectKey.hashCode(), maxTasks); + return taskAssignment == taskId; + } + + public void shutdown() { + s3Client.shutdown(); + } + +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java deleted file mode 100644 index d211133d7..000000000 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.aiven.kafka.connect.s3.source.utils; - -import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.FETCH_PAGE_SIZE; - -import java.util.HashSet; -import java.util.Iterator; -import java.util.Objects; -import java.util.Set; -import java.util.stream.Stream; - -import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; - -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.model.ListObjectsV2Request; -import com.amazonaws.services.s3.model.S3ObjectSummary; - -public class FileReader { - - public static final int PAGE_SIZE_FACTOR = 2; - private final S3SourceConfig s3SourceConfig; - private final String bucketName; - - private final Set<String> failedObjectKeys; - - public FileReader(final S3SourceConfig s3SourceConfig, final String bucketName, - final Set<String> failedObjectKeys) { - this.s3SourceConfig = s3SourceConfig; - this.bucketName = bucketName; - this.failedObjectKeys = new HashSet<>(failedObjectKeys); - } - - Iterator<S3ObjectSummary> fetchObjectSummaries(final AmazonS3 s3Client) { - final ListObjectsV2Request request = new ListObjectsV2Request().withBucketName(bucketName) - .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR); - - final Stream<S3ObjectSummary> s3ObjectStream = Stream - .iterate(s3Client.listObjectsV2(request), Objects::nonNull, response -> { - if (response.isTruncated()) { - return s3Client.listObjectsV2(new ListObjectsV2Request().withBucketName(bucketName) - .withMaxKeys(s3SourceConfig.getInt(FETCH_PAGE_SIZE) * PAGE_SIZE_FACTOR) - .withContinuationToken(response.getNextContinuationToken())); - } else { - return null; - } - }) - .flatMap(response -> response.getObjectSummaries() - .stream() - .filter(objectSummary -> objectSummary.getSize() > 0) - .filter(objectSummary -> assignObjectToTask(objectSummary.getKey())) - .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey()))); - return s3ObjectStream.iterator(); - } - - public void addFailedObjectKeys(final String objectKey) { - this.failedObjectKeys.add(objectKey); - } - - private boolean assignObjectToTask(final String objectKey) { - final int maxTasks = Integer.parseInt(s3SourceConfig.originals().get("tasks.max").toString()); - final int taskId = Integer.parseInt(s3SourceConfig.originals().get("task.id").toString()) % maxTasks; - final int taskAssignment = Math.floorMod(objectKey.hashCode(), maxTasks); - return taskAssignment == taskId; - } -} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index f4386aefe..13104374b 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -45,7 +45,7 @@ private RecordProcessor() { public static List<SourceRecord> processRecords(final Iterator<S3SourceRecord> sourceRecordIterator, final List<SourceRecord> results, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, - final AtomicBoolean connectorStopped, final Transformer transformer, final FileReader fileReader, + final AtomicBoolean connectorStopped, final Transformer transformer, final AWSV2SourceClient sourceClient, final OffsetManager offsetManager) { final Map<String, String> conversionConfig = new HashMap<>(); @@ -55,7 +55,7 @@ public static List<SourceRecord> 
processRecords(final Iterator<S3SourceRecord> s final S3SourceRecord s3SourceRecord = sourceRecordIterator.next(); if (s3SourceRecord != null) { final SourceRecord sourceRecord = createSourceRecord(s3SourceRecord, s3SourceConfig, keyConverter, - valueConverter, conversionConfig, transformer, fileReader, offsetManager); + valueConverter, conversionConfig, transformer, sourceClient, offsetManager); results.add(sourceRecord); } } @@ -65,8 +65,8 @@ public static List<SourceRecord> processRecords(final Iterator<S3SourceRecord> s static SourceRecord createSourceRecord(final S3SourceRecord s3SourceRecord, final S3SourceConfig s3SourceConfig, final Optional<Converter> keyConverter, final Converter valueConverter, - final Map<String, String> conversionConfig, final Transformer transformer, final FileReader fileReader, - final OffsetManager offsetManager) { + final Map<String, String> conversionConfig, final Transformer transformer, + final AWSV2SourceClient sourceClient, final OffsetManager offsetManager) { final String topic = s3SourceRecord.getTopic(); final Optional<SchemaAndValue> keyData = keyConverter.map(c -> c.toConnectData(topic, s3SourceRecord.key())); @@ -80,7 +80,7 @@ static SourceRecord createSourceRecord(final S3SourceRecord s3SourceRecord, fina return s3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue); } catch (DataException e) { LOGGER.error("Error in reading s3 object stream {}", e.getMessage(), e); - fileReader.addFailedObjectKeys(s3SourceRecord.getObjectKey()); + sourceClient.addFailedObjectKeys(s3SourceRecord.getObjectKey()); throw e; } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 5bb6bf2ff..43ca7a717 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -31,9 +31,7 @@ import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import com.amazonaws.AmazonClientException; -import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.S3Object; -import com.amazonaws.services.s3.model.S3ObjectSummary; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -50,40 +48,43 @@ public final class SourceRecordIterator implements Iterator<S3SourceRecord> { + "(?<partitionId>\\d{5})-" + "(?<uniqueId>[a-zA-Z0-9]+)" + "\\.(?<fileExtension>[^.]+)$"); // topic-00001.txt private String currentObjectKey; - private final Iterator<S3ObjectSummary> s3ObjectSummaryIterator; + private Iterator<String> objectListIterator; private Iterator<S3SourceRecord> recordIterator = Collections.emptyIterator(); private final OffsetManager offsetManager; private final S3SourceConfig s3SourceConfig; private final String bucketName; - private final AmazonS3 s3Client; private final Transformer transformer; + // Once we decouple the S3Object from the Source Iterator we can change this to be the SourceApiClient + // At which point it will work for al our integrations. 
+ private final AWSV2SourceClient sourceClient; // NOPMD - private final FileReader fileReader; // NOPMD - - public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final AmazonS3 s3Client, final String bucketName, - final OffsetManager offsetManager, final Transformer transformer, final FileReader fileReader) { + public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final OffsetManager offsetManager, + final Transformer transformer, final AWSV2SourceClient sourceClient) { this.s3SourceConfig = s3SourceConfig; this.offsetManager = offsetManager; - this.s3Client = s3Client; - this.bucketName = bucketName; + + this.bucketName = s3SourceConfig.getAwsS3BucketName(); this.transformer = transformer; - this.fileReader = fileReader; - s3ObjectSummaryIterator = fileReader.fetchObjectSummaries(s3Client); + this.sourceClient = sourceClient; + objectListIterator = sourceClient.getListOfObjectKeys(null); } private void nextS3Object() { - if (!s3ObjectSummaryIterator.hasNext()) { - recordIterator = Collections.emptyIterator(); - return; + if (!objectListIterator.hasNext()) { + // Start after the object Key we have just finished with. + objectListIterator = sourceClient.getListOfObjectKeys(currentObjectKey); + if (!objectListIterator.hasNext()) { + recordIterator = Collections.emptyIterator(); + return; + } } try { - final S3ObjectSummary file = s3ObjectSummaryIterator.next(); - if (file != null) { - currentObjectKey = file.getKey(); + currentObjectKey = objectListIterator.next(); + if (currentObjectKey != null) { recordIterator = createIteratorForCurrentFile(); } } catch (IOException e) { @@ -92,29 +93,30 @@ private void nextS3Object() { } private Iterator<S3SourceRecord> createIteratorForCurrentFile() throws IOException { - try (S3Object s3Object = s3Client.getObject(bucketName, currentObjectKey);) { - final Matcher fileMatcher = FILE_DEFAULT_PATTERN.matcher(currentObjectKey); - String topicName; - int defaultPartitionId; + final Matcher fileMatcher = FILE_DEFAULT_PATTERN.matcher(currentObjectKey); + String topicName; + int defaultPartitionId; + + if (fileMatcher.find()) { + // TODO move this from the SourceRecordIterator so that we can decouple it from S3 and make it API agnostic + try (S3Object s3Object = sourceClient.getObject(currentObjectKey);) { - if (fileMatcher.find()) { topicName = fileMatcher.group(PATTERN_TOPIC_KEY); defaultPartitionId = Integer.parseInt(fileMatcher.group(PATTERN_PARTITION_KEY)); - } else { - LOGGER.error("File naming doesn't match to any topic. {}", currentObjectKey); - s3Object.close(); - return Collections.emptyIterator(); - } - final long defaultStartOffsetId = 1L; + final long defaultStartOffsetId = 1L; - final String finalTopic = topicName; - final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(topicName, defaultPartitionId, - bucketName); + final String finalTopic = topicName; + final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(topicName, defaultPartitionId, + bucketName); - return getObjectIterator(s3Object, finalTopic, defaultPartitionId, defaultStartOffsetId, transformer, - partitionMap); + return getObjectIterator(s3Object, finalTopic, defaultPartitionId, defaultStartOffsetId, transformer, + partitionMap); + } + } else { + LOGGER.error("File naming doesn't match to any topic. 
{}", currentObjectKey); + return Collections.emptyIterator(); } } @@ -197,7 +199,7 @@ public S3SourceRecord next() { @Override public boolean hasNext() { - return recordIterator.hasNext() || s3ObjectSummaryIterator.hasNext(); + return recordIterator.hasNext() || objectListIterator.hasNext(); } @Override diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java similarity index 84% rename from s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java rename to s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java index 39496b8d2..5b5176690 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/FileReaderTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java @@ -20,6 +20,8 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; import java.io.IOException; @@ -39,18 +41,18 @@ import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.CsvSource; -class FileReaderTest { +class AWSV2SourceClientTest { - private static final String TEST_BUCKET = "test-bucket"; private AmazonS3 s3Client; - private FileReader fileReader; + private AWSV2SourceClient awsv2SourceClient; private static Map<String, String> getConfigMap(final int maxTasks, final int taskId) { final Map<String, String> configMap = new HashMap<>(); configMap.put("tasks.max", String.valueOf(maxTasks)); configMap.put("task.id", String.valueOf(taskId)); - configMap.put(AWS_S3_BUCKET_NAME_CONFIG, "testbucket"); + + configMap.put(AWS_S3_BUCKET_NAME_CONFIG, "test-bucket"); return configMap; } @@ -61,7 +63,7 @@ void testFetchObjectSummariesWithNoObjects(final int maxTasks, final int taskId) final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result(Collections.emptyList(), null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); - final Iterator<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); + final Iterator<String> summaries = awsv2SourceClient.getListOfObjectKeys(null); assertThat(summaries).isExhausted(); } @@ -71,9 +73,8 @@ void testFetchObjectSummariesWithOneObjectWithBasicConfig(final int maxTasks, fi final String objectKey = "any-key"; initializeWithTaskConfigs(maxTasks, taskId); - final Iterator<S3ObjectSummary> summaries = getS3ObjectSummaryIterator(objectKey); + final Iterator<String> summaries = getS3ObjectKeysIterator(objectKey); assertThat(summaries).hasNext(); - assertThat(summaries.next().getSize()).isEqualTo(1); } @ParameterizedTest @@ -81,9 +82,8 @@ void testFetchObjectSummariesWithOneObjectWithBasicConfig(final int maxTasks, fi void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdAssigned(final int maxTasks, final int taskId, final String objectKey) { initializeWithTaskConfigs(maxTasks, taskId); - final Iterator<S3ObjectSummary> summaries = getS3ObjectSummaryIterator(objectKey); + final Iterator<String> summaries = getS3ObjectKeysIterator(objectKey); assertThat(summaries).hasNext(); - assertThat(summaries.next().getSize()).isEqualTo(1); } @ParameterizedTest @@ -92,7 +92,8 
@@ void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdAssigned(final in void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdUnassigned(final int maxTasks, final int taskId, final String objectKey) { initializeWithTaskConfigs(maxTasks, taskId); - final Iterator<S3ObjectSummary> summaries = getS3ObjectSummaryIterator(objectKey); + final Iterator<String> summaries = getS3ObjectKeysIterator(objectKey); + assertThat(summaries).isExhausted(); } @@ -103,11 +104,11 @@ void testFetchObjectSummariesWithZeroByteObject(final int maxTasks, final int ta final ListObjectsV2Result listObjectsV2Result = getListObjectsV2Result(); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); - final Iterator<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); + final Iterator<String> summaries = awsv2SourceClient.getListOfObjectKeys(null); // assigned 1 object to taskid assertThat(summaries).hasNext(); - assertThat(summaries.next().getSize()).isEqualTo(1); + assertThat(summaries.next()).isNotBlank(); assertThat(summaries).isExhausted(); } @@ -124,9 +125,10 @@ void testFetchObjectSummariesWithPagination() throws IOException { when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstResult).thenReturn(secondResult); - final Iterator<S3ObjectSummary> summaries = fileReader.fetchObjectSummaries(s3Client); - + final Iterator<String> summaries = awsv2SourceClient.getListOfObjectKeys(null); + verify(s3Client, times(1)).listObjectsV2(any(ListObjectsV2Request.class)); assertThat(summaries.next()).isNotNull(); + assertThat(summaries).isExhausted(); } private ListObjectsV2Result createListObjectsV2Result(final List<S3ObjectSummary> summaries, @@ -145,20 +147,21 @@ private S3ObjectSummary createObjectSummary(final long sizeOfObject, final Strin return summary; } - private Iterator<S3ObjectSummary> getS3ObjectSummaryIterator(final String objectKey) { + private Iterator<String> getS3ObjectKeysIterator(final String objectKey) { final S3ObjectSummary objectSummary = createObjectSummary(1, objectKey); final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result( Collections.singletonList(objectSummary), null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); - return fileReader.fetchObjectSummaries(s3Client); + return awsv2SourceClient.getListOfObjectKeys(null); } public void initializeWithTaskConfigs(final int maxTasks, final int taskId) { final Map<String, String> configMap = getConfigMap(maxTasks, taskId); final S3SourceConfig s3SourceConfig = new S3SourceConfig(configMap); - fileReader = new FileReader(s3SourceConfig, TEST_BUCKET, Collections.emptySet()); s3Client = mock(AmazonS3.class); + awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig, Collections.emptySet()); + } private ListObjectsV2Result getListObjectsV2Result() { diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java index a9ee18917..d304bc59b 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -62,7 +62,7 @@ class RecordProcessorTest { private OffsetManager offsetManager; @Mock - private FileReader fileReader; + private AWSV2SourceClient sourceClient; private AtomicBoolean 
connectorStopped; private Iterator<S3SourceRecord> sourceRecordIterator; @@ -86,7 +86,7 @@ void testProcessRecordsNoRecords() { Optional.of(keyConverter), valueConverter, connectorStopped, - transformer, fileReader, offsetManager + transformer, sourceClient, offsetManager ); assertThat(processedRecords).as("Processed records should be empty when there are no records.").isEmpty(); @@ -108,7 +108,7 @@ void testProcessRecordsWithRecords() throws ConnectException { Optional.of(keyConverter), valueConverter, connectorStopped, - transformer, fileReader, offsetManager + transformer, sourceClient, offsetManager ); assertThat(results).hasSize(1); @@ -128,7 +128,7 @@ void testProcessRecordsConnectorStopped() { Optional.of(keyConverter), valueConverter, connectorStopped, - transformer, fileReader, offsetManager + transformer, sourceClient, offsetManager ); assertThat(processedRecords).as("Processed records should be empty when connector is stopped.").isEmpty(); @@ -147,7 +147,7 @@ void testCreateSourceRecords() { when(mockRecord.getSourceRecord(anyString(), any(), any())).thenReturn(mock(SourceRecord.class)); final SourceRecord sourceRecords = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, - Optional.of(keyConverter), valueConverter, new HashMap<>(), transformer, fileReader, offsetManager); + Optional.of(keyConverter), valueConverter, new HashMap<>(), transformer, sourceClient, offsetManager); assertThat(sourceRecords).isNotNull(); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index 4630432ac..5e6d7928b 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -26,54 +26,42 @@ import java.io.ByteArrayInputStream; import java.nio.charset.StandardCharsets; import java.util.Collections; -import java.util.List; import java.util.stream.Stream; import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.model.ListObjectsV2Result; import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectInputStream; -import com.amazonaws.services.s3.model.S3ObjectSummary; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; final class SourceRecordIteratorTest { - private AmazonS3 mockS3Client; private S3SourceConfig mockConfig; private OffsetManager mockOffsetManager; private Transformer mockTransformer; - private FileReader mockFileReader; + private AWSV2SourceClient mockSourceApiClient; @BeforeEach public void setUp() { - mockS3Client = mock(AmazonS3.class); mockConfig = mock(S3SourceConfig.class); mockOffsetManager = mock(OffsetManager.class); mockTransformer = mock(Transformer.class); - mockFileReader = mock(FileReader.class); + mockSourceApiClient = mock(AWSV2SourceClient.class); } @Test void testIteratorProcessesS3Objects() throws Exception { - final S3ObjectSummary mockSummary = new S3ObjectSummary(); - mockSummary.setKey("topic-00001-abc123.txt"); - // Mock list of S3 object summaries - final List<S3ObjectSummary> mockObjectSummaries = Collections.singletonList(mockSummary); - - final ListObjectsV2Result result = mockListObjectsResult(mockObjectSummaries); 
- when(mockS3Client.listObjectsV2(anyString())).thenReturn(result); + final String key = "topic-00001-abc123.txt"; // Mock S3Object and InputStream try (S3Object mockS3Object = mock(S3Object.class); S3ObjectInputStream mockInputStream = new S3ObjectInputStream(new ByteArrayInputStream(new byte[] {}), null);) { - when(mockS3Client.getObject(anyString(), anyString())).thenReturn(mockS3Object); + when(mockSourceApiClient.getObject(anyString())).thenReturn(mockS3Object); when(mockS3Object.getObjectContent()).thenReturn(mockInputStream); when(mockTransformer.getRecords(any(), anyString(), anyInt(), any())).thenReturn(Stream.of(new Object())); @@ -84,26 +72,20 @@ void testIteratorProcessesS3Objects() throws Exception { when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); - when(mockFileReader.fetchObjectSummaries(any())).thenReturn(Collections.emptyIterator()); - SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockS3Client, "test-bucket", - mockOffsetManager, mockTransformer, mockFileReader); + when(mockSourceApiClient.getListOfObjectKeys(any())).thenReturn(Collections.emptyIterator()); + SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, + mockSourceApiClient); assertThat(iterator.hasNext()).isFalse(); assertThat(iterator.next()).isNull(); - when(mockFileReader.fetchObjectSummaries(any())).thenReturn(mockObjectSummaries.listIterator()); + when(mockSourceApiClient.getListOfObjectKeys(any())) + .thenReturn(Collections.singletonList(key).listIterator()); - iterator = new SourceRecordIterator(mockConfig, mockS3Client, "test-bucket", mockOffsetManager, - mockTransformer, mockFileReader); + iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, mockSourceApiClient); assertThat(iterator.hasNext()).isTrue(); assertThat(iterator.next()).isNotNull(); } } - - private ListObjectsV2Result mockListObjectsResult(final List<S3ObjectSummary> summaries) { - final ListObjectsV2Result result = mock(ListObjectsV2Result.class); - when(result.getObjectSummaries()).thenReturn(summaries); - return result; - } } From a67f27b5c73361dad7e9d71afbad8e552990e734 Mon Sep 17 00:00:00 2001 From: Aindriu Lavelle <aindriu.lavelle@aiven.io> Date: Fri, 6 Dec 2024 09:23:11 +0000 Subject: [PATCH 76/90] Correct check on validator Signed-off-by: Aindriu Lavelle <aindriu.lavelle@aiven.io> --- .../kafka/connect/common/config/SourceConfigFragment.java | 6 +++--- .../kafka/connect/common/config/enums/ErrorsTolerance.java | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java index e0c669fcf..c62431dcb 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java @@ -49,8 +49,8 @@ public static ConfigDef update(final ConfigDef configDef) { ConfigDef.Importance.MEDIUM, "Max poll records", GROUP_OTHER, sourcePollingConfigCounter++, ConfigDef.Width.NONE, MAX_POLL_RECORDS); // KIP-298 Error Handling in Connect - configDef.define(ERRORS_TOLERANCE, ConfigDef.Type.STRING, "none", new ErrorsToleranceValidator(), - ConfigDef.Importance.MEDIUM, + configDef.define(ERRORS_TOLERANCE, ConfigDef.Type.STRING, ErrorsTolerance.NONE.name(), + new ErrorsToleranceValidator(), ConfigDef.Importance.MEDIUM, "Indicates to the connector 
what level of exceptions are allowed before the connector stops, supported values : none,all", GROUP_OTHER, sourcePollingConfigCounter++, ConfigDef.Width.NONE, ERRORS_TOLERANCE); @@ -96,7 +96,7 @@ private static class ErrorsToleranceValidator implements ConfigDef.Validator { @Override public void ensureValid(final String name, final Object value) { final String errorsTolerance = (String) value; - if (StringUtils.isBlank(errorsTolerance)) { + if (StringUtils.isNotBlank(errorsTolerance)) { // This will throw an Exception if not a valid value. ErrorsTolerance.forName(errorsTolerance); } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/enums/ErrorsTolerance.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/enums/ErrorsTolerance.java index ccfe58d29..9c42c46d9 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/enums/ErrorsTolerance.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/enums/ErrorsTolerance.java @@ -25,7 +25,7 @@ public enum ErrorsTolerance { NONE("none"), ALL("all"); - public final String name; + private final String name; ErrorsTolerance(final String name) { this.name = name; From 3a1853a8c91fe1238798aae811cf2efe76f7a207 Mon Sep 17 00:00:00 2001 From: Murali Basani <muralidhar.basani@aiven.io> Date: Tue, 10 Dec 2024 15:40:56 +0100 Subject: [PATCH 77/90] Skip records if already processed [KCON-36] (#356) KCON-36 If a certain number of records are already processed in a file during dr, do not re process those records. Number of processed recs are already stored in offset storage. Retrieve that and skip in the stream. --- .../common/source/input/AvroTransformer.java | 43 ++++-------- .../source/input/ByteArrayTransformer.java | 21 +++++- .../common/source/input/JsonTransformer.java | 9 +-- .../source/input/ParquetTransformer.java | 8 +-- .../common/source/input/Transformer.java | 2 +- .../source/input/AvroTransformerTest.java | 67 ++++++++++++++----- .../input/ByteArrayTransformerTest.java | 4 +- .../source/input/JsonTransformerTest.java | 42 ++++++++++-- .../source/input/ParquetTransformerTest.java | 36 ++++++++-- .../s3/source/utils/OffsetManager.java | 13 +--- .../s3/source/utils/SourceRecordIterator.java | 35 ++++++---- .../utils/SourceRecordIteratorTest.java | 61 ++++++++++++++++- 12 files changed, 243 insertions(+), 98 deletions(-) diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java index 8869acb52..bdd52c4ed 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java @@ -20,9 +20,7 @@ import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; import java.util.Collections; -import java.util.List; import java.util.Map; import java.util.Spliterator; import java.util.function.Consumer; @@ -31,13 +29,10 @@ import org.apache.kafka.common.config.AbstractConfig; -import org.apache.avro.file.DataFileReader; import org.apache.avro.file.DataFileStream; -import org.apache.avro.file.SeekableByteArrayInput; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumReader; -import org.apache.commons.io.IOUtils; import org.apache.commons.io.function.IOSupplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -53,9 +48,9 @@ public void 
configureValueConverter(final Map<String, String> config, final Abst @Override public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, - final int topicPartition, final AbstractConfig sourceConfig) { + final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); - return readAvroRecordsAsStream(inputStreamIOSupplier, datumReader); + return readAvroRecordsAsStream(inputStreamIOSupplier, datumReader, skipRecords); } @Override @@ -65,7 +60,7 @@ public byte[] getValueBytes(final Object record, final String topic, final Abstr } private Stream<Object> readAvroRecordsAsStream(final IOSupplier<InputStream> inputStreamIOSupplier, - final DatumReader<GenericRecord> datumReader) { + final DatumReader<GenericRecord> datumReader, final long skipRecords) { InputStream inputStream; // NOPMD CloseResource: being closed in try resources iterator DataFileStream<GenericRecord> dataFileStream; // NOPMD CloseResource: being closed in try resources iterator try { @@ -76,13 +71,15 @@ private Stream<Object> readAvroRecordsAsStream(final IOSupplier<InputStream> inp dataFileStream = new DataFileStream<>(inputStream, datumReader); // Wrap DataFileStream in a Stream using a custom Spliterator for lazy processing - return StreamSupport.stream(new AvroRecordSpliterator<>(dataFileStream), false).onClose(() -> { - try { - dataFileStream.close(); // Ensure the reader is closed after streaming - } catch (IOException e) { - LOGGER.error("Error closing BufferedReader: {}", e.getMessage(), e); - } - }); + return StreamSupport.stream(new AvroRecordSpliterator<>(dataFileStream), false) + .skip(skipRecords) + .onClose(() -> { + try { + dataFileStream.close(); // Ensure the reader is closed after streaming + } catch (IOException e) { + LOGGER.error("Error closing BufferedReader: {}", e.getMessage(), e); + } + }); } catch (IOException e) { LOGGER.error("Error in DataFileStream: {}", e.getMessage(), e); return Stream.empty(); // Return an empty stream if initialization fails @@ -126,20 +123,4 @@ public int characteristics() { return Spliterator.ORDERED | Spliterator.NONNULL; } } - - List<Object> readAvroRecords(final InputStream content, final DatumReader<GenericRecord> datumReader) { - final List<Object> records = new ArrayList<>(); - try (SeekableByteArrayInput sin = new SeekableByteArrayInput(IOUtils.toByteArray(content))) { - try (DataFileReader<GenericRecord> reader = new DataFileReader<>(sin, datumReader)) { - reader.forEach(records::add); - } catch (IOException e) { - LOGGER.error("Failed to read records from DataFileReader for S3 object stream. Error: {}", - e.getMessage(), e); - } - } catch (IOException e) { - LOGGER.error("Failed to initialize SeekableByteArrayInput for S3 object stream. 
Error: {}", e.getMessage(), - e); - } - return records; - } } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java index 644a3f719..0c11770f0 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java @@ -40,15 +40,24 @@ public void configureValueConverter(final Map<String, String> config, final Abst @Override public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, - final int topicPartition, final AbstractConfig sourceConfig) { + final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { // Create a Stream that processes each chunk lazily return StreamSupport.stream(new Spliterators.AbstractSpliterator<>(Long.MAX_VALUE, Spliterator.ORDERED) { final byte[] buffer = new byte[4096]; + InputStream inputStream; + + { + try { + inputStream = inputStreamIOSupplier.get(); // Open the InputStream once + } catch (IOException e) { + LOGGER.error("Error closing stream: {}", e.getMessage(), e); + } + } @Override public boolean tryAdvance(final java.util.function.Consumer<? super Object> action) { - try (InputStream inputStream = inputStreamIOSupplier.get()) { + try { final int bytesRead = inputStream.read(buffer); if (bytesRead == -1) { return false; @@ -62,7 +71,13 @@ public boolean tryAdvance(final java.util.function.Consumer<? super Object> acti return false; } } - }, false); + }, false).onClose(() -> { + try { + inputStreamIOSupplier.get().close(); // Ensure the reader is closed after streaming + } catch (IOException e) { + LOGGER.error("Error closing BufferedReader: {}", e.getMessage(), e); + } + }); } @Override diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java index acaa6884a..3ef3a830f 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java @@ -51,8 +51,8 @@ public void configureValueConverter(final Map<String, String> config, final Abst @Override public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, - final int topicPartition, final AbstractConfig sourceConfig) { - return readJsonRecordsAsStream(inputStreamIOSupplier); + final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { + return readJsonRecordsAsStream(inputStreamIOSupplier, skipRecords); } @Override @@ -65,7 +65,8 @@ public byte[] getValueBytes(final Object record, final String topic, final Abstr } } - private Stream<Object> readJsonRecordsAsStream(final IOSupplier<InputStream> inputStreamIOSupplier) { + private Stream<Object> readJsonRecordsAsStream(final IOSupplier<InputStream> inputStreamIOSupplier, + final long skipRecords) { // Use a Stream that lazily processes each line as a JSON object CustomSpliterator customSpliteratorParam; try { @@ -80,7 +81,7 @@ private Stream<Object> readJsonRecordsAsStream(final IOSupplier<InputStream> inp } catch (IOException e) { LOGGER.error("Error closing BufferedReader: {}", e.getMessage(), e); } - }); + }).skip(skipRecords); } /* diff --git 
a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java index 24f44e1bf..9d6021a11 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java @@ -55,8 +55,8 @@ public void configureValueConverter(final Map<String, String> config, final Abst @Override public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, - final int topicPartition, final AbstractConfig sourceConfig) { - return getParquetStreamRecords(inputStreamIOSupplier, topic, topicPartition); + final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { + return getParquetStreamRecords(inputStreamIOSupplier, topic, topicPartition, skipRecords); } @Override @@ -66,7 +66,7 @@ public byte[] getValueBytes(final Object record, final String topic, final Abstr } private Stream<Object> getParquetStreamRecords(final IOSupplier<InputStream> inputStreamIOSupplier, - final String topic, final int topicPartition) { + final String topic, final int topicPartition, final long skipRecords) { final String timestamp = String.valueOf(Instant.now().toEpochMilli()); File parquetFile; @@ -105,7 +105,7 @@ public boolean tryAdvance(final java.util.function.Consumer<? super Object> acti return false; } } - }, false).onClose(() -> { + }, false).skip(skipRecords).onClose(() -> { try { parquetReader.close(); // Ensure reader is closed when the stream is closed } catch (IOException e) { diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java index 8867ed6d9..96cda5924 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java @@ -29,7 +29,7 @@ public interface Transformer { void configureValueConverter(Map<String, String> config, AbstractConfig sourceConfig); Stream<Object> getRecords(IOSupplier<InputStream> inputStreamIOSupplier, String topic, int topicPartition, - AbstractConfig sourceConfig); + AbstractConfig sourceConfig, long skipRecords); byte[] getValueBytes(Object record, String topic, AbstractConfig sourceConfig); } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java index a0dc3d5d9..b35d73c80 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java @@ -25,19 +25,20 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; import io.aiven.kafka.connect.common.config.SourceCommonConfig; import org.apache.avro.Schema; import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.DatumReader; 
import org.apache.avro.io.DatumWriter; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -73,46 +74,78 @@ void testConfigureValueConverter() { void testReadAvroRecordsInvalidData() { final InputStream inputStream = new ByteArrayInputStream("mock-avro-data".getBytes(StandardCharsets.UTF_8)); - final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); - final List<Object> records = avroTransformer.readAvroRecords(inputStream, datumReader); + final Stream<Object> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 0); - assertThat(records.size()).isEqualTo(0); + final List<Object> recs = records.collect(Collectors.toList()); + assertThat(recs).isEmpty(); } @Test void testReadAvroRecords() throws Exception { - final ByteArrayOutputStream avroData = generateMockAvroData(); + final ByteArrayOutputStream avroData = generateMockAvroData(25); final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); - final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); - final List<Object> records = avroTransformer.readAvroRecords(inputStream, datumReader); + final Stream<Object> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 0); - assertThat(records.size()).isEqualTo(2); + final List<Object> recs = records.collect(Collectors.toList()); + assertThat(recs).hasSize(25); } - ByteArrayOutputStream generateMockAvroData() throws IOException { + @Test + void testReadAvroRecordsSkipFew() throws Exception { + final ByteArrayOutputStream avroData = generateMockAvroData(20); + final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); + + final Stream<Object> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 5); + + final List<Object> recs = records.collect(Collectors.toList()); + assertThat(recs).hasSize(15); + // get first rec + assertThat(((GenericRecord) recs.get(0)).get("message").toString()) + .isEqualTo("Hello, Kafka Connect S3 Source! object 5"); + } + + @Test + void testReadAvroRecordsSkipMoreRecordsThanExist() throws Exception { + final ByteArrayOutputStream avroData = generateMockAvroData(20); + final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); + + final Stream<Object> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 25); + + final List<Object> recs = records.collect(Collectors.toList()); + assertThat(recs).hasSize(0); + } + + ByteArrayOutputStream generateMockAvroData(final int numRecs) throws IOException { final String schemaJson = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" + " \"fields\": [\n" + " {\"name\": \"message\", \"type\": \"string\"},\n" + " {\"name\": \"id\", \"type\": \"int\"}\n" + " ]\n" + "}"; final Schema.Parser parser = new Schema.Parser(); final Schema schema = parser.parse(schemaJson); - return getAvroRecord(schema, 2); + return getAvroRecords(schema, numRecs); } - private static ByteArrayOutputStream getAvroRecord(final Schema schema, final int messageId) throws IOException { + private static ByteArrayOutputStream getAvroRecords(final Schema schema, final int numOfRecs) throws IOException { // Create Avro records - final GenericRecord avroRecord = new GenericData.Record(schema); - avroRecord.put("message", "Hello, Kafka Connect S3 Source! 
object " + messageId); - avroRecord.put("id", messageId); + final List<GenericRecord> avroRecords = new ArrayList<>(); + for (int i = 0; i < numOfRecs; i++) { + final GenericRecord avroRecord = new GenericData.Record(schema); // NOPMD AvoidInstantiatingObjectsInLoops + avroRecord.put("message", "Hello, Kafka Connect S3 Source! object " + i); + avroRecord.put("id", i); + avroRecords.add(avroRecord); + } // Serialize Avro records to byte arrays final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema); + + // Append each record using a loop try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) { dataFileWriter.create(schema, outputStream); - dataFileWriter.append(avroRecord); // record 1 - dataFileWriter.append(avroRecord); // record 2 + for (final GenericRecord record : avroRecords) { + dataFileWriter.append(record); + } dataFileWriter.flush(); } outputStream.close(); diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java index 81aaf7b79..2122734bd 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java @@ -54,7 +54,7 @@ void testGetRecordsSingleChunk() { final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; final Stream<Object> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, - sourceCommonConfig); + sourceCommonConfig, 0); final List<Object> recs = records.collect(Collectors.toList()); assertThat(recs).hasSize(1); @@ -68,7 +68,7 @@ void testGetRecordsEmptyInputStream() { final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; final Stream<Object> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, - sourceCommonConfig); + sourceCommonConfig, 0); assertThat(records).hasSize(0); } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java index cdec0ace4..cf9a57527 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java @@ -26,7 +26,9 @@ import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import java.util.stream.Stream; import io.aiven.kafka.connect.common.config.SourceCommonConfig; @@ -72,11 +74,30 @@ void testHandleValueDataWithValidJson() { "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, - sourceCommonConfig); + sourceCommonConfig, 0); assertThat(jsonNodes).hasSize(1); } + @Test + void testHandleValueDataWithValidJsonSkipFew() { + final InputStream validJsonInputStream = new ByteArrayInputStream( + getJsonRecs(100).getBytes(StandardCharsets.UTF_8)); + final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; + final Stream<Object> jsonNodes = 
jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + sourceCommonConfig, 25L); + + final List<Object> recs = jsonNodes.collect(Collectors.toList()); + assertThat(recs).hasSize(75); + assertThat(recs).extracting(record -> ((JsonNode) record).get("key").asText()) + .doesNotContain("value1") + .doesNotContain("value2") + .doesNotContain("value25") + .contains("value26") + .contains("value27") + .contains("value100"); + } + @Test void testHandleValueDataWithInvalidJson() { final InputStream invalidJsonInputStream = new ByteArrayInputStream( @@ -84,7 +105,7 @@ void testHandleValueDataWithInvalidJson() { final IOSupplier<InputStream> inputStreamIOSupplier = () -> invalidJsonInputStream; final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, - sourceCommonConfig); + sourceCommonConfig, 0); assertThat(jsonNodes).isEmpty(); } @@ -95,7 +116,7 @@ void testSerializeJsonDataValid() throws IOException { "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, - sourceCommonConfig); + sourceCommonConfig, 0); final byte[] serializedData = jsonTransformer.getValueBytes(jsonNodes.findFirst().get(), TESTTOPIC, sourceCommonConfig); @@ -108,7 +129,7 @@ void testSerializeJsonDataValid() throws IOException { @Test void testGetRecordsWithIOException() throws IOException { when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException")); - final Stream<Object> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null); + final Stream<Object> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); assertThat(resultStream).isEmpty(); } @@ -126,8 +147,19 @@ void testCustomSpliteratorStreamProcessing() throws IOException { @Test void testCustomSpliteratorWithIOExceptionDuringInitialization() throws IOException { when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException during initialization")); - final Stream<Object> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null); + final Stream<Object> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); assertThat(resultStream).isEmpty(); } + + String getJsonRecs(final int recordCount) { + final StringBuilder jsonRecords = new StringBuilder(); + for (int i = 1; i <= recordCount; i++) { + jsonRecords.append(String.format("{\"key\":\"value%d\"}", i)); + if (i < recordCount) { + jsonRecords.append("\n"); // NOPMD AppendCharacterWithChar + } + } + return jsonRecords.toString(); + } } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java index e247adbc0..02b946917 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java @@ -63,7 +63,7 @@ void testHandleValueDataWithZeroBytes() { final String topic = "test-topic"; final int topicPartition = 0; final Stream<Object> recs = parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, - s3SourceConfig); + s3SourceConfig, 0L); assertThat(recs).isEmpty(); } @@ -79,15 +79,39 @@ void testGetRecordsWithValidData() throws Exception { 
final int topicPartition = 0; final List<Object> records = parquetTransformer - .getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig) + .getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig, 0L) .collect(Collectors.toList()); - assertThat(records).isNotEmpty(); + assertThat(records).hasSize(100); assertThat(records).extracting(record -> ((GenericRecord) record).get("name").toString()) .contains("name1") .contains("name2"); } + @Test + void testGetRecordsWithValidDataSkipFew() throws Exception { + final byte[] mockParquetData = generateMockParquetData(); + final InputStream inputStream = new ByteArrayInputStream(mockParquetData); + final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; + final SourceCommonConfig s3SourceConfig = mock(SourceCommonConfig.class); + + final String topic = "test-topic"; + final int topicPartition = 0; + + final List<Object> records = parquetTransformer + .getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig, 25L) + .collect(Collectors.toList()); + + assertThat(records).hasSize(75); + assertThat(records).extracting(record -> ((GenericRecord) record).get("name").toString()) + .doesNotContain("name1") + .doesNotContain("name2") + .doesNotContain("name24") + .contains("name25") + .contains("name26") + .contains("name99"); + } + @Test void testGetRecordsWithInvalidData() { final byte[] invalidData = "invalid data".getBytes(StandardCharsets.UTF_8); @@ -100,7 +124,7 @@ void testGetRecordsWithInvalidData() { final int topicPartition = 0; final Stream<Object> records = parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, - s3SourceConfig); + s3SourceConfig, 0L); assertThat(records).isEmpty(); } @@ -126,7 +150,7 @@ void testIOExceptionCreatingTempFile() { final IOSupplier<InputStream> inputStreamSupplier = mock(IOSupplier.class); final Stream<Object> resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", 1, - null); + null, 0L); assertThat(resultStream).isEmpty(); } @@ -139,7 +163,7 @@ void testIOExceptionDuringDataCopy() throws IOException { final IOSupplier<InputStream> inputStreamSupplier = () -> inputStreamMock; final Stream<Object> resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", 1, - null); + null, 0L); assertThat(resultStream).isEmpty(); } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index a49978f8d..1b52d8d83 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -85,18 +85,11 @@ public String getObjectMapKey(final String currentObjectKey) { return OBJECT_KEY + SEPARATOR + currentObjectKey; } - public boolean shouldSkipRecord(final Map<String, Object> partitionMap, final String currentObjectKey, - final long numOfProcessedRecs) { + public long recordsProcessedForObjectKey(final Map<String, Object> partitionMap, final String currentObjectKey) { if (offsets.containsKey(partitionMap)) { - final Map<String, Object> offsetVal = offsets.get(partitionMap); - final String objectMapKey = getObjectMapKey(currentObjectKey); - - if (offsetVal.containsKey(objectMapKey)) { - final long offsetValue = (long) offsetVal.get(objectMapKey); - return numOfProcessedRecs <= offsetValue; - } + return (long) 
offsets.get(partitionMap).getOrDefault(getObjectMapKey(currentObjectKey), 0L); } - return false; + return 0L; } public void createNewOffsetMap(final Map<String, Object> partitionMap, final String objectKey, diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 43ca7a717..6cab0d12f 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -27,6 +27,7 @@ import java.util.regex.Pattern; import java.util.stream.Stream; +import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; @@ -46,6 +47,7 @@ public final class SourceRecordIterator implements Iterator<S3SourceRecord> { public static final Pattern FILE_DEFAULT_PATTERN = Pattern.compile("(?<topicName>[^/]+?)-" + "(?<partitionId>\\d{5})-" + "(?<uniqueId>[a-zA-Z0-9]+)" + "\\.(?<fileExtension>[^.]+)$"); // topic-00001.txt + public static final long BYTES_TRANSFORMATION_NUM_OF_RECS = 1L; private String currentObjectKey; private Iterator<String> objectListIterator; @@ -128,32 +130,30 @@ private Iterator<S3SourceRecord> getObjectIterator(final S3Object s3Object, fina private final Iterator<S3SourceRecord> internalIterator = readNext().iterator(); private List<S3SourceRecord> readNext() { - final byte[] keyBytes = currentObjectKey.getBytes(StandardCharsets.UTF_8); + final List<S3SourceRecord> sourceRecords = new ArrayList<>(); - int numOfProcessedRecs = 1; - boolean checkOffsetMap = true; + final long numberOfRecsAlreadyProcessed = offsetManager.recordsProcessedForObjectKey(partitionMap, + currentObjectKey); + + // Optimizing without reading stream again. 
+ if (checkBytesTransformation(transformer, numberOfRecsAlreadyProcessed)) { + return sourceRecords; + } + + final byte[] keyBytes = currentObjectKey.getBytes(StandardCharsets.UTF_8); + try (Stream<Object> recordStream = transformer.getRecords(s3Object::getObjectContent, topic, - topicPartition, s3SourceConfig)) { + topicPartition, s3SourceConfig, numberOfRecsAlreadyProcessed)) { final Iterator<Object> recordIterator = recordStream.iterator(); while (recordIterator.hasNext()) { final Object record = recordIterator.next(); - // Check if the record should be skipped based on the offset - if (offsetManager.shouldSkipRecord(partitionMap, currentObjectKey, numOfProcessedRecs) - && checkOffsetMap) { - numOfProcessedRecs++; - continue; - } - final byte[] valueBytes = transformer.getValueBytes(record, topic, s3SourceConfig); - checkOffsetMap = false; sourceRecords.add(getSourceRecord(keyBytes, valueBytes, topic, topicPartition, offsetManager, startOffset, partitionMap)); - numOfProcessedRecs++; - // Break if we have reached the max records per poll if (sourceRecords.size() >= s3SourceConfig.getMaxPollRecords()) { break; @@ -164,6 +164,13 @@ private List<S3SourceRecord> readNext() { return sourceRecords; } + // For bytes transformation, read whole file as 1 record + private boolean checkBytesTransformation(final Transformer transformer, + final long numberOfRecsAlreadyProcessed) { + return transformer instanceof ByteArrayTransformer + && numberOfRecsAlreadyProcessed == BYTES_TRANSFORMATION_NUM_OF_RECS; + } + private S3SourceRecord getSourceRecord(final byte[] key, final byte[] value, final String topic, final int topicPartition, final OffsetManager offsetManager, final long startOffset, final Map<String, Object> partitionMap) { diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index 5e6d7928b..d73068bfd 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -16,11 +16,17 @@ package io.aiven.kafka.connect.s3.source.utils; +import static io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator.BYTES_TRANSFORMATION_NUM_OF_RECS; import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.anyMap; import static org.mockito.Mockito.any; import static org.mockito.Mockito.anyInt; +import static org.mockito.Mockito.anyLong; import static org.mockito.Mockito.anyString; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; import java.io.ByteArrayInputStream; @@ -28,6 +34,8 @@ import java.util.Collections; import java.util.stream.Stream; +import io.aiven.kafka.connect.common.source.input.AvroTransformer; +import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; @@ -64,7 +72,8 @@ void testIteratorProcessesS3Objects() throws Exception { when(mockSourceApiClient.getObject(anyString())).thenReturn(mockS3Object); when(mockS3Object.getObjectContent()).thenReturn(mockInputStream); - when(mockTransformer.getRecords(any(), anyString(), anyInt(), 
any())).thenReturn(Stream.of(new Object())); + when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) + .thenReturn(Stream.of(new Object())); final String outStr = "this is a test"; when(mockTransformer.getValueBytes(any(), anyString(), any())) @@ -88,4 +97,54 @@ void testIteratorProcessesS3Objects() throws Exception { assertThat(iterator.next()).isNotNull(); } } + + @Test + void testIteratorProcessesS3ObjectsForByteArrayTransformer() throws Exception { + + final String key = "topic-00001-abc123.txt"; + + // Mock S3Object and InputStream + try (S3Object mockS3Object = mock(S3Object.class); + S3ObjectInputStream mockInputStream = new S3ObjectInputStream(new ByteArrayInputStream(new byte[] {}), + null);) { + when(mockSourceApiClient.getObject(anyString())).thenReturn(mockS3Object); + when(mockS3Object.getObjectContent()).thenReturn(mockInputStream); + + // With ByteArrayTransformer + mockTransformer = mock(ByteArrayTransformer.class); + when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) + .thenReturn(Stream.of(new Object())); + + final String outStr = "this is a test"; + + when(mockTransformer.getValueBytes(any(), anyString(), any())) + .thenReturn(outStr.getBytes(StandardCharsets.UTF_8)); + + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + + when(mockSourceApiClient.getListOfObjectKeys(any())) + .thenReturn(Collections.singletonList(key).listIterator()); + when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) + .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); + + SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, + mockSourceApiClient); + assertThat(iterator.hasNext()).isTrue(); + iterator.next(); + verify(mockTransformer, never()).getRecords(any(), anyString(), anyInt(), any(), anyLong()); + + // With AvroTransformer + mockTransformer = mock(AvroTransformer.class); + when(mockSourceApiClient.getListOfObjectKeys(any())) + .thenReturn(Collections.singletonList(key).listIterator()); + when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) + .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); + + iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, mockSourceApiClient); + assertThat(iterator.hasNext()).isTrue(); + iterator.next(); + + verify(mockTransformer, times(1)).getRecords(any(), anyString(), anyInt(), any(), anyLong()); + } + } } From 00064c68541b774fa8b7ff298c04e789a005f330 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C2=A8Claude?= <¨claude.warren@aiven.io¨> Date: Fri, 13 Dec 2024 17:49:02 +0000 Subject: [PATCH 78/90] Changes to fix Transformer streaming --- .../common/source/input/AvroTransformer.java | 101 ++++------- .../source/input/ByteArrayTransformer.java | 63 ++++--- .../common/source/input/JsonTransformer.java | 123 ++++++-------- .../source/input/ParquetTransformer.java | 109 ++++++------ .../common/source/input/Transformer.java | 158 +++++++++++++++++- .../source/input/AvroTransformerTest.java | 14 +- .../input/ByteArrayTransformerTest.java | 4 +- .../source/input/JsonTransformerTest.java | 32 ++-- .../source/input/ParquetTransformerTest.java | 16 +- .../input/TransformerStreamingTest.java | 126 ++++++++++++++ 10 files changed, 479 insertions(+), 267 deletions(-) create mode 100644 commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java 
b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java index bdd52c4ed..770cb279c 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java @@ -22,10 +22,7 @@ import java.io.InputStream; import java.util.Collections; import java.util.Map; -import java.util.Spliterator; import java.util.function.Consumer; -import java.util.stream.Stream; -import java.util.stream.StreamSupport; import org.apache.kafka.common.config.AbstractConfig; @@ -37,7 +34,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class AvroTransformer implements Transformer { +public class AvroTransformer extends Transformer<GenericRecord> { private static final Logger LOGGER = LoggerFactory.getLogger(AvroTransformer.class); @@ -47,80 +44,42 @@ public void configureValueConverter(final Map<String, String> config, final Abst } @Override - public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, - final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { - final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); - return readAvroRecordsAsStream(inputStreamIOSupplier, datumReader, skipRecords); - } - - @Override - public byte[] getValueBytes(final Object record, final String topic, final AbstractConfig sourceConfig) { - return TransformationUtils.serializeAvroRecordToBytes(Collections.singletonList((GenericRecord) record), topic, - sourceConfig); - } - - private Stream<Object> readAvroRecordsAsStream(final IOSupplier<InputStream> inputStreamIOSupplier, - final DatumReader<GenericRecord> datumReader, final long skipRecords) { - InputStream inputStream; // NOPMD CloseResource: being closed in try resources iterator - DataFileStream<GenericRecord> dataFileStream; // NOPMD CloseResource: being closed in try resources iterator - try { - // Open input stream from S3 - inputStream = inputStreamIOSupplier.get(); - - // Ensure the DataFileStream is initialized correctly with the open stream - dataFileStream = new DataFileStream<>(inputStream, datumReader); - - // Wrap DataFileStream in a Stream using a custom Spliterator for lazy processing - return StreamSupport.stream(new AvroRecordSpliterator<>(dataFileStream), false) - .skip(skipRecords) - .onClose(() -> { - try { - dataFileStream.close(); // Ensure the reader is closed after streaming - } catch (IOException e) { - LOGGER.error("Error closing BufferedReader: {}", e.getMessage(), e); - } - }); - } catch (IOException e) { - LOGGER.error("Error in DataFileStream: {}", e.getMessage(), e); - return Stream.empty(); // Return an empty stream if initialization fails - } - } - - private static class AvroRecordSpliterator<T> implements Spliterator<T> { - private final DataFileStream<GenericRecord> dataFileStream; + public StreamSpliterator<GenericRecord> createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, + final String topic, final int topicPartition, final AbstractConfig sourceConfig) { + return new StreamSpliterator<GenericRecord>(LOGGER, inputStreamIOSupplier) { + private DataFileStream<GenericRecord> dataFileStream; + private final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); + + @Override + protected InputStream inputOpened(final InputStream input) throws IOException { + dataFileStream = new DataFileStream<>(input, datumReader); + return input; + } - public 
AvroRecordSpliterator(final DataFileStream<GenericRecord> dataFileStream) { - this.dataFileStream = dataFileStream; - } + @Override + public void doClose() { + if (dataFileStream != null) { + try { + dataFileStream.close(); + } catch (IOException e) { + LOGGER.error("Error closing reader: {}", e.getMessage(), e); + } + } + } - @Override - public boolean tryAdvance(final Consumer<? super T> action) { - try { + @Override + protected boolean doAdvance(final Consumer<? super GenericRecord> action) { if (dataFileStream.hasNext()) { - final GenericRecord record = dataFileStream.next(); - action.accept((T) record); + action.accept(dataFileStream.next()); return true; } - } catch (Exception e) { // NOPMD AvoidCatchingGenericException - LOGGER.error("Error while reading Avro record: {}", e.getMessage(), e); return false; } - return false; - } - - @Override - public Spliterator<T> trySplit() { - return null; // Can't split the data stream as DataFileStream is sequential - } - - @Override - public long estimateSize() { - return Long.MAX_VALUE; // We don't know the size upfront - } + }; + } - @Override - public int characteristics() { - return Spliterator.ORDERED | Spliterator.NONNULL; - } + @Override + public byte[] getValueBytes(final GenericRecord record, final String topic, final AbstractConfig sourceConfig) { + return TransformationUtils.serializeAvroRecordToBytes(Collections.singletonList(record), topic, sourceConfig); } } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java index 0c11770f0..d220f686f 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java @@ -18,70 +18,65 @@ import java.io.IOException; import java.io.InputStream; +import java.util.Arrays; import java.util.Map; -import java.util.Spliterator; -import java.util.Spliterators; -import java.util.stream.Stream; -import java.util.stream.StreamSupport; +import java.util.function.Consumer; import org.apache.kafka.common.config.AbstractConfig; +import org.apache.commons.io.IOUtils; import org.apache.commons.io.function.IOSupplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class ByteArrayTransformer implements Transformer { +public class ByteArrayTransformer extends Transformer<byte[]> { private static final Logger LOGGER = LoggerFactory.getLogger(ByteArrayTransformer.class); + private static final int MAX_BUFFER_SIZE = 4096; + @Override public void configureValueConverter(final Map<String, String> config, final AbstractConfig sourceConfig) { // For byte array transformations, ByteArrayConverter is the converter which is the default config. 
} @Override - public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, - final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { - - // Create a Stream that processes each chunk lazily - return StreamSupport.stream(new Spliterators.AbstractSpliterator<>(Long.MAX_VALUE, Spliterator.ORDERED) { - final byte[] buffer = new byte[4096]; - InputStream inputStream; + public StreamSpliterator<byte[]> createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, + final String topic, final int topicPartition, final AbstractConfig sourceConfig) { + return new StreamSpliterator<byte[]>(LOGGER, inputStreamIOSupplier) { + @Override + protected InputStream inputOpened(final InputStream input) { + return input; + } - { - try { - inputStream = inputStreamIOSupplier.get(); // Open the InputStream once - } catch (IOException e) { - LOGGER.error("Error closing stream: {}", e.getMessage(), e); - } + @Override + protected void doClose() { + // nothing to do. } @Override - public boolean tryAdvance(final java.util.function.Consumer<? super Object> action) { + protected boolean doAdvance(final Consumer<? super byte[]> action) { + final byte[] buffer = new byte[MAX_BUFFER_SIZE]; try { - final int bytesRead = inputStream.read(buffer); - if (bytesRead == -1) { + final int bytesRead = IOUtils.read(inputStream, buffer); + if (bytesRead == 0) { return false; } - final byte[] chunk = new byte[bytesRead]; - System.arraycopy(buffer, 0, chunk, 0, bytesRead); - action.accept(chunk); + if (bytesRead < MAX_BUFFER_SIZE) { + action.accept(Arrays.copyOf(buffer, bytesRead)); + } else { + action.accept(buffer); + } return true; } catch (IOException e) { - LOGGER.error("Error trying to advance byte stream: {}", e.getMessage(), e); + LOGGER.error("Error trying to advance inputStream: {}", e.getMessage(), e); return false; } } - }, false).onClose(() -> { - try { - inputStreamIOSupplier.get().close(); // Ensure the reader is closed after streaming - } catch (IOException e) { - LOGGER.error("Error closing BufferedReader: {}", e.getMessage(), e); - } - }); + }; } @Override - public byte[] getValueBytes(final Object record, final String topic, final AbstractConfig sourceConfig) { - return (byte[]) record; + public byte[] getValueBytes(final byte[] record, final String topic, final AbstractConfig sourceConfig) { + return record; } } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java index 3ef3a830f..9ba4e3678 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java @@ -24,10 +24,7 @@ import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.util.Map; -import java.util.Spliterator; -import java.util.Spliterators; -import java.util.stream.Stream; -import java.util.stream.StreamSupport; +import java.util.function.Consumer; import org.apache.kafka.common.config.AbstractConfig; @@ -35,10 +32,11 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.io.function.IOSupplier; +import org.codehaus.plexus.util.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class JsonTransformer implements Transformer { +public class JsonTransformer extends Transformer<JsonNode> { 
private static final Logger LOGGER = LoggerFactory.getLogger(JsonTransformer.class); @@ -50,78 +48,65 @@ public void configureValueConverter(final Map<String, String> config, final Abst } @Override - public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, - final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { - return readJsonRecordsAsStream(inputStreamIOSupplier, skipRecords); - } + public StreamSpliterator<JsonNode> createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, + final String topic, final int topicPartition, final AbstractConfig sourceConfig) { + final StreamSpliterator<JsonNode> spliterator = new StreamSpliterator<JsonNode>(LOGGER, inputStreamIOSupplier) { + BufferedReader reader; - @Override - public byte[] getValueBytes(final Object record, final String topic, final AbstractConfig sourceConfig) { - try { - return objectMapper.writeValueAsBytes(record); - } catch (JsonProcessingException e) { - LOGGER.error("Failed to serialize record to JSON bytes. Error: {}", e.getMessage(), e); - return new byte[0]; - } - } - - private Stream<Object> readJsonRecordsAsStream(final IOSupplier<InputStream> inputStreamIOSupplier, - final long skipRecords) { - // Use a Stream that lazily processes each line as a JSON object - CustomSpliterator customSpliteratorParam; - try { - customSpliteratorParam = new CustomSpliterator(inputStreamIOSupplier); - } catch (IOException e) { - LOGGER.error("Error creating Json transformer CustomSpliterator: {}", e.getMessage(), e); - return Stream.empty(); - } - return StreamSupport.stream(customSpliteratorParam, false).onClose(() -> { - try { - customSpliteratorParam.reader.close(); // Ensure the reader is closed after streaming - } catch (IOException e) { - LOGGER.error("Error closing BufferedReader: {}", e.getMessage(), e); + @Override + protected InputStream inputOpened(final InputStream input) throws IOException { + reader = new BufferedReader(new InputStreamReader(input, StandardCharsets.UTF_8)); + return input; } - }).skip(skipRecords); - } - - /* - * This CustomSpliterator class is created so that BufferedReader instantiation is not closed before the all the - * records from stream is closed. With this now, we have a onclose method declared in parent declaration. - */ - final class CustomSpliterator extends Spliterators.AbstractSpliterator<Object> { - BufferedReader reader; - String line; - CustomSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier) throws IOException { - super(Long.MAX_VALUE, Spliterator.ORDERED | Spliterator.NONNULL); - reader = new BufferedReader(new InputStreamReader(inputStreamIOSupplier.get(), StandardCharsets.UTF_8)); - } - @Override - public boolean tryAdvance(final java.util.function.Consumer<? super Object> action) { - try { - if (line == null) { - line = reader.readLine(); + @Override + public void doClose() { + if (reader != null) { + try { + reader.close(); + } catch (IOException e) { + LOGGER.error("Error closing reader: {}", e.getMessage(), e); + } } - while (line != null) { - line = line.trim(); - if (!line.isEmpty()) { - try { - final JsonNode jsonNode = objectMapper.readTree(line); // Parse the JSON - // line - action.accept(jsonNode); // Provide the parsed JSON node to the stream - } catch (IOException e) { - LOGGER.error("Error parsing JSON record: {}", e.getMessage(), e); + } + + @Override + public boolean doAdvance(final Consumer<? 
super JsonNode> action) { + String line = null; + try { + // remove blank and empty lines. + while (StringUtils.isBlank(line)) { + line = reader.readLine(); + if (line == null) { + // end of file + return false; } - line = null; // NOPMD - return true; } - line = reader.readLine(); + line = line.trim(); + try { + action.accept(objectMapper.readTree(line)); // Parse the JSON + } catch (IOException e) { + LOGGER.error("Error parsing JSON record: {}", e.getMessage(), e); + return false; + } + return true; + } catch (IOException e) { + LOGGER.error("Error reading input stream: {}", e.getMessage(), e); + return false; } - return false; // End of file - } catch (IOException e) { - LOGGER.error("Error reading S3 object stream: {}", e.getMessage(), e); - return false; } + }; + + return spliterator; + } + + @Override + public byte[] getValueBytes(final JsonNode record, final String topic, final AbstractConfig sourceConfig) { + try { + return objectMapper.writeValueAsBytes(record); + } catch (JsonProcessingException e) { + LOGGER.error("Failed to serialize record to JSON bytes. Error: {}", e.getMessage(), e); + return new byte[0]; } } } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java index 9d6021a11..1477b13f8 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java @@ -27,10 +27,7 @@ import java.time.Instant; import java.util.Collections; import java.util.Map; -import java.util.Spliterator; -import java.util.Spliterators; -import java.util.stream.Stream; -import java.util.stream.StreamSupport; +import java.util.function.Consumer; import org.apache.kafka.common.config.AbstractConfig; @@ -40,11 +37,11 @@ import org.apache.commons.compress.utils.IOUtils; import org.apache.commons.io.function.IOSupplier; import org.apache.parquet.avro.AvroParquetReader; -import org.apache.parquet.io.InputFile; +import org.apache.parquet.hadoop.ParquetReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class ParquetTransformer implements Transformer { +public class ParquetTransformer extends Transformer<GenericRecord> { private static final Logger LOGGER = LoggerFactory.getLogger(ParquetTransformer.class); @@ -54,70 +51,68 @@ public void configureValueConverter(final Map<String, String> config, final Abst } @Override - public Stream<Object> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, - final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { - return getParquetStreamRecords(inputStreamIOSupplier, topic, topicPartition, skipRecords); + public byte[] getValueBytes(final GenericRecord record, final String topic, final AbstractConfig sourceConfig) { + return TransformationUtils.serializeAvroRecordToBytes(Collections.singletonList(record), topic, sourceConfig); } @Override - public byte[] getValueBytes(final Object record, final String topic, final AbstractConfig sourceConfig) { - return TransformationUtils.serializeAvroRecordToBytes(Collections.singletonList((GenericRecord) record), topic, - sourceConfig); - } + public StreamSpliterator<GenericRecord> createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, + final String topic, final int topicPartition, final AbstractConfig sourceConfig) { - private Stream<Object> 
getParquetStreamRecords(final IOSupplier<InputStream> inputStreamIOSupplier, - final String topic, final int topicPartition, final long skipRecords) { - final String timestamp = String.valueOf(Instant.now().toEpochMilli()); - File parquetFile; - - try { - // Create a temporary file for the Parquet data - parquetFile = File.createTempFile(topic + "_" + topicPartition + "_" + timestamp, ".parquet"); - } catch (IOException e) { - LOGGER.error("Error creating temp file for Parquet data: {}", e.getMessage(), e); - return Stream.empty(); - } + final StreamSpliterator<GenericRecord> spliterator = new StreamSpliterator<GenericRecord>(LOGGER, + inputStreamIOSupplier) { - try (OutputStream outputStream = Files.newOutputStream(parquetFile.toPath()); - InputStream inputStream = inputStreamIOSupplier.get();) { - IOUtils.copy(inputStream, outputStream); // Copy input stream to temporary file + private ParquetReader<GenericRecord> reader; + private File parquetFile; - final InputFile inputFile = new LocalInputFile(parquetFile.toPath()); - final var parquetReader = AvroParquetReader.<GenericRecord>builder(inputFile).build(); + @Override + protected InputStream inputOpened(final InputStream input) throws IOException { + final String timestamp = String.valueOf(Instant.now().toEpochMilli()); + + try { + // Create a temporary file for the Parquet data + parquetFile = File.createTempFile(topic + "_" + topicPartition + "_" + timestamp, ".parquet"); + } catch (IOException e) { + LOGGER.error("Error creating temp file for Parquet data: {}", e.getMessage(), e); + throw e; + } - return StreamSupport.stream(new Spliterators.AbstractSpliterator<Object>(Long.MAX_VALUE, - Spliterator.ORDERED | Spliterator.NONNULL) { - @Override - public boolean tryAdvance(final java.util.function.Consumer<? super Object> action) { + try (OutputStream outputStream = Files.newOutputStream(parquetFile.toPath())) { + IOUtils.copy(input, outputStream); // Copy input stream to temporary file + } + reader = AvroParquetReader.<GenericRecord>builder(new LocalInputFile(parquetFile.toPath())).build(); + return input; + } + + @Override + protected void doClose() { + if (reader != null) { try { - final GenericRecord record = parquetReader.read(); - if (record != null) { - action.accept(record); // Pass record to the stream - return true; - } else { - parquetReader.close(); // Close reader at end of file - deleteTmpFile(parquetFile.toPath()); - return false; - } - } catch (IOException | RuntimeException e) { // NOPMD - LOGGER.error("Error reading Parquet record: {}", e.getMessage(), e); - deleteTmpFile(parquetFile.toPath()); - return false; + reader.close(); // Close reader at end of file + } catch (IOException e) { + logger.error("Error closing reader: {}", e.getMessage(), e); } } - }, false).skip(skipRecords).onClose(() -> { + if (parquetFile != null) { + deleteTmpFile(parquetFile.toPath()); + } + } + + @Override + protected boolean doAdvance(final Consumer<? 
super GenericRecord> action) { try { - parquetReader.close(); // Ensure reader is closed when the stream is closed + final GenericRecord record = reader.read(); + if (record != null) { + action.accept(record); // Pass record to the stream + return true; + } } catch (IOException e) { - LOGGER.error("Error closing Parquet reader: {}", e.getMessage(), e); + logger.error("Error reading record: {}", e.getMessage(), e); } - deleteTmpFile(parquetFile.toPath()); - }); - } catch (IOException | RuntimeException e) { // NOPMD - LOGGER.error("Error processing Parquet data: {}", e.getMessage(), e); - deleteTmpFile(parquetFile.toPath()); - return Stream.empty(); - } + return false; + } + }; + return spliterator; } static void deleteTmpFile(final Path parquetFile) { diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java index 96cda5924..26f46f5c6 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java @@ -16,20 +16,168 @@ package io.aiven.kafka.connect.common.source.input; +import java.io.IOException; import java.io.InputStream; import java.util.Map; +import java.util.Spliterator; +import java.util.function.Consumer; import java.util.stream.Stream; +import java.util.stream.StreamSupport; import org.apache.kafka.common.config.AbstractConfig; import org.apache.commons.io.function.IOSupplier; +import org.slf4j.Logger; -public interface Transformer { +public abstract class Transformer<T> { - void configureValueConverter(Map<String, String> config, AbstractConfig sourceConfig); + public abstract void configureValueConverter(Map<String, String> config, AbstractConfig sourceConfig); - Stream<Object> getRecords(IOSupplier<InputStream> inputStreamIOSupplier, String topic, int topicPartition, - AbstractConfig sourceConfig, long skipRecords); + public final Stream<T> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, + final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { - byte[] getValueBytes(Object record, String topic, AbstractConfig sourceConfig); + final StreamSpliterator<T> spliterator = createSpliterator(inputStreamIOSupplier, topic, topicPartition, + sourceConfig); + return StreamSupport.stream(spliterator, false).onClose(spliterator::close).skip(skipRecords); + } + + /** + * Creates the stream spliterator for this transformer. + * + * @param inputStreamIOSupplier + * the input stream supplier. + * @param topic + * the topic. + * @param topicPartition + * the partition. + * @param sourceConfig + * the source configuraiton. + * @return a StreamSpliterator instance. + */ + protected abstract StreamSpliterator<T> createSpliterator(IOSupplier<InputStream> inputStreamIOSupplier, + String topic, int topicPartition, AbstractConfig sourceConfig); + + public abstract byte[] getValueBytes(T record, String topic, AbstractConfig sourceConfig); + + /** + * A Spliterator that performs various checks on the opening/closing of the input stream. + * + * @param <T> + * the type of item created by this Spliterator. + */ + protected abstract static class StreamSpliterator<T> implements Spliterator<T> { + /** + * The input stream supplier. + */ + private final IOSupplier<InputStream> inputStreamIOSupplier; + /** + * The logger to be used by all instances of this class. This will be the Transformer logger. 
+     */
+    protected final Logger logger;
+    /**
+     * The input stream. Will be null until {@link #inputOpened} has completed. May be used for reading but should
+     * not be closed or otherwise made unreadable.
+     */
+    protected InputStream inputStream;
+
+    /**
+     * A flag indicating that the input stream has been closed.
+     */
+    private boolean closed;
+
+    /**
+     * Constructor.
+     *
+     * @param logger
+     *            The logger for this Spliterator to use.
+     * @param inputStreamIOSupplier
+     *            the InputStream supplier
+     */
+    protected StreamSpliterator(final Logger logger, final IOSupplier<InputStream> inputStreamIOSupplier) {
+        this.logger = logger;
+        this.inputStreamIOSupplier = inputStreamIOSupplier;
+    }
+
+    /**
+     * Attempt to read the next record. If there is no record to read or an error occurred, return false. If a record
+     * was created, call {@code action.accept()} with the record.
+     *
+     * @param action
+     *            the Consumer to call if a record is created.
+     * @return {@code true} if a record was processed, {@code false} otherwise.
+     */
+    abstract protected boolean doAdvance(Consumer<? super T> action);
+
+    /**
+     * Method to close additional inputs if needed.
+     */
+    abstract protected void doClose();
+
+    public final void close() {
+        doClose();
+        try {
+            if (inputStream != null) {
+                inputStream.close();
+                closed = true;
+            }
+        } catch (IOException e) {
+            logger.error("Error trying to close inputStream: {}", e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Allows modification of the input stream. Called immediately after the input stream is opened. Implementations
+     * may modify the type of input stream by wrapping it with a specific implementation, or may create Readers from
+     * the input stream. The modified input stream must be returned. If a Reader or similar class is created from the
+     * input stream, the input stream must be returned.
+     *
+     * @param input
+     *            the input stream that was just opened.
+     * @return the input stream or modified input stream.
+     * @throws IOException
+     *             on IO error.
+     */
+    abstract protected InputStream inputOpened(InputStream input) throws IOException;
+
+    @Override
+    public final boolean tryAdvance(final Consumer<? super T> action) {
+        boolean result = false;
+        if (closed) {
+            logger.error("Attempt to advance after closed");
+        }
+        try {
+            if (inputStream == null) {
+                try {
+                    inputStream = inputOpened(inputStreamIOSupplier.get());
+                } catch (IOException e) {
+                    logger.error("Error trying to open inputStream: {}", e.getMessage(), e);
+                    close();
+                    return false;
+                }
+            }
+            result = doAdvance(action);
+        } catch (RuntimeException e) { // NOPMD must catch runtime exception here.
+ logger.error("Error trying to advance data: {}", e.getMessage(), e); + } + if (!result) { + close(); + } + return result; + } + + @Override + public final Spliterator<T> trySplit() { // NOPMD returning null is reqruied by API + return null; + } + + @Override + public long estimateSize() { + return Long.MAX_VALUE; + } + + @Override + public int characteristics() { + return Spliterator.ORDERED | Spliterator.NONNULL; + } + } } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java index b35d73c80..67b028283 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java @@ -74,7 +74,8 @@ void testConfigureValueConverter() { void testReadAvroRecordsInvalidData() { final InputStream inputStream = new ByteArrayInputStream("mock-avro-data".getBytes(StandardCharsets.UTF_8)); - final Stream<Object> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 0); + final Stream<GenericRecord> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + 0); final List<Object> recs = records.collect(Collectors.toList()); assertThat(recs).isEmpty(); @@ -85,7 +86,8 @@ void testReadAvroRecords() throws Exception { final ByteArrayOutputStream avroData = generateMockAvroData(25); final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); - final Stream<Object> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 0); + final Stream<GenericRecord> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + 0); final List<Object> recs = records.collect(Collectors.toList()); assertThat(recs).hasSize(25); @@ -96,7 +98,8 @@ void testReadAvroRecordsSkipFew() throws Exception { final ByteArrayOutputStream avroData = generateMockAvroData(20); final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); - final Stream<Object> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 5); + final Stream<GenericRecord> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + 5); final List<Object> recs = records.collect(Collectors.toList()); assertThat(recs).hasSize(15); @@ -110,13 +113,14 @@ void testReadAvroRecordsSkipMoreRecordsThanExist() throws Exception { final ByteArrayOutputStream avroData = generateMockAvroData(20); final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); - final Stream<Object> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 25); + final Stream<GenericRecord> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + 25); final List<Object> recs = records.collect(Collectors.toList()); assertThat(recs).hasSize(0); } - ByteArrayOutputStream generateMockAvroData(final int numRecs) throws IOException { + static ByteArrayOutputStream generateMockAvroData(final int numRecs) throws IOException { final String schemaJson = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" + " \"fields\": [\n" + " {\"name\": \"message\", \"type\": \"string\"},\n" + " {\"name\": \"id\", \"type\": \"int\"}\n" + " ]\n" + "}"; diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java 
b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java index 2122734bd..d43db66bf 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java @@ -53,7 +53,7 @@ void testGetRecordsSingleChunk() { final InputStream inputStream = new ByteArrayInputStream(data); final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; - final Stream<Object> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, + final Stream<byte[]> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, sourceCommonConfig, 0); final List<Object> recs = records.collect(Collectors.toList()); @@ -67,7 +67,7 @@ void testGetRecordsEmptyInputStream() { final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; - final Stream<Object> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, + final Stream<byte[]> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, sourceCommonConfig, 0); assertThat(records).hasSize(0); diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java index cf9a57527..6b0c05dc5 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java @@ -73,7 +73,7 @@ void testHandleValueDataWithValidJson() { final InputStream validJsonInputStream = new ByteArrayInputStream( "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; - final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + final Stream<JsonNode> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, sourceCommonConfig, 0); assertThat(jsonNodes).hasSize(1); @@ -84,7 +84,7 @@ void testHandleValueDataWithValidJsonSkipFew() { final InputStream validJsonInputStream = new ByteArrayInputStream( getJsonRecs(100).getBytes(StandardCharsets.UTF_8)); final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; - final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + final Stream<JsonNode> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, sourceCommonConfig, 25L); final List<Object> recs = jsonNodes.collect(Collectors.toList()); @@ -104,7 +104,7 @@ void testHandleValueDataWithInvalidJson() { "invalid-json".getBytes(StandardCharsets.UTF_8)); final IOSupplier<InputStream> inputStreamIOSupplier = () -> invalidJsonInputStream; - final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + final Stream<JsonNode> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, sourceCommonConfig, 0); assertThat(jsonNodes).isEmpty(); @@ -115,7 +115,7 @@ void testSerializeJsonDataValid() throws IOException { final InputStream validJsonInputStream = new ByteArrayInputStream( "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; - final Stream<Object> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + 
final Stream<JsonNode> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, sourceCommonConfig, 0); final byte[] serializedData = jsonTransformer.getValueBytes(jsonNodes.findFirst().get(), TESTTOPIC, sourceCommonConfig); @@ -129,30 +129,30 @@ void testSerializeJsonDataValid() throws IOException { @Test void testGetRecordsWithIOException() throws IOException { when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException")); - final Stream<Object> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); + final Stream<JsonNode> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); assertThat(resultStream).isEmpty(); } - @Test - void testCustomSpliteratorStreamProcessing() throws IOException { - final String jsonContent = "{\"key\":\"value\"}\n{\"key2\":\"value2\"}"; - final InputStream inputStream = new ByteArrayInputStream(jsonContent.getBytes(StandardCharsets.UTF_8)); - final IOSupplier<InputStream> supplier = () -> inputStream; - - final JsonTransformer.CustomSpliterator spliterator = jsonTransformer.new CustomSpliterator(supplier); - assertThat(spliterator.tryAdvance(jsonNode -> assertThat(jsonNode).isNotNull())).isTrue(); - } + // @Test + // void testCustomSpliteratorStreamProcessing() throws IOException { + // final String jsonContent = "{\"key\":\"value\"}\n{\"key2\":\"value2\"}"; + // final InputStream inputStream = new ByteArrayInputStream(jsonContent.getBytes(StandardCharsets.UTF_8)); + // final IOSupplier<InputStream> supplier = () -> inputStream; + // + // final JsonTransformer.CustomSpliterator spliterator = jsonTransformer.new CustomSpliterator(supplier); + // assertThat(spliterator.tryAdvance(jsonNode -> assertThat(jsonNode).isNotNull())).isTrue(); + // } @Test void testCustomSpliteratorWithIOExceptionDuringInitialization() throws IOException { when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException during initialization")); - final Stream<Object> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); + final Stream<JsonNode> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); assertThat(resultStream).isEmpty(); } - String getJsonRecs(final int recordCount) { + static String getJsonRecs(final int recordCount) { final StringBuilder jsonRecords = new StringBuilder(); for (int i = 1; i <= recordCount; i++) { jsonRecords.append(String.format("{\"key\":\"value%d\"}", i)); diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java index 02b946917..cde0a11ac 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java @@ -62,7 +62,7 @@ void testHandleValueDataWithZeroBytes() { final String topic = "test-topic"; final int topicPartition = 0; - final Stream<Object> recs = parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, + final Stream<GenericRecord> recs = parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig, 0L); assertThat(recs).isEmpty(); @@ -123,8 +123,8 @@ void testGetRecordsWithInvalidData() { final String topic = "test-topic"; final int topicPartition = 0; - final Stream<Object> records = 
parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, - s3SourceConfig, 0L); + final Stream<GenericRecord> records = parquetTransformer.getRecords(inputStreamIOSupplier, topic, + topicPartition, s3SourceConfig, 0L); assertThat(records).isEmpty(); } @@ -137,7 +137,7 @@ void testTemporaryFileDeletion() throws Exception { assertThat(Files.exists(tempFile)).isFalse(); } - private byte[] generateMockParquetData() throws IOException { + static byte[] generateMockParquetData() throws IOException { final Path path = ContentUtils.getTmpFilePath("name"); return IOUtils.toByteArray(Files.newInputStream(path)); } @@ -149,8 +149,8 @@ void testIOExceptionCreatingTempFile() { .thenThrow(new IOException("Test IOException for temp file")); final IOSupplier<InputStream> inputStreamSupplier = mock(IOSupplier.class); - final Stream<Object> resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", 1, - null, 0L); + final Stream<GenericRecord> resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", + 1, null, 0L); assertThat(resultStream).isEmpty(); } @@ -162,8 +162,8 @@ void testIOExceptionDuringDataCopy() throws IOException { when(inputStreamMock.read(any(byte[].class))).thenThrow(new IOException("Test IOException during copy")); final IOSupplier<InputStream> inputStreamSupplier = () -> inputStreamMock; - final Stream<Object> resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", 1, - null, 0L); + final Stream<GenericRecord> resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", + 1, null, 0L); assertThat(resultStream).isEmpty(); } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java new file mode 100644 index 000000000..9caa098f1 --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java @@ -0,0 +1,126 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.source.input; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Stream; + +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.common.config.ConfigDef; + +import io.aiven.kafka.connect.common.config.CommonConfig; + +import org.apache.commons.io.function.IOSupplier; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +/** + * Abstract test class to verify that streaming data is closed properly. + */ +class TransformerStreamingTest { + + @ParameterizedTest + @MethodSource("testData") + void verifyExceptionDuringIOOpen(final Transformer<?> transformer, final byte[] testData, + final AbstractConfig config, final int expectedCount) throws IOException { + final IOSupplier<InputStream> ioSupplier = mock(IOSupplier.class); + when(ioSupplier.get()).thenThrow(new IOException("Test IOException during initialization")); + final Stream<?> objStream = transformer.getRecords(ioSupplier, "topic", 1, config, 0); + assertThat(objStream).isEmpty(); + } + + @ParameterizedTest + @MethodSource("testData") + void verifyCloseCalledAtEnd(final Transformer<?> transformer, final byte[] testData, final AbstractConfig config, + final int expectedCount) throws IOException { + final CloseTrackingStream stream = new CloseTrackingStream(new ByteArrayInputStream(testData)); + final Stream<?> objStream = transformer.getRecords(() -> stream, "topic", 1, config, 0); + final long count = objStream.count(); + assertThat(count).isEqualTo(expectedCount); + assertThat(stream.closeCount).isGreaterThan(0); + } + + @ParameterizedTest + @MethodSource("testData") + void verifyCloseCalledAtIteratorEnd(final Transformer<?> transformer, final byte[] testData, + final AbstractConfig config, final int expectedCount) throws IOException { + final CloseTrackingStream stream = new CloseTrackingStream(new ByteArrayInputStream(testData)); + final Stream<?> objStream = transformer.getRecords(() -> stream, "topic", 1, config, 0); + final Iterator<?> iter = objStream.iterator(); + long count = 0L; + while (iter.hasNext()) { + count += 1; + iter.next(); + } + assertThat(count).isEqualTo(expectedCount); + assertThat(stream.closeCount).isGreaterThan(0); + } + + static Stream<Arguments> testData() throws IOException { + final List<Arguments> lst = new ArrayList<>(); + lst.add(Arguments.of(new AvroTransformer(), AvroTransformerTest.generateMockAvroData(100).toByteArray(), + new CommonConfig(new ConfigDef(), new HashMap<>()) { + }, 100)); + lst.add(Arguments.of(new ByteArrayTransformer(), "Hello World".getBytes(StandardCharsets.UTF_8), + new CommonConfig(new ConfigDef(), new HashMap<>()) { + }, 1)); + lst.add(Arguments.of(new JsonTransformer(), + JsonTransformerTest.getJsonRecs(100).getBytes(StandardCharsets.UTF_8), + new CommonConfig(new ConfigDef(), new HashMap<>()) { + }, 100)); + lst.add(Arguments.of(new ParquetTransformer(), ParquetTransformerTest.generateMockParquetData(), + new CommonConfig(new ConfigDef(), new HashMap<>()) { + }, 100)); + return lst.stream(); + } + + private static class CloseTrackingStream 
extends InputStream { + InputStream delegate; + int closeCount; + + CloseTrackingStream(final InputStream stream) { + super(); + this.delegate = stream; + } + + @Override + public int read() throws IOException { + if (closeCount > 0) { + throw new IOException("ERROR Read after close"); + } + return delegate.read(); + } + + @Override + public void close() throws IOException { + closeCount++; + delegate.close(); + } + } +} From eb519c354e3566840e8c8e1b3038b00232d21ebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C2=A8Claude?= <¨claude.warren@aiven.io¨> Date: Mon, 16 Dec 2024 08:33:46 +0000 Subject: [PATCH 79/90] removed unneeded test --- .../common/source/input/JsonTransformerTest.java | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java index 6b0c05dc5..73a659575 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java @@ -134,16 +134,6 @@ void testGetRecordsWithIOException() throws IOException { assertThat(resultStream).isEmpty(); } - // @Test - // void testCustomSpliteratorStreamProcessing() throws IOException { - // final String jsonContent = "{\"key\":\"value\"}\n{\"key2\":\"value2\"}"; - // final InputStream inputStream = new ByteArrayInputStream(jsonContent.getBytes(StandardCharsets.UTF_8)); - // final IOSupplier<InputStream> supplier = () -> inputStream; - // - // final JsonTransformer.CustomSpliterator spliterator = jsonTransformer.new CustomSpliterator(supplier); - // assertThat(spliterator.tryAdvance(jsonNode -> assertThat(jsonNode).isNotNull())).isTrue(); - // } - @Test void testCustomSpliteratorWithIOExceptionDuringInitialization() throws IOException { when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException during initialization")); From 24df6aba071460254c10a9eb884f2ba266369885 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aindri=C3=BA=20Lavelle?= <121855584+aindriu-aiven@users.noreply.github.com> Date: Mon, 16 Dec 2024 08:43:33 +0000 Subject: [PATCH 80/90] Configure s3 api to use AWS Prefix (#370) * This update means we can now use the PREFIX in the AWS API allowing users to configure it to be more specific about what they want processed by the connector. --------- Signed-off-by: Aindriu Lavelle <aindriu.lavelle@aiven.io> --- .../connect/config/s3/S3ConfigFragment.java | 8 +-- .../connect/s3/source/IntegrationTest.java | 11 ++- .../s3/source/utils/AWSV2SourceClient.java | 9 ++- .../source/utils/AWSV2SourceClientTest.java | 72 +++++++++++++++++++ 4 files changed, 91 insertions(+), 9 deletions(-) diff --git a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java b/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java index 1e86638e1..1e87265b9 100644 --- a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java +++ b/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java @@ -216,6 +216,10 @@ static void addAwsConfigGroup(final ConfigDef configDef) { ConfigDef.Importance.MEDIUM, "AWS S3 Region, e.g. 
us-east-1", GROUP_AWS, awsGroupCounter++, ConfigDef.Width.NONE, AWS_S3_REGION_CONFIG); + configDef.define(AWS_S3_PREFIX_CONFIG, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "Prefix for stored objects, e.g. cluster-1/", GROUP_AWS, awsGroupCounter++, + ConfigDef.Width.NONE, AWS_S3_PREFIX_CONFIG); + configDef.define(FETCH_PAGE_SIZE, ConfigDef.Type.INT, 10, ConfigDef.Range.atLeast(1), ConfigDef.Importance.MEDIUM, "AWS S3 Fetch page size", GROUP_AWS, awsGroupCounter++, // NOPMD // UnusedAssignment @@ -252,10 +256,6 @@ static void addAwsStsConfigGroup(final ConfigDef configDef) { } static void addDeprecatedConfiguration(final ConfigDef configDef) { - configDef.define(AWS_S3_PREFIX_CONFIG, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, - "[Deprecated] Use `file.name.template` instead. Prefix for stored objects, e.g. cluster-1/", GROUP_AWS, - 0, ConfigDef.Width.NONE, AWS_S3_PREFIX_CONFIG); configDef.define(AWS_ACCESS_KEY_ID, ConfigDef.Type.PASSWORD, null, new NonEmptyPassword() { @Override diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 3cd72f290..7f96842f3 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -77,6 +77,7 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; +import org.junit.platform.commons.util.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testcontainers.containers.localstack.LocalStackContainer; @@ -253,7 +254,8 @@ void parquetTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); final String partition = "00000"; - final String fileName = topicName + "-" + partition + "-" + System.currentTimeMillis() + ".txt"; + final String fileName = addPrefixOrDefault("") + topicName + "-" + partition + "-" + System.currentTimeMillis() + + ".txt"; final String name = "testuser"; final Map<String, String> connectorConfig = getAvroConfig(topicName, InputFormat.PARQUET); @@ -337,13 +339,18 @@ private static byte[] generateNextAvroMessagesStartingFromId(final int messageId } private static String writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) { - final String objectKey = topicName + "-" + partitionId + "-" + System.currentTimeMillis() + ".txt"; + final String objectKey = addPrefixOrDefault("") + topicName + "-" + partitionId + "-" + + System.currentTimeMillis() + ".txt"; final PutObjectRequest request = new PutObjectRequest(TEST_BUCKET_NAME, objectKey, new ByteArrayInputStream(testDataBytes), new ObjectMetadata()); s3Client.putObject(request); return OBJECT_KEY + SEPARATOR + objectKey; } + private static String addPrefixOrDefault(final String defaultValue) { + return StringUtils.isNotBlank(s3Prefix) ? 
s3Prefix : defaultValue; + } + private Map<String, String> getConfig(final String connectorName, final String topics, final int maxTasks) { final Map<String, String> config = new HashMap<>(basicS3ConnectorConfig()); config.put("name", connectorName); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java index 1689ec9fa..1bbc477ee 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java @@ -85,14 +85,17 @@ public Iterator<String> getListOfObjectKeys(final String startToken) { if (StringUtils.isNotBlank(startToken)) { request.withStartAfter(startToken); } + // Prefix is optional so only use if supplied + if (StringUtils.isNotBlank(s3SourceConfig.getAwsS3Prefix())) { + request.withPrefix(s3SourceConfig.getAwsS3Prefix()); + } final Stream<String> s3ObjectKeyStream = Stream .iterate(s3Client.listObjectsV2(request), Objects::nonNull, response -> { // This is called every time next() is called on the iterator. if (response.isTruncated()) { - return s3Client.listObjectsV2(new ListObjectsV2Request().withBucketName(bucketName) - .withMaxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR) - .withContinuationToken(response.getNextContinuationToken())); + return s3Client.listObjectsV2( + new ListObjectsV2Request().withContinuationToken(response.getNextContinuationToken())); } else { return null; } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java index 5b5176690..a8174a15c 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java @@ -17,6 +17,7 @@ package io.aiven.kafka.connect.s3.source.utils; import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_PREFIX_CONFIG; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; @@ -40,6 +41,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.CsvSource; +import org.mockito.ArgumentCaptor; +import org.mockito.Captor; class AWSV2SourceClientTest { @@ -47,6 +50,9 @@ class AWSV2SourceClientTest { private AWSV2SourceClient awsv2SourceClient; + @Captor + ArgumentCaptor<ListObjectsV2Request> requestCaptor; + private static Map<String, String> getConfigMap(final int maxTasks, final int taskId) { final Map<String, String> configMap = new HashMap<>(); configMap.put("tasks.max", String.valueOf(maxTasks)); @@ -131,6 +137,72 @@ void testFetchObjectSummariesWithPagination() throws IOException { assertThat(summaries).isExhausted(); } + @Test + void testFetchObjectWithPrefix() { + final Map<String, String> configMap = getConfigMap(1, 0); + configMap.put(AWS_S3_PREFIX_CONFIG, "test/"); + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configMap); + s3Client = mock(AmazonS3.class); + awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig, 
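/*
 * Illustrative sketch of a source connector configuration that uses the new prefix support. The property names
 * below ("aws.s3.bucket.name", "aws.s3.prefix") are assumed from the AWS_S3_BUCKET_NAME_CONFIG and
 * AWS_S3_PREFIX_CONFIG constants used in the tests above, and the values are examples only.
 */
package io.aiven.kafka.connect.s3.source;

import java.util.HashMap;
import java.util.Map;

public final class PrefixConfigSketch {

    private PrefixConfigSketch() {
        // holder for the example configuration
    }

    public static Map<String, String> prefixedConnectorConfig() {
        final Map<String, String> config = new HashMap<>();
        config.put("aws.s3.bucket.name", "my-bucket");
        // Only object keys under this prefix are listed; the prefix is applied to the first
        // ListObjectsV2Request, while later pages rely on the continuation token instead.
        config.put("aws.s3.prefix", "cluster-1/topic-a/");
        return config;
    }
}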
Collections.emptySet()); + requestCaptor = ArgumentCaptor.forClass(ListObjectsV2Request.class); + final S3ObjectSummary object1 = createObjectSummary(1, "key1"); + final S3ObjectSummary object2 = createObjectSummary(1, "key2"); + + final ListObjectsV2Result firstResult = createListObjectsV2Result(List.of(object1), "nextToken"); + final ListObjectsV2Result secondResult = createListObjectsV2Result(List.of(object2), null); + + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstResult).thenReturn(secondResult); + + final Iterator<String> summaries = awsv2SourceClient.getListOfObjectKeys(null); + verify(s3Client, times(1)).listObjectsV2(any(ListObjectsV2Request.class)); + + assertThat(summaries.next()).isNotNull(); + assertThat(summaries.next()).isNotNull(); + + verify(s3Client, times(2)).listObjectsV2(requestCaptor.capture()); + final List<ListObjectsV2Request> allRequests = requestCaptor.getAllValues(); + assertThat(summaries).isExhausted(); + + assertThat(allRequests.get(0).getPrefix()).isEqualTo(s3SourceConfig.getAwsS3Prefix()); + // Not required with continuation token + assertThat(allRequests.get(1).getPrefix()).isNull(); + assertThat(allRequests.get(1).getContinuationToken()).isEqualTo("nextToken"); + + } + + @Test + void testFetchObjectWithInitialStartAfter() { + final Map<String, String> configMap = getConfigMap(1, 0); + final String startAfter = "file-option-1-12000.txt"; + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configMap); + s3Client = mock(AmazonS3.class); + awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig, Collections.emptySet()); + requestCaptor = ArgumentCaptor.forClass(ListObjectsV2Request.class); + final S3ObjectSummary object1 = createObjectSummary(1, "key1"); + final S3ObjectSummary object2 = createObjectSummary(1, "key2"); + + final ListObjectsV2Result firstResult = createListObjectsV2Result(List.of(object1), "nextToken"); + final ListObjectsV2Result secondResult = createListObjectsV2Result(List.of(object2), null); + + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstResult).thenReturn(secondResult); + + final Iterator<String> summaries = awsv2SourceClient.getListOfObjectKeys(startAfter); + verify(s3Client, times(1)).listObjectsV2(any(ListObjectsV2Request.class)); + + assertThat(summaries.next()).isNotNull(); + assertThat(summaries.next()).isNotNull(); + + verify(s3Client, times(2)).listObjectsV2(requestCaptor.capture()); + final List<ListObjectsV2Request> allRequests = requestCaptor.getAllValues(); + assertThat(summaries).isExhausted(); + + assertThat(allRequests.get(0).getStartAfter()).isEqualTo(startAfter); + // Not required with continuation token + assertThat(allRequests.get(1).getStartAfter()).isNull(); + assertThat(allRequests.get(1).getContinuationToken()).isEqualTo("nextToken"); + + } + private ListObjectsV2Result createListObjectsV2Result(final List<S3ObjectSummary> summaries, final String nextToken) { final ListObjectsV2Result result = mock(ListObjectsV2Result.class); From 7ef4e318f52ce841fca18c69902911a8f01ba628 Mon Sep 17 00:00:00 2001 From: Murali Basani <muralidhar.basani@aiven.io> Date: Tue, 17 Dec 2024 11:30:16 +0100 Subject: [PATCH 81/90] Remove converters instantiation [KCON-25] (#368) KCON-25 Addresses https://github.com/Aiven-Open/cloud-storage-connectors-for-apache-kafka/pull/316#discussion_r1807110452 - Removed the converters instantiation - For avro using AvroData utils to create SchemaAndValue - For json, as there are no utils, relying on json converter - 
Deleted the transformation of data (serialization, toConnectData) in transformers With this change, redundant transformation is removed, making it flexible for consumers --- .../common/source/input/AvroTransformer.java | 26 ++++++-- .../source/input/ByteArrayTransformer.java | 12 +++- .../common/source/input/JsonTransformer.java | 44 ++++++------- .../source/input/ParquetTransformer.java | 25 ++++++-- .../source/input/TransformationUtils.java | 64 ------------------- .../common/source/input/Transformer.java | 5 +- .../source/input/TransformerFactory.java | 24 ++++++- .../source/input/AvroTransformerTest.java | 3 +- .../input/ByteArrayTransformerTest.java | 3 +- .../source/input/JsonTransformerTest.java | 61 ++++++++++-------- .../source/input/ParquetTransformerTest.java | 3 +- .../input/TransformerStreamingTest.java | 9 ++- .../kafka/connect/s3/source/S3SourceTask.java | 36 +---------- .../s3/source/utils/RecordProcessor.java | 28 ++------ .../s3/source/utils/S3SourceRecord.java | 32 +++------- .../s3/source/utils/SourceRecordIterator.java | 21 +++--- .../connect/s3/source/S3SourceTaskTest.java | 16 ++--- .../s3/source/utils/RecordProcessorTest.java | 54 ++++------------ .../utils/SourceRecordIteratorTest.java | 10 --- 19 files changed, 190 insertions(+), 286 deletions(-) delete mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformationUtils.java diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java index 770cb279c..de770cbc2 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java @@ -20,12 +20,15 @@ import java.io.IOException; import java.io.InputStream; -import java.util.Collections; +import java.nio.charset.StandardCharsets; import java.util.Map; import java.util.function.Consumer; import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaAndValue; +import io.confluent.connect.avro.AvroData; import org.apache.avro.file.DataFileStream; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericRecord; @@ -36,8 +39,15 @@ public class AvroTransformer extends Transformer<GenericRecord> { + private final AvroData avroData; + private static final Logger LOGGER = LoggerFactory.getLogger(AvroTransformer.class); + AvroTransformer(final AvroData avroData) { + super(); + this.avroData = avroData; + } + @Override public void configureValueConverter(final Map<String, String> config, final AbstractConfig sourceConfig) { config.put(SCHEMA_REGISTRY_URL, sourceConfig.getString(SCHEMA_REGISTRY_URL)); @@ -46,7 +56,7 @@ public void configureValueConverter(final Map<String, String> config, final Abst @Override public StreamSpliterator<GenericRecord> createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, final int topicPartition, final AbstractConfig sourceConfig) { - return new StreamSpliterator<GenericRecord>(LOGGER, inputStreamIOSupplier) { + return new StreamSpliterator<>(LOGGER, inputStreamIOSupplier) { private DataFileStream<GenericRecord> dataFileStream; private final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); @@ -79,7 +89,15 @@ protected boolean doAdvance(final Consumer<? 
super GenericRecord> action) { } @Override - public byte[] getValueBytes(final GenericRecord record, final String topic, final AbstractConfig sourceConfig) { - return TransformationUtils.serializeAvroRecordToBytes(Collections.singletonList(record), topic, sourceConfig); + public SchemaAndValue getValueData(final GenericRecord record, final String topic, + final AbstractConfig sourceConfig) { + return avroData.toConnectData(record.getSchema(), record); + } + + @Override + public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, + final AbstractConfig sourceConfig) { + return new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, + ((String) cloudStorageKey).getBytes(StandardCharsets.UTF_8)); } } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java index d220f686f..f571062d9 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java @@ -18,11 +18,13 @@ import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.Map; import java.util.function.Consumer; import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.commons.io.IOUtils; import org.apache.commons.io.function.IOSupplier; @@ -76,7 +78,13 @@ protected boolean doAdvance(final Consumer<? super byte[]> action) { } @Override - public byte[] getValueBytes(final byte[] record, final String topic, final AbstractConfig sourceConfig) { - return record; + public SchemaAndValue getValueData(final byte[] record, final String topic, final AbstractConfig sourceConfig) { + return new SchemaAndValue(null, record); + } + + @Override + public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, + final AbstractConfig sourceConfig) { + return new SchemaAndValue(null, ((String) cloudStorageKey).getBytes(StandardCharsets.UTF_8)); } } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java index 9ba4e3678..4ff0f1a24 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java @@ -16,8 +16,6 @@ package io.aiven.kafka.connect.common.source.input; -import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMAS_ENABLE; - import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; @@ -27,30 +25,36 @@ import java.util.function.Consumer; import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.json.JsonConverter; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.io.function.IOSupplier; import org.codehaus.plexus.util.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class JsonTransformer extends Transformer<JsonNode> { +public class JsonTransformer extends Transformer<byte[]> { + + private final JsonConverter jsonConverter; private static final Logger 
LOGGER = LoggerFactory.getLogger(JsonTransformer.class); final ObjectMapper objectMapper = new ObjectMapper(); + JsonTransformer(final JsonConverter jsonConverter) { + super(); + this.jsonConverter = jsonConverter; + } + @Override public void configureValueConverter(final Map<String, String> config, final AbstractConfig sourceConfig) { - config.put(SCHEMAS_ENABLE, "false"); } @Override - public StreamSpliterator<JsonNode> createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, + public StreamSpliterator<byte[]> createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, final int topicPartition, final AbstractConfig sourceConfig) { - final StreamSpliterator<JsonNode> spliterator = new StreamSpliterator<JsonNode>(LOGGER, inputStreamIOSupplier) { + final StreamSpliterator<byte[]> spliterator = new StreamSpliterator<>(LOGGER, inputStreamIOSupplier) { BufferedReader reader; @Override @@ -71,7 +75,7 @@ public void doClose() { } @Override - public boolean doAdvance(final Consumer<? super JsonNode> action) { + public boolean doAdvance(final Consumer<? super byte[]> action) { String line = null; try { // remove blank and empty lines. @@ -83,12 +87,7 @@ public boolean doAdvance(final Consumer<? super JsonNode> action) { } } line = line.trim(); - try { - action.accept(objectMapper.readTree(line)); // Parse the JSON - } catch (IOException e) { - LOGGER.error("Error parsing JSON record: {}", e.getMessage(), e); - return false; - } + action.accept(line.getBytes(StandardCharsets.UTF_8)); return true; } catch (IOException e) { LOGGER.error("Error reading input stream: {}", e.getMessage(), e); @@ -101,12 +100,13 @@ public boolean doAdvance(final Consumer<? super JsonNode> action) { } @Override - public byte[] getValueBytes(final JsonNode record, final String topic, final AbstractConfig sourceConfig) { - try { - return objectMapper.writeValueAsBytes(record); - } catch (JsonProcessingException e) { - LOGGER.error("Failed to serialize record to JSON bytes. 
Error: {}", e.getMessage(), e); - return new byte[0]; - } + public SchemaAndValue getValueData(final byte[] record, final String topic, final AbstractConfig sourceConfig) { + return jsonConverter.toConnectData(topic, record); + } + + @Override + public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, + final AbstractConfig sourceConfig) { + return new SchemaAndValue(null, ((String) cloudStorageKey).getBytes(StandardCharsets.UTF_8)); } } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java index 1477b13f8..7da61c412 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java @@ -22,17 +22,19 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.time.Instant; -import java.util.Collections; import java.util.Map; import java.util.function.Consumer; import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.connect.data.SchemaAndValue; import io.aiven.kafka.connect.common.source.input.parquet.LocalInputFile; +import io.confluent.connect.avro.AvroData; import org.apache.avro.generic.GenericRecord; import org.apache.commons.compress.utils.IOUtils; import org.apache.commons.io.function.IOSupplier; @@ -43,24 +45,37 @@ public class ParquetTransformer extends Transformer<GenericRecord> { + private final AvroData avroData; + private static final Logger LOGGER = LoggerFactory.getLogger(ParquetTransformer.class); + ParquetTransformer(final AvroData avroData) { + super(); + this.avroData = avroData; + } + @Override public void configureValueConverter(final Map<String, String> config, final AbstractConfig sourceConfig) { config.put(SCHEMA_REGISTRY_URL, sourceConfig.getString(SCHEMA_REGISTRY_URL)); } @Override - public byte[] getValueBytes(final GenericRecord record, final String topic, final AbstractConfig sourceConfig) { - return TransformationUtils.serializeAvroRecordToBytes(Collections.singletonList(record), topic, sourceConfig); + public SchemaAndValue getValueData(final GenericRecord record, final String topic, + final AbstractConfig sourceConfig) { + return avroData.toConnectData(record.getSchema(), record); + } + + @Override + public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, + final AbstractConfig sourceConfig) { + return new SchemaAndValue(null, ((String) cloudStorageKey).getBytes(StandardCharsets.UTF_8)); } @Override public StreamSpliterator<GenericRecord> createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, final int topicPartition, final AbstractConfig sourceConfig) { - final StreamSpliterator<GenericRecord> spliterator = new StreamSpliterator<GenericRecord>(LOGGER, - inputStreamIOSupplier) { + final StreamSpliterator<GenericRecord> spliterator = new StreamSpliterator<>(LOGGER, inputStreamIOSupplier) { private ParquetReader<GenericRecord> reader; private File parquetFile; diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformationUtils.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformationUtils.java deleted file mode 100644 index 9f81d4406..000000000 --- 
a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformationUtils.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.common.source.input; - -import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.lang.reflect.InvocationTargetException; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -import org.apache.kafka.common.config.AbstractConfig; - -import io.aiven.kafka.connect.common.config.SchemaRegistryFragment; - -import io.confluent.kafka.serializers.KafkaAvroSerializer; -import org.apache.avro.generic.GenericRecord; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -final public class TransformationUtils { - private static final Logger LOGGER = LoggerFactory.getLogger(TransformationUtils.class); - - private TransformationUtils() { - // hidden - } - - public static byte[] serializeAvroRecordToBytes(final List<GenericRecord> avroRecords, final String topic, - final AbstractConfig sourceConfig) { - final SchemaRegistryFragment registryFragment = new SchemaRegistryFragment(sourceConfig); - final Map<String, String> config = Collections.singletonMap(SCHEMA_REGISTRY_URL, - registryFragment.getSchemaRegistryUrl()); - - try (KafkaAvroSerializer avroSerializer = (KafkaAvroSerializer) registryFragment.getAvroValueSerializer() - .getDeclaredConstructor() - .newInstance(); ByteArrayOutputStream out = new ByteArrayOutputStream()) { - avroSerializer.configure(config, false); - for (final GenericRecord avroRecord : avroRecords) { - out.write(avroSerializer.serialize(topic, avroRecord)); - } - return out.toByteArray(); - } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException - | IOException e) { - LOGGER.error("Error in reading s3 object stream for topic {} with error : {}", topic, e.getMessage(), e); - } - return new byte[0]; - } -} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java index 26f46f5c6..196d9ae3c 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java @@ -25,6 +25,7 @@ import java.util.stream.StreamSupport; import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.commons.io.function.IOSupplier; import org.slf4j.Logger; @@ -57,7 +58,9 @@ public final Stream<T> getRecords(final IOSupplier<InputStream> inputStreamIOSup protected abstract StreamSpliterator<T> createSpliterator(IOSupplier<InputStream> inputStreamIOSupplier, String topic, int topicPartition, AbstractConfig sourceConfig); - public abstract byte[] getValueBytes(T record, 
String topic, AbstractConfig sourceConfig); + public abstract SchemaAndValue getValueData(T record, String topic, AbstractConfig sourceConfig); + + public abstract SchemaAndValue getKeyData(Object cloudStorageKey, String topic, AbstractConfig sourceConfig); /** * A Spliterator that performs various checks on the opening/closing of the input stream. diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java index f868d7328..43a1b0ef7 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java @@ -17,12 +17,22 @@ package io.aiven.kafka.connect.common.source.input; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMAS_ENABLE; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.kafka.connect.json.JsonConverter; import io.aiven.kafka.connect.common.config.SchemaRegistryFragment; import io.aiven.kafka.connect.common.config.SourceCommonConfig; +import io.confluent.connect.avro.AvroData; + public final class TransformerFactory { + public static final int CACHE_SIZE = 100; + private TransformerFactory() { // hidden } @@ -30,11 +40,13 @@ public static Transformer getTransformer(final SourceCommonConfig sourceConfig) final InputFormat inputFormatEnum = new SchemaRegistryFragment(sourceConfig).getInputFormat(); switch (inputFormatEnum) { case AVRO : - return new AvroTransformer(); + return new AvroTransformer(new AvroData(CACHE_SIZE)); case PARQUET : - return new ParquetTransformer(); + return new ParquetTransformer(new AvroData(CACHE_SIZE)); case JSONL : - return new JsonTransformer(); + final JsonConverter jsonConverter = new JsonConverter(); + configureJsonConverter(jsonConverter); + return new JsonTransformer(jsonConverter); case BYTES : return new ByteArrayTransformer(); default : @@ -42,4 +54,10 @@ public static Transformer getTransformer(final SourceCommonConfig sourceConfig) "Unknown input format in configuration: " + sourceConfig.getString(INPUT_FORMAT_KEY)); } } + + private static void configureJsonConverter(final JsonConverter jsonConverter) { + final Map<String, String> config = new HashMap<>(); + config.put(SCHEMAS_ENABLE, "false"); + jsonConverter.configure(config, false); + } } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java index 67b028283..50e54a284 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java @@ -34,6 +34,7 @@ import io.aiven.kafka.connect.common.config.SourceCommonConfig; +import io.confluent.connect.avro.AvroData; import org.apache.avro.Schema; import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericData; @@ -57,7 +58,7 @@ final class AvroTransformerTest { @BeforeEach void setUp() { - avroTransformer = new AvroTransformer(); + avroTransformer = new AvroTransformer(new AvroData(100)); config = new HashMap<>(); } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java 
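/*
 * Illustrative sketch (not part of this patch) of the conversion paths the commit message describes: Confluent's
 * AvroData maps an Avro GenericRecord straight to a Connect SchemaAndValue, and a JsonConverter configured with
 * schemas.enable=false does the same for JSON bytes, so no serializer/Converter round trip is needed inside the
 * connector. Class and topic names here are hypothetical.
 */
package io.aiven.kafka.connect.common.source.input;

import java.nio.charset.StandardCharsets;
import java.util.Map;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.connect.data.SchemaAndValue;
import org.apache.kafka.connect.json.JsonConverter;

import io.confluent.connect.avro.AvroData;

public final class SchemaAndValueSketch {

    private SchemaAndValueSketch() {
        // holder for the example
    }

    public static void main(final String[] args) {
        // Avro: AvroData caches converted schemas; 100 matches TransformerFactory.CACHE_SIZE above.
        final Schema schema = new Schema.Parser()
                .parse("{\"type\":\"record\",\"name\":\"TestRecord\","
                        + "\"fields\":[{\"name\":\"message\",\"type\":\"string\"}]}");
        final GenericRecord record = new GenericData.Record(schema);
        record.put("message", "hello");
        final SchemaAndValue avroValue = new AvroData(100).toConnectData(schema, record);

        // JSON: with schemas.enable=false the converter yields schemaless Connect data (a Map).
        final JsonConverter jsonConverter = new JsonConverter();
        jsonConverter.configure(Map.of("schemas.enable", "false"), false);
        final SchemaAndValue jsonValue = jsonConverter.toConnectData("topic",
                "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8));

        System.out.println(avroValue.value() + " / " + jsonValue.value());
    }
}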
b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java index d43db66bf..ee6b76001 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java @@ -76,7 +76,8 @@ void testGetRecordsEmptyInputStream() { @Test void testGetValueBytes() { final byte[] record = { 1, 2, 3 }; - final byte[] result = byteArrayTransformer.getValueBytes(record, TEST_TOPIC, sourceCommonConfig); + final byte[] result = (byte[]) byteArrayTransformer.getValueData(record, TEST_TOPIC, sourceCommonConfig) + .value(); assertThat(result).containsExactlyInAnyOrder(record); } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java index 73a659575..a38a2bc8a 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java @@ -18,6 +18,7 @@ import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMAS_ENABLE; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -31,11 +32,13 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.kafka.connect.errors.DataException; +import org.apache.kafka.connect.json.JsonConverter; + import io.aiven.kafka.connect.common.config.SourceCommonConfig; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.io.function.IOSupplier; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -53,19 +56,22 @@ final class JsonTransformerTest { @Mock private IOSupplier<InputStream> inputStreamIOSupplierMock; + JsonConverter jsonConverter; + @BeforeEach void setUp() { - jsonTransformer = new JsonTransformer(); + jsonConverter = new JsonConverter(); + final Map<String, String> config = new HashMap<>(); + config.put(SCHEMAS_ENABLE, "false"); + jsonConverter.configure(config, false); + + jsonTransformer = new JsonTransformer(jsonConverter); sourceCommonConfig = mock(SourceCommonConfig.class); } - @Test - void testConfigureValueConverter() { - final Map<String, String> config = new HashMap<>(); - - jsonTransformer.configureValueConverter(config, sourceCommonConfig); - assertThat(config).as("%s should be set to false", SCHEMAS_ENABLE) - .containsEntry(SCHEMAS_ENABLE, Boolean.FALSE.toString()); + @AfterEach + void destroy() { + jsonConverter.close(); } @Test @@ -73,7 +79,7 @@ void testHandleValueDataWithValidJson() { final InputStream validJsonInputStream = new ByteArrayInputStream( "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; - final Stream<JsonNode> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + final Stream<byte[]> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, sourceCommonConfig, 0); assertThat(jsonNodes).hasSize(1); @@ -84,12 +90,12 @@ void testHandleValueDataWithValidJsonSkipFew() { final InputStream 
validJsonInputStream = new ByteArrayInputStream( getJsonRecs(100).getBytes(StandardCharsets.UTF_8)); final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; - final Stream<JsonNode> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + final Stream<byte[]> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, sourceCommonConfig, 25L); - final List<Object> recs = jsonNodes.collect(Collectors.toList()); + final List<byte[]> recs = jsonNodes.collect(Collectors.toList()); assertThat(recs).hasSize(75); - assertThat(recs).extracting(record -> ((JsonNode) record).get("key").asText()) + assertThat(recs).extracting(record -> ((Map) jsonTransformer.getValueData(record, "", null).value()).get("key")) .doesNotContain("value1") .doesNotContain("value2") .doesNotContain("value25") @@ -104,10 +110,12 @@ void testHandleValueDataWithInvalidJson() { "invalid-json".getBytes(StandardCharsets.UTF_8)); final IOSupplier<InputStream> inputStreamIOSupplier = () -> invalidJsonInputStream; - final Stream<JsonNode> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + final Stream<byte[]> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, sourceCommonConfig, 0); - assertThat(jsonNodes).isEmpty(); + assertThatThrownBy(() -> jsonTransformer.getValueData(jsonNodes.findAny().get(), "", null)) + .isInstanceOf(DataException.class) + .hasMessage("Converting byte[] to Kafka Connect data failed due to serialization error: "); } @Test @@ -115,21 +123,22 @@ void testSerializeJsonDataValid() throws IOException { final InputStream validJsonInputStream = new ByteArrayInputStream( "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; - final Stream<JsonNode> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + final Stream<byte[]> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, sourceCommonConfig, 0); - final byte[] serializedData = jsonTransformer.getValueBytes(jsonNodes.findFirst().get(), TESTTOPIC, - sourceCommonConfig); - - final ObjectMapper objectMapper = new ObjectMapper(); - final JsonNode expectedData = objectMapper.readTree(serializedData); - - assertThat(expectedData.get("key").asText()).isEqualTo("value"); + final Object serializedData = jsonTransformer + .getValueData( + jsonNodes.findFirst().orElseThrow(() -> new AssertionError("No records found in stream!")), + TESTTOPIC, sourceCommonConfig) + .value(); + + // Assert: Verify the serialized data + assertThat(serializedData).isInstanceOf(Map.class).extracting("key").isEqualTo("value"); } @Test void testGetRecordsWithIOException() throws IOException { when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException")); - final Stream<JsonNode> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); + final Stream<byte[]> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); assertThat(resultStream).isEmpty(); } @@ -137,7 +146,7 @@ void testGetRecordsWithIOException() throws IOException { @Test void testCustomSpliteratorWithIOExceptionDuringInitialization() throws IOException { when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException during initialization")); - final Stream<JsonNode> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); + final Stream<byte[]> 
resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); assertThat(resultStream).isEmpty(); } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java index cde0a11ac..154baf45a 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java @@ -35,6 +35,7 @@ import io.aiven.kafka.connect.common.config.SourceCommonConfig; +import io.confluent.connect.avro.AvroData; import org.apache.avro.generic.GenericRecord; import org.apache.commons.io.IOUtils; import org.apache.commons.io.function.IOSupplier; @@ -50,7 +51,7 @@ final class ParquetTransformerTest { @BeforeEach public void setUp() { - parquetTransformer = new ParquetTransformer(); + parquetTransformer = new ParquetTransformer(new AvroData(100)); } @Test diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java index 9caa098f1..f61dd9423 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java @@ -32,9 +32,11 @@ import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.connect.json.JsonConverter; import io.aiven.kafka.connect.common.config.CommonConfig; +import io.confluent.connect.avro.AvroData; import org.apache.commons.io.function.IOSupplier; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; @@ -84,17 +86,18 @@ void verifyCloseCalledAtIteratorEnd(final Transformer<?> transformer, final byte static Stream<Arguments> testData() throws IOException { final List<Arguments> lst = new ArrayList<>(); - lst.add(Arguments.of(new AvroTransformer(), AvroTransformerTest.generateMockAvroData(100).toByteArray(), + final AvroData avroData = new AvroData(100); + lst.add(Arguments.of(new AvroTransformer(avroData), AvroTransformerTest.generateMockAvroData(100).toByteArray(), new CommonConfig(new ConfigDef(), new HashMap<>()) { }, 100)); lst.add(Arguments.of(new ByteArrayTransformer(), "Hello World".getBytes(StandardCharsets.UTF_8), new CommonConfig(new ConfigDef(), new HashMap<>()) { }, 1)); - lst.add(Arguments.of(new JsonTransformer(), + lst.add(Arguments.of(new JsonTransformer(new JsonConverter()), JsonTransformerTest.getJsonRecs(100).getBytes(StandardCharsets.UTF_8), new CommonConfig(new ConfigDef(), new HashMap<>()) { }, 100)); - lst.add(Arguments.of(new ParquetTransformer(), ParquetTransformerTest.generateMockParquetData(), + lst.add(Arguments.of(new ParquetTransformer(avroData), ParquetTransformerTest.generateMockParquetData(), new CommonConfig(new ConfigDef(), new HashMap<>()) { }, 100)); return lst.stream(); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index be3d89618..aa331b4aa 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -18,21 +18,17 
@@ import static io.aiven.kafka.connect.common.config.SourceConfigFragment.MAX_POLL_RECORDS; -import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Optional; import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; -import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.source.SourceTask; -import org.apache.kafka.connect.storage.Converter; import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.common.source.input.TransformerFactory; @@ -71,10 +67,6 @@ public class S3SourceTask extends SourceTask { private AmazonS3 s3Client; private Iterator<S3SourceRecord> sourceRecordIterator; - private Optional<Converter> keyConverter; - - private Converter valueConverter; - private Transformer transformer; private boolean taskInitialized; @@ -102,7 +94,6 @@ public String version() { public void start(final Map<String, String> props) { LOGGER.info("S3 Source task started."); s3SourceConfig = new S3SourceConfig(props); - initializeConverters(); this.transformer = TransformerFactory.getTransformer(s3SourceConfig); offsetManager = new OffsetManager(context, s3SourceConfig); awsv2SourceClient = new AWSV2SourceClient(s3SourceConfig, failedObjectKeys); @@ -110,21 +101,6 @@ public void start(final Map<String, String> props) { this.taskInitialized = true; } - private void initializeConverters() { - try { - keyConverter = Optional - .of((Converter) Class.forName((String) s3SourceConfig.originals().get("key.converter")) - .getDeclaredConstructor() - .newInstance()); - valueConverter = (Converter) Class.forName((String) s3SourceConfig.originals().get("value.converter")) - .getDeclaredConstructor() - .newInstance(); - } catch (ClassNotFoundException | InstantiationException | IllegalAccessException | InvocationTargetException - | NoSuchMethodException e) { - throw new ConnectException("Connect converters could not be instantiated.", e); - } - } - private void prepareReaderFromOffsetStorageReader() { sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, this.transformer, awsv2SourceClient); @@ -174,8 +150,8 @@ private List<SourceRecord> extractSourceRecords(final List<SourceRecord> results if (connectorStopped.get()) { return results; } - return RecordProcessor.processRecords(sourceRecordIterator, results, s3SourceConfig, keyConverter, - valueConverter, connectorStopped, this.transformer, awsv2SourceClient, offsetManager); + return RecordProcessor.processRecords(sourceRecordIterator, results, s3SourceConfig, connectorStopped, + awsv2SourceClient, offsetManager); } private void waitForObjects() throws InterruptedException { @@ -200,14 +176,6 @@ private void closeResources() { } // below for visibility in tests - public Optional<Converter> getKeyConverter() { - return keyConverter; - } - - public Converter getValueConverter() { - return valueConverter; - } - public Transformer getTransformer() { return transformer; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index b89cdfd81..bdf265338 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ 
b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -16,21 +16,15 @@ package io.aiven.kafka.connect.s3.source.utils; -import java.util.HashMap; import java.util.Iterator; import java.util.List; -import java.util.Map; -import java.util.Optional; import java.util.concurrent.atomic.AtomicBoolean; -import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.source.SourceRecord; -import org.apache.kafka.connect.storage.Converter; import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; -import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import org.slf4j.Logger; @@ -45,19 +39,16 @@ private RecordProcessor() { } public static List<SourceRecord> processRecords(final Iterator<S3SourceRecord> sourceRecordIterator, - final List<SourceRecord> results, final S3SourceConfig s3SourceConfig, - final Optional<Converter> keyConverter, final Converter valueConverter, - final AtomicBoolean connectorStopped, final Transformer transformer, final AWSV2SourceClient sourceClient, - final OffsetManager offsetManager) { + final List<SourceRecord> results, final S3SourceConfig s3SourceConfig, final AtomicBoolean connectorStopped, + final AWSV2SourceClient sourceClient, final OffsetManager offsetManager) { - final Map<String, String> conversionConfig = new HashMap<>(); final int maxPollRecords = s3SourceConfig.getMaxPollRecords(); for (int i = 0; sourceRecordIterator.hasNext() && i < maxPollRecords && !connectorStopped.get(); i++) { final S3SourceRecord s3SourceRecord = sourceRecordIterator.next(); if (s3SourceRecord != null) { - final SourceRecord sourceRecord = createSourceRecord(s3SourceRecord, s3SourceConfig, keyConverter, - valueConverter, conversionConfig, transformer, sourceClient, offsetManager); + final SourceRecord sourceRecord = createSourceRecord(s3SourceRecord, s3SourceConfig, sourceClient, + offsetManager); results.add(sourceRecord); } } @@ -66,20 +57,11 @@ public static List<SourceRecord> processRecords(final Iterator<S3SourceRecord> s } static SourceRecord createSourceRecord(final S3SourceRecord s3SourceRecord, final S3SourceConfig s3SourceConfig, - final Optional<Converter> keyConverter, final Converter valueConverter, - final Map<String, String> conversionConfig, final Transformer transformer, final AWSV2SourceClient sourceClient, final OffsetManager offsetManager) { - - final String topic = s3SourceRecord.getTopic(); - final Optional<SchemaAndValue> keyData = keyConverter.map(c -> c.toConnectData(topic, s3SourceRecord.key())); - - transformer.configureValueConverter(conversionConfig, s3SourceConfig); - valueConverter.configure(conversionConfig, false); try { - final SchemaAndValue schemaAndValue = valueConverter.toConnectData(topic, s3SourceRecord.value()); offsetManager.updateCurrentOffsets(s3SourceRecord.getPartitionMap(), s3SourceRecord.getOffsetMap()); s3SourceRecord.setOffsetMap(offsetManager.getOffsets().get(s3SourceRecord.getPartitionMap())); - return s3SourceRecord.getSourceRecord(topic, keyData, schemaAndValue); + return s3SourceRecord.getSourceRecord(); } catch (DataException e) { if (ErrorsTolerance.NONE.equals(s3SourceConfig.getErrorsTolerance())) { throw new ConnectException("Data Exception caught during S3 record to source record transformation", e); diff --git 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java index 7880bf868..c4be50217 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java @@ -19,7 +19,6 @@ import java.util.Collections; import java.util.HashMap; import java.util.Map; -import java.util.Optional; import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.kafka.connect.source.SourceRecord; @@ -29,22 +28,21 @@ public class S3SourceRecord { private Map<String, Object> offsetMap; private final String topic; private final Integer topicPartition; - private final byte[] recordKey; - private final byte[] recordValue; + private final SchemaAndValue keyData; + + private final SchemaAndValue valueData; private final String objectKey; public S3SourceRecord(final Map<String, Object> partitionMap, final Map<String, Object> offsetMap, - final String topic, final Integer topicPartition, final byte[] recordKey, final byte[] recordValue, - final String objectKey) { + final String topic, final Integer topicPartition, final String objectKey, final SchemaAndValue keyData, + final SchemaAndValue valueData) { this.partitionMap = new HashMap<>(partitionMap); this.offsetMap = new HashMap<>(offsetMap); - this.topic = topic; this.topicPartition = topicPartition; - this.recordKey = recordKey.clone(); // Defensive copy - this.recordValue = recordValue.clone(); // Defensive copy - + this.keyData = keyData; + this.valueData = valueData; this.objectKey = objectKey; } @@ -64,14 +62,6 @@ public Integer partition() { return topicPartition; } - public byte[] key() { - return (recordKey == null) ? null : recordKey.clone(); // Return a defensive copy - } - - public byte[] value() { - return (recordValue == null) ? 
null : recordValue.clone(); // Return a defensive copy - } - public String getObjectKey() { return objectKey; } @@ -80,10 +70,8 @@ public void setOffsetMap(final Map<String, Object> offsetMap) { this.offsetMap = new HashMap<>(offsetMap); } - public SourceRecord getSourceRecord(final String topic, final Optional<SchemaAndValue> keyData, - final SchemaAndValue schemaAndValue) { - return new SourceRecord(getPartitionMap(), getOffsetMap(), topic, partition(), - keyData.map(SchemaAndValue::schema).orElse(null), keyData.map(SchemaAndValue::value).orElse(null), - schemaAndValue.schema(), schemaAndValue.value()); + public SourceRecord getSourceRecord() { + return new SourceRecord(getPartitionMap(), getOffsetMap(), topic, partition(), keyData.schema(), + keyData.value(), valueData.schema(), valueData.value()); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 6cab0d12f..ac5a3061a 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -17,7 +17,6 @@ package io.aiven.kafka.connect.s3.source.utils; import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; @@ -27,6 +26,8 @@ import java.util.regex.Pattern; import java.util.stream.Stream; +import org.apache.kafka.connect.data.SchemaAndValue; + import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; @@ -141,18 +142,15 @@ private List<S3SourceRecord> readNext() { return sourceRecords; } - final byte[] keyBytes = currentObjectKey.getBytes(StandardCharsets.UTF_8); - try (Stream<Object> recordStream = transformer.getRecords(s3Object::getObjectContent, topic, topicPartition, s3SourceConfig, numberOfRecsAlreadyProcessed)) { final Iterator<Object> recordIterator = recordStream.iterator(); while (recordIterator.hasNext()) { final Object record = recordIterator.next(); - final byte[] valueBytes = transformer.getValueBytes(record, topic, s3SourceConfig); - - sourceRecords.add(getSourceRecord(keyBytes, valueBytes, topic, topicPartition, offsetManager, - startOffset, partitionMap)); + sourceRecords.add(getSourceRecord(topic, topicPartition, offsetManager, startOffset, + partitionMap, transformer.getValueData(record, topic, s3SourceConfig), + transformer.getKeyData(currentObjectKey, topic, s3SourceConfig))); // Break if we have reached the max records per poll if (sourceRecords.size() >= s3SourceConfig.getMaxPollRecords()) { @@ -171,9 +169,9 @@ private boolean checkBytesTransformation(final Transformer transformer, && numberOfRecsAlreadyProcessed == BYTES_TRANSFORMATION_NUM_OF_RECS; } - private S3SourceRecord getSourceRecord(final byte[] key, final byte[] value, final String topic, - final int topicPartition, final OffsetManager offsetManager, final long startOffset, - final Map<String, Object> partitionMap) { + private S3SourceRecord getSourceRecord(final String topic, final int topicPartition, + final OffsetManager offsetManager, final long startOffset, final Map<String, Object> partitionMap, + final SchemaAndValue valueData, final SchemaAndValue keyData) { long currentOffset; @@ -189,7 +187,8 @@ 
private S3SourceRecord getSourceRecord(final byte[] key, final byte[] value, fin final Map<String, Object> offsetMap = offsetManager.getOffsetValueMap(currentObjectKey, currentOffset); - return new S3SourceRecord(partitionMap, offsetMap, topic, topicPartition, key, value, currentObjectKey); + return new S3SourceRecord(partitionMap, offsetMap, topic, topicPartition, currentObjectKey, keyData, + valueData); } @Override diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index b20b713fa..590ad23bb 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -27,13 +27,12 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Optional; import java.util.Random; -import org.apache.kafka.connect.converters.ByteArrayConverter; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.source.SourceTaskContext; -import org.apache.kafka.connect.storage.Converter; import org.apache.kafka.connect.storage.OffsetStorageReader; import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; @@ -130,13 +129,6 @@ void testS3SourceTaskInitialization() { final S3SourceTask s3SourceTask = new S3SourceTask(); startSourceTask(s3SourceTask); - final Optional<Converter> keyConverter = s3SourceTask.getKeyConverter(); - assertThat(keyConverter).isPresent(); - assertThat(keyConverter.get()).isInstanceOf(ByteArrayConverter.class); - - final Converter valueConverter = s3SourceTask.getValueConverter(); - assertThat(valueConverter).isInstanceOf(ByteArrayConverter.class); - final Transformer transformer = s3SourceTask.getTransformer(); assertThat(transformer).isInstanceOf(ByteArrayTransformer.class); @@ -174,7 +166,9 @@ void testStop() { } private static S3SourceRecord getAivenS3SourceRecord() { - return new S3SourceRecord(new HashMap<>(), new HashMap<>(), "testtopic", 0, new byte[0], new byte[0], ""); + return new S3SourceRecord(new HashMap<>(), new HashMap<>(), "testtopic", 0, "", + new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, new byte[0]), + new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, new byte[0])); } @SuppressWarnings("PMD.AvoidAccessibilityAlteration") diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java index 11dae1dc0..e02135d18 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -18,8 +18,6 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; @@ -27,15 +25,11 @@ import static org.mockito.internal.verification.VerificationModeFactory.times; import java.net.ConnectException; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; 
-import java.util.HashMap; import java.util.Iterator; import java.util.List; -import java.util.Optional; import java.util.concurrent.atomic.AtomicBoolean; -import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.storage.Converter; @@ -86,10 +80,8 @@ void testProcessRecordsNoRecords() { sourceRecordIterator, results, s3SourceConfig, - Optional.of(keyConverter), - valueConverter, connectorStopped, - transformer, sourceClient, offsetManager + sourceClient, offsetManager ); assertThat(processedRecords).as("Processed records should be empty when there are no records.").isEmpty(); @@ -108,10 +100,8 @@ void testProcessRecordsWithRecords() throws ConnectException { sourceRecordIterator, results, s3SourceConfig, - Optional.of(keyConverter), - valueConverter, connectorStopped, - transformer, sourceClient, offsetManager + sourceClient, offsetManager ); assertThat(results).hasSize(1); @@ -128,10 +118,8 @@ void testProcessRecordsConnectorStopped() { sourceRecordIterator, results, s3SourceConfig, - Optional.of(keyConverter), - valueConverter, connectorStopped, - transformer, sourceClient, offsetManager + sourceClient, offsetManager ); assertThat(processedRecords).as("Processed records should be empty when connector is stopped.").isEmpty(); @@ -141,16 +129,10 @@ void testProcessRecordsConnectorStopped() { @Test void testCreateSourceRecords() { final S3SourceRecord mockRecord = mock(S3SourceRecord.class); - when(mockRecord.getTopic()).thenReturn("test-topic"); - when(mockRecord.key()).thenReturn("mock-key".getBytes(StandardCharsets.UTF_8)); - when(mockRecord.value()).thenReturn("mock-value".getBytes(StandardCharsets.UTF_8)); + when(mockRecord.getSourceRecord()).thenReturn(mock(SourceRecord.class)); - when(valueConverter.toConnectData(anyString(), any())) - .thenReturn(new SchemaAndValue(null, "mock-value-converted")); - when(mockRecord.getSourceRecord(anyString(), any(), any())).thenReturn(mock(SourceRecord.class)); - - final SourceRecord sourceRecords = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, - Optional.of(keyConverter), valueConverter, new HashMap<>(), transformer, sourceClient, offsetManager); + final SourceRecord sourceRecords = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, + offsetManager); assertThat(sourceRecords).isNotNull(); } @@ -158,18 +140,12 @@ void testCreateSourceRecords() { @Test void errorToleranceOnNONE() { final S3SourceRecord mockRecord = mock(S3SourceRecord.class); - when(mockRecord.getTopic()).thenReturn("test-topic"); - when(mockRecord.key()).thenReturn("mock-key".getBytes(StandardCharsets.UTF_8)); - when(mockRecord.value()).thenReturn("mock-value".getBytes(StandardCharsets.UTF_8)); - - when(valueConverter.toConnectData(anyString(), any())) - .thenReturn(new SchemaAndValue(null, "mock-value-converted")); - when(mockRecord.getSourceRecord(anyString(), any(), any())).thenThrow(new DataException("generic issue")); + when(mockRecord.getSourceRecord()).thenThrow(new DataException("generic issue")); when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.NONE); - assertThatThrownBy(() -> RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, - Optional.of(keyConverter), valueConverter, new HashMap<>(), transformer, sourceClient, offsetManager)) + assertThatThrownBy( + () -> RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, offsetManager)) 
.isInstanceOf(org.apache.kafka.connect.errors.ConnectException.class) .hasMessage("Data Exception caught during S3 record to source record transformation"); @@ -178,18 +154,12 @@ void errorToleranceOnNONE() { @Test void errorToleranceOnALL() { final S3SourceRecord mockRecord = mock(S3SourceRecord.class); - when(mockRecord.getTopic()).thenReturn("test-topic"); - when(mockRecord.key()).thenReturn("mock-key".getBytes(StandardCharsets.UTF_8)); - when(mockRecord.value()).thenReturn("mock-value".getBytes(StandardCharsets.UTF_8)); - - when(valueConverter.toConnectData(anyString(), any())) - .thenReturn(new SchemaAndValue(null, "mock-value-converted")); - when(mockRecord.getSourceRecord(anyString(), any(), any())).thenThrow(new DataException("generic issue")); + when(mockRecord.getSourceRecord()).thenThrow(new DataException("generic issue")); when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.ALL); - assertThat(RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, Optional.of(keyConverter), - valueConverter, new HashMap<>(), transformer, sourceClient, offsetManager)).isNull(); + assertThat(RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, offsetManager)) + .isNull(); } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index d73068bfd..61d8170f7 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -30,7 +30,6 @@ import static org.mockito.Mockito.when; import java.io.ByteArrayInputStream; -import java.nio.charset.StandardCharsets; import java.util.Collections; import java.util.stream.Stream; @@ -75,10 +74,6 @@ void testIteratorProcessesS3Objects() throws Exception { when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) .thenReturn(Stream.of(new Object())); - final String outStr = "this is a test"; - when(mockTransformer.getValueBytes(any(), anyString(), any())) - .thenReturn(outStr.getBytes(StandardCharsets.UTF_8)); - when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); when(mockSourceApiClient.getListOfObjectKeys(any())).thenReturn(Collections.emptyIterator()); @@ -115,11 +110,6 @@ void testIteratorProcessesS3ObjectsForByteArrayTransformer() throws Exception { when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) .thenReturn(Stream.of(new Object())); - final String outStr = "this is a test"; - - when(mockTransformer.getValueBytes(any(), anyString(), any())) - .thenReturn(outStr.getBytes(StandardCharsets.UTF_8)); - when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); when(mockSourceApiClient.getListOfObjectKeys(any())) From 9d78391e17d1c18b73e32aa24e28480dd672e6a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aindri=C3=BA=20Lavelle?= <121855584+aindriu-aiven@users.noreply.github.com> Date: Thu, 19 Dec 2024 09:33:57 +0000 Subject: [PATCH 82/90] Add Service Loader for quick start up (#375) Add Service Loader for quick start up Signed-off-by: Aindriu Lavelle <aindriu.lavelle@aiven.io> --- .../services/org.apache.kafka.connect.source.SourceConnector | 1 + 1 file changed, 1 insertion(+) create mode 100644 s3-source-connector/src/main/resources/META-INF/services/org.apache.kafka.connect.source.SourceConnector diff --git 
a/s3-source-connector/src/main/resources/META-INF/services/org.apache.kafka.connect.source.SourceConnector b/s3-source-connector/src/main/resources/META-INF/services/org.apache.kafka.connect.source.SourceConnector
new file mode 100644
index 000000000..46c0eaf4f
--- /dev/null
+++ b/s3-source-connector/src/main/resources/META-INF/services/org.apache.kafka.connect.source.SourceConnector
@@ -0,0 +1 @@
+io.aiven.kafka.connect.s3.source.AivenKafkaConnectS3SourceConnector

From e0184bbce025ddd756407e818f0081721ee36805 Mon Sep 17 00:00:00 2001
From: Murali Basani <muralidhar.basani@aiven.io>
Date: Tue, 24 Dec 2024 14:55:16 +0100
Subject: [PATCH 83/90] Adding readme for s3 source [KCON-9] (#376)

KCON-9 Read me to configure aws s3 source connector
---
 s3-source-connector/README.md | 276 +++++++++++++++++++++++++++++++++-
 1 file changed, 274 insertions(+), 2 deletions(-)

diff --git a/s3-source-connector/README.md b/s3-source-connector/README.md
index 2ee9caacc..3c236d4d0 100644
--- a/s3-source-connector/README.md
+++ b/s3-source-connector/README.md
@@ -1,6 +1,6 @@
 # Aiven's S3 Source Connector for Apache Kafka
 
-This is a source Apache Kafka Connect connector that stores Apache Kafka messages in an AWS S3 bucket.
+This is a source Apache Kafka Connect connector that reads objects from an AWS S3 bucket and writes them to an Apache Kafka topic.
 
 **Table of Contents**
 - [How it works](#how-it-works)
 - [Data Format](#data-format)
 - [Usage](#usage)
 - [Configuration](#configuration)
 - [Development](#development)
 
@@ -21,7 +21,279 @@ published into the corresponding Kafka topic.
 
 The connector requires Java 11 or newer for development and production.
 
-## TODO update documentation
+#### Authorization
+
+The connector needs the following permissions on the specified bucket:
+* ``s3:GetObject``
+* ``s3:ListObjectsV2``
+
+In case of an ``Access Denied`` error, see https://aws.amazon.com/premiumsupport/knowledge-center/s3-troubleshoot-403/
+
+#### Authentication
+
+To make the connector work, a user has to specify AWS credentials that allow reading from S3.
+There are three ways to specify AWS credentials in this connector:
+
+1) Long term credentials.
+
+   It requires both `aws.access.key.id` and `aws.secret.access.key` to be specified.
+2) Short term credentials.
+
+   The connector will request a temporary token from the AWS STS service and assume a role from another AWS account.
+   It requires `aws.sts.role.arn` and `aws.sts.role.session.name` to be specified.
+3) Use the default provider chain or a custom provider.
+
+   If you prefer to use the AWS default provider chain, you can leave {`aws.access.key.id` and `aws.secret.access.key`} and
+   {`aws.sts.role.arn`, `aws.sts.role.session.name`} blank. In case you prefer to build your own custom
+   provider, pass the custom provider class as a parameter to `aws.credential.provider`.
+
+It is important not to use both 1 and 2 simultaneously.
+When using option 2, it is recommended to specify the S3 bucket region in `aws.s3.region` and the
+corresponding AWS STS endpoint in `aws.sts.config.endpoint`. It's better to specify both or none.
+It is also important to specify `aws.sts.role.external.id` for security reasons
+(see some details [here](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-user_externalid.html)).
+
+### File name format
+
+> File name format is tightly related to [Record Grouping](#record-grouping)
+
+The connector uses the following format for input files (blobs):
+`<prefix><filename>`.
+
+`<prefix>` is the optional prefix that can be used, for example, for
+subdirectories in the bucket.
+`<filename>` is the file name. The connector has a fixed
+template for file names.
+
+   Fixed template for file: `{{topic}}-{{partition}}-{{start_offset}}`
+
+Example object name: customertopic-00001-1734445664111.txt
+
+## Data Format
+
+Connector class name, in this case: `io.aiven.kafka.connect.s3.AivenKafkaConnectS3SourceConnector`.
+
+### S3 Object Names
+
+The S3 connector reads a series of files in the specified bucket.
+Each object key follows the pattern `[<aws.s3.prefix>]<topic>-<partition>-<start_offset>-<extension>`
+
+### Kafka topic names
+S3 object keys contain the topic name, which is used as the target Kafka topic.
+
+### Data File Format
+
+S3 object files are text files that contain one record per line (i.e.,
+records are separated by `\n`), except for the `PARQUET` format.
+
+There are four types of data format available:
+
+- Complex structure, where the file is in [JSON lines](https://jsonlines.org/) format.
+  It contains one record per line and each line is a valid JSON object (`jsonl`).
+
+  Configuration: ```input.format=jsonl```.
+
+- Complex structure, where the file is a valid Avro file with multiple records.
+
+  Configuration: ```input.format=avro```.
+
+- Complex structure, where the file is in the Apache Parquet file format.
+
+  Configuration: ```input.format=parquet```.
+
+- Raw structure, where the file is read as plain bytes.
+
+  Configuration: ```input.format=bytes```.
+
+The connector can output the following fields from records into the
+output: the key, the value, the timestamp, the offset and headers. The set and the order of
+these output fields are configurable. The field values are separated by comma.
+
+#### JSONL Format example
+
+For example, if we output `key,value,offset,timestamp`, a record line might look like:
+
+```json
+ { "key": "k1", "value": "v0", "offset": 1232155, "timestamp":"2020-01-01T00:00:01Z" }
+```
+
+org.apache.kafka.connect.json.JsonConverter is used internally to convert this data and make the output human-readable.
+
+**NB!**
+
+- The value/key schema will not be present in the output Kafka event, even if the `value.converter.schemas.enable` property is `true`;
+  setting it to `true` currently has no effect.
+
+#### Parquet or Avro format example
+
+For example, if we input `key,offset,timestamp,headers,value`, an input Parquet schema in an S3 object might look like this:
+```json
+{
+    "type": "record", "fields": [
+      {"name": "key", "type": "RecordKeySchema"},
+      {"name": "offset", "type": "long"},
+      {"name": "timestamp", "type": "long"},
+      {"name": "headers", "type": "map"},
+      {"name": "value", "type": "RecordValueSchema"}
+  ]
+}
+```
+where `RecordKeySchema` is a key schema and `RecordValueSchema` is a record value schema.
+This means that in case you have the record and key schema like:
+
+Key schema:
+```json
+{
+  "type": "string"
+}
+```
+
+Record schema:
+```json
+{
+    "type": "record", "fields": [
+      {"name": "foo", "type": "string"},
+      {"name": "bar", "type": "long"}
+  ]
+}
+```
+the final `Avro` schema for `Parquet` is:
+```json
+{
+    "type": "record", "fields": [
+      {"name": "key", "type": "string"},
+      {"name": "offset", "type": "long"},
+      {"name": "timestamp", "type": "long"},
+      {"name": "headers", "type": "map", "values": "long"},
+      { "name": "value",
+        "type": "record",
+        "fields": [
+          {"name": "foo", "type": "string"},
+          {"name": "bar", "type": "long"}
+        ]
+      }
+  ]
+}
+```
+**NB!**
+
+- The connector works with or without a Schema Registry.
+
+## Usage
+
+### Connector Configuration
+
+> **Important Note** Since this connector is developed in alignment with the S3 sink connector,
+> and since version `2.6` all existing configuration in the S3 sink
+> is deprecated and will be replaced with the new one during a transition period (within 2-3 releases). Most of the
+> configuration parameters remain the same.
+
+List of new configuration parameters:
+- `aws.access.key.id` - AWS Access Key ID for accessing the S3 bucket.
+- `aws.secret.access.key` - AWS S3 Secret Access Key.
+- `aws.s3.bucket.name` - Name of an existing bucket for storing the records. Mandatory. See bucket name rules: <https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html>
+- `aws.s3.endpoint` - The endpoint configuration (service endpoint & signing region) to be used for requests.
+- `aws.s3.prefix` - The prefix that will be added to the file name in the bucket. Can be used for reading files from a subdirectory.
+- `aws.s3.region` - Name of the region for the bucket used for storing the records. Defaults to `us-east-1`.
+- `aws.sts.role.arn` - AWS role ARN, for cross-account access role instead of `aws.access.key.id` and `aws.secret.access.key`
+- `aws.sts.role.external.id` - AWS ExternalId for cross-account access role
+- `aws.sts.role.session.name` - AWS session name for cross-account access role
+- `aws.sts.role.session.duration` - Session duration for cross-account access role in seconds. Minimum value - 900.
+- `aws.sts.config.endpoint` - AWS STS endpoint for cross-account access role.
+
+## Configuration
+
+[Here](https://kafka.apache.org/documentation/#connect_running) you can
+read about the Connect workers configuration and
+[here](https://kafka.apache.org/documentation/#connect_resuming), about
+the connector configuration.
+
+Here is an example connector configuration with descriptions:
+
+```properties
+### Standard connector configuration
+
+## Fill in your values in these:
+
+## These must have exactly these values:
+
+# The Java class for the connector
+connector.class=io.aiven.kafka.connect.s3.AivenKafkaConnectS3SourceConnector
+
+# Number of worker tasks to run concurrently
+tasks.max=1
+
+# The key converter for this connector
+key.converter=org.apache.kafka.connect.storage.StringConverter
+
+# The data format of the S3 objects that are read into Kafka events.
+# The supported values are: `jsonl`, `avro`, `parquet` and `bytes`.
+input.format=jsonl
+
+# A comma-separated list of topics to use as output for this connector
+# Also a regular expression version `topics.regex` is supported.
+# See https://kafka.apache.org/documentation/#connect_configuring
+topics=topic1,topic2
+
+# A comma-separated list of topic partitions where the connector's offset storage reader
+# can read the stored offsets for those partitions. If not mentioned, s3 objects will be read again if
+# available in the bucket
+topic.partitions=1,2,3
+
+### Connector-specific configuration
+### Fill in your values
+# AWS Access Key ID
+aws.access.key.id=YOUR_AWS_KEY_ID
+
+# AWS Access Secret Key
+aws.secret.access.key=YOUR_AWS_SECRET_ACCESS_KEY
+
+# AWS Region
+aws.s3.region=us-east-1
+
+# The name of the S3 bucket to use
+# Required.
+aws.s3.bucket.name=my-bucket
+
+# The prefix of the S3 bucket to use
+# Optional.
+aws.s3.prefix=file-prefix
+
+# Errors tolerance
+# Possible values 'none' or 'all'. Default being 'none'
+# Errors are logged and ignored for 'all'
+errors.tolerance=none
+```
+
+### Retry strategy configuration
+
+#### Apache Kafka connect retry strategy configuration property
+
+- `kafka.retry.backoff.ms` - The retry backoff in milliseconds. This config is used to notify Apache Kafka Connect to retry delivering a message batch or
+  performing recovery in case of transient exceptions. Maximum value is `24` hours.
+
+There are four configuration properties for configuring the retry strategy.
+
+#### AWS S3 retry strategy configuration properties
+
+- `aws.s3.backoff.delay.ms` - S3 default base sleep time
+for non-throttled exceptions in milliseconds.
+Default is `100` ms.
+- `aws.s3.backoff.max.delay.ms` - S3 maximum back-off
+time before retrying a request in milliseconds.
+Default is `20 000` ms.
+- `aws.s3.backoff.max.retries` - Maximum retry limit
+(if the value is greater than 30, there can be
+integer overflow issues during delay calculation).
+Default is `3`.
+
+### AWS S3 server side encryption properties
+
+- `aws.s3.sse.algorithm` - The name of the server-side encryption algorithm to use. If unset, the default SSE-S3 is used.
+- To use SSE-S3 set to `AES256` or leave empty
+- To use SSE-KMS set to `aws:kms`
+- To use DSSE-KMS set to `aws:kms:dsse`
 
 ## Development

From b4475c978a4a88e1684e85a6901d1c08d0c0337e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aindri=C3=BA=20Lavelle?= <121855584+aindriu-aiven@users.noreply.github.com>
Date: Mon, 30 Dec 2024 12:50:45 +0000
Subject: [PATCH 84/90] AWS SDK 2.X migration for source connector [KCON-84] (#374)

The AWS 1.X SDK is in maintenance mode and will be out of support by December 2025.

Key differences are:
* Use of the builder pattern when creating objects
* get and set removed from getters and setters, e.g. getKey(), setKey(newKey) -> key(), key(newKey)
* S3Client is immutable
* Different package names
* Additional built-in functionality, removing some of the work from the connector implementation and having the library handle it.

SDK 1.X is still in use by the sink connector and will need to be updated in the future as well; for now the s3-commons module contains both the 1.x and 2.x jars.
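To make the differences listed above concrete, here is a minimal, illustrative sketch that is not part of the patch; the bucket name, region, and class name are placeholders, and it simply lists a bucket with both SDK generations:

```java
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;
import software.amazon.awssdk.services.s3.model.S3Object;

public class SdkMigrationSketch {
    public static void main(final String[] args) {
        // AWS SDK for Java 1.x: client created via a static builder, results read via get-prefixed accessors.
        final AmazonS3 v1Client = AmazonS3ClientBuilder.standard().withRegion("us-east-1").build();
        for (final S3ObjectSummary summary : v1Client.listObjectsV2("my-bucket").getObjectSummaries()) {
            System.out.println(summary.getKey() + " -> " + summary.getSize());
        }

        // AWS SDK for Java 2.x: immutable client and request objects built with builders,
        // accessors drop the get/set prefix (key(), size()).
        final S3Client v2Client = S3Client.builder().region(Region.US_EAST_1).build();
        final ListObjectsV2Request request = ListObjectsV2Request.builder().bucket("my-bucket").build();
        for (final S3Object s3Object : v2Client.listObjectsV2(request).contents()) {
            System.out.println(s3Object.key() + " -> " + s3Object.size());
        }
    }
}
```

The v2 half mirrors the pattern used throughout the migrated classes below: immutable clients and requests built through builders, and accessors such as `key()` and `size()` without the `get` prefix.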
--------- Signed-off-by: Aindriu Lavelle <aindriu.lavelle@aiven.io> --- s3-commons/build.gradle.kts | 3 + .../connect/config/s3/S3ConfigFragment.java | 41 +++- .../iam/AwsCredentialProviderFactory.java | 34 ++++ s3-source-connector/build.gradle.kts | 9 +- .../connect/s3/source/IntegrationBase.java | 24 +-- .../connect/s3/source/IntegrationTest.java | 16 +- .../kafka/connect/s3/source/S3SourceTask.java | 10 +- .../s3/source/config/S3ClientFactory.java | 62 +++--- .../s3/source/config/S3SourceConfig.java | 20 +- .../s3/source/utils/AWSV2SourceClient.java | 64 ++++--- .../s3/source/utils/SourceRecordIterator.java | 34 ++-- .../connect/s3/source/S3SourceTaskTest.java | 46 +++-- .../s3/source/config/S3SourceConfigTest.java | 11 +- .../s3/source/testutils/BucketAccessor.java | 71 ++++--- .../s3/source/testutils/S3OutputStream.java | 181 ------------------ .../source/utils/AWSV2SourceClientTest.java | 95 +++++---- .../utils/SourceRecordIteratorTest.java | 21 +- 17 files changed, 325 insertions(+), 417 deletions(-) delete mode 100644 s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java diff --git a/s3-commons/build.gradle.kts b/s3-commons/build.gradle.kts index 0e3d825aa..5e54c05ef 100644 --- a/s3-commons/build.gradle.kts +++ b/s3-commons/build.gradle.kts @@ -18,10 +18,13 @@ plugins { id("aiven-apache-kafka-connectors-all.java-conventions") } val amazonS3Version by extra("1.12.777") val amazonSTSVersion by extra("1.12.777") +val amazonV2Version by extra("2.29.34") dependencies { implementation("com.amazonaws:aws-java-sdk-s3:$amazonS3Version") implementation("com.amazonaws:aws-java-sdk-sts:$amazonSTSVersion") + implementation("software.amazon.awssdk:auth:$amazonV2Version") + implementation("software.amazon.awssdk:sts:$amazonV2Version") implementation(project(":commons")) diff --git a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java b/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java index 1e87265b9..2ece623bf 100644 --- a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java +++ b/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java @@ -41,11 +41,13 @@ import com.amazonaws.services.s3.internal.BucketNameUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider; /** * The configuration fragment that defines the S3 specific characteristics. 
*/ -@SuppressWarnings({ "PMD.TooManyMethods", "PMD.ExcessiveImports", "PMD.TooManyStaticImports" }) +@SuppressWarnings({ "PMD.TooManyMethods", "PMD.ExcessiveImports", "PMD.TooManyStaticImports", "PMD.GodClass" }) public final class S3ConfigFragment extends ConfigFragment { private static final Logger LOGGER = LoggerFactory.getLogger(S3ConfigFragment.class); @@ -345,7 +347,8 @@ public void validateCredentials() { } } else { final BasicAWSCredentials awsCredentials = getAwsCredentials(); - if (awsCredentials == null) { + final AwsBasicCredentials awsCredentialsV2 = getAwsCredentialsV2(); + if (awsCredentials == null && awsCredentialsV2 == null) { LOGGER.info( "Connector use {} as credential Provider, " + "when configuration for {{}, {}} OR {{}, {}} are absent", @@ -410,11 +413,13 @@ public AwsStsEndpointConfig getStsEndpointConfig() { return new AwsStsEndpointConfig(cfg.getString(AWS_STS_CONFIG_ENDPOINT), cfg.getString(AWS_S3_REGION_CONFIG)); } + @Deprecated public AwsClientBuilder.EndpointConfiguration getAwsEndpointConfiguration() { final AwsStsEndpointConfig config = getStsEndpointConfig(); return new AwsClientBuilder.EndpointConfiguration(config.getServiceEndpoint(), config.getSigningRegion()); } + @Deprecated public BasicAWSCredentials getAwsCredentials() { if (Objects.nonNull(cfg.getPassword(AWS_ACCESS_KEY_ID_CONFIG)) && Objects.nonNull(cfg.getPassword(AWS_SECRET_ACCESS_KEY_CONFIG))) { @@ -430,12 +435,26 @@ public BasicAWSCredentials getAwsCredentials() { return null; } + public AwsBasicCredentials getAwsCredentialsV2() { + if (Objects.nonNull(cfg.getPassword(AWS_ACCESS_KEY_ID_CONFIG)) + && Objects.nonNull(cfg.getPassword(AWS_SECRET_ACCESS_KEY_CONFIG))) { + + return AwsBasicCredentials.create(cfg.getPassword(AWS_ACCESS_KEY_ID_CONFIG).value(), + cfg.getPassword(AWS_SECRET_ACCESS_KEY_CONFIG).value()); + } else if (Objects.nonNull(cfg.getPassword(AWS_ACCESS_KEY_ID)) + && Objects.nonNull(cfg.getPassword(AWS_SECRET_ACCESS_KEY))) { + LOGGER.warn("Config options {} and {} are not supported for this Connector", AWS_ACCESS_KEY_ID, + AWS_SECRET_ACCESS_KEY); + } + return null; + } + public String getAwsS3EndPoint() { return Objects.nonNull(cfg.getString(AWS_S3_ENDPOINT_CONFIG)) ? cfg.getString(AWS_S3_ENDPOINT_CONFIG) : cfg.getString(AWS_S3_ENDPOINT); } - + @Deprecated public Region getAwsS3Region() { // we have priority of properties if old one not set or both old and new one set // the new property value will be selected @@ -448,6 +467,18 @@ public Region getAwsS3Region() { } } + public software.amazon.awssdk.regions.Region getAwsS3RegionV2() { + // we have priority of properties if old one not set or both old and new one set + // the new property value will be selected + if (Objects.nonNull(cfg.getString(AWS_S3_REGION_CONFIG))) { + return software.amazon.awssdk.regions.Region.of(cfg.getString(AWS_S3_REGION_CONFIG)); + } else if (Objects.nonNull(cfg.getString(AWS_S3_REGION))) { + return software.amazon.awssdk.regions.Region.of(cfg.getString(AWS_S3_REGION)); + } else { + return software.amazon.awssdk.regions.Region.of(Regions.US_EAST_1.getName()); + } + } + public String getAwsS3BucketName() { return Objects.nonNull(cfg.getString(AWS_S3_BUCKET_NAME_CONFIG)) ? 
cfg.getString(AWS_S3_BUCKET_NAME_CONFIG)
@@ -484,6 +515,10 @@ public AWSCredentialsProvider getCustomCredentialsProvider() {
         return cfg.getConfiguredInstance(AWS_CREDENTIALS_PROVIDER_CONFIG, AWSCredentialsProvider.class);
     }
 
+    public AwsCredentialsProvider getCustomCredentialsProviderV2() {
+        return cfg.getConfiguredInstance(AWS_CREDENTIALS_PROVIDER_CONFIG, AwsCredentialsProvider.class);
+    }
+
     public int getFetchPageSize() {
         return cfg.getInt(FETCH_PAGE_SIZE);
     }
diff --git a/s3-commons/src/main/java/io/aiven/kafka/connect/iam/AwsCredentialProviderFactory.java b/s3-commons/src/main/java/io/aiven/kafka/connect/iam/AwsCredentialProviderFactory.java
index 2a5089726..167d872a7 100644
--- a/s3-commons/src/main/java/io/aiven/kafka/connect/iam/AwsCredentialProviderFactory.java
+++ b/s3-commons/src/main/java/io/aiven/kafka/connect/iam/AwsCredentialProviderFactory.java
@@ -26,6 +26,11 @@ import com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider;
 import com.amazonaws.services.securitytoken.AWSSecurityTokenService;
 import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClientBuilder;
+import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
+import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider;
+import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
+import software.amazon.awssdk.services.sts.auth.StsAssumeRoleCredentialsProvider;
+import software.amazon.awssdk.services.sts.model.AssumeRoleRequest;
 
 public class AwsCredentialProviderFactory {
@@ -58,4 +63,33 @@ private AWSSecurityTokenService securityTokenService(final S3ConfigFragment conf
         }
         return AWSSecurityTokenServiceClientBuilder.defaultClient();
     }
+
+    public AwsCredentialsProvider getAwsV2Provider(final S3ConfigFragment config) {
+
+        if (config.hasAwsStsRole()) {
+            return getV2StsProvider(config);
+        }
+        final AwsBasicCredentials awsCredentials = config.getAwsCredentialsV2();
+        if (Objects.isNull(awsCredentials)) {
+            return config.getCustomCredentialsProviderV2();
+        }
+        return StaticCredentialsProvider.create(awsCredentials);
+
+    }
+
+    private StsAssumeRoleCredentialsProvider getV2StsProvider(final S3ConfigFragment config) {
+        if (config.hasAwsStsRole()) {
+            return StsAssumeRoleCredentialsProvider.builder()
+                    .refreshRequest(() -> AssumeRoleRequest.builder()
+                            .roleArn(config.getStsRole().getArn())
+                            // Make this a unique identifier
+                            .roleSessionName("AwsV2SDKConnectorSession")
+                            .build())
+                    .build();
+        }
+
+        return StsAssumeRoleCredentialsProvider.builder().build();
+
+    }
+
 }
diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts
index 3530724e0..20d5a3b82 100644
--- a/s3-source-connector/build.gradle.kts
+++ b/s3-source-connector/build.gradle.kts
@@ -18,8 +18,8 @@ import com.github.spotbugs.snom.SpotBugsTask
 
 plugins { id("aiven-apache-kafka-connectors-all.java-conventions") }
 
-val amazonS3Version by extra("1.12.729")
-val amazonSTSVersion by extra("1.12.729")
+val amazonS3Version by extra("2.29.34")
+val amazonSTSVersion by extra("2.29.34")
 val s3mockVersion by extra("0.2.6")
 val testKafkaVersion by extra("3.7.1")
@@ -67,8 +67,8 @@ dependencies {
   implementation(project(":commons"))
   implementation(project(":s3-commons"))
 
-  implementation("com.amazonaws:aws-java-sdk-s3:$amazonS3Version")
-  implementation("com.amazonaws:aws-java-sdk-sts:$amazonSTSVersion")
+  implementation("software.amazon.awssdk:s3:$amazonS3Version")
+  implementation("software.amazon.awssdk:sts:$amazonSTSVersion")
 
   implementation(tools.spotbugs.annotations)
implementation(logginglibs.slf4j) @@ -154,7 +154,6 @@ dependencies { exclude(group = "org.apache.commons", module = "commons-math3") exclude(group = "org.apache.httpcomponents", module = "httpclient") exclude(group = "commons-codec", module = "commons-codec") - exclude(group = "commons-io", module = "commons-io") exclude(group = "commons-net", module = "commons-net") exclude(group = "org.eclipse.jetty") exclude(group = "org.eclipse.jetty.websocket") diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index 9ce09172b..6b505b996 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -22,6 +22,7 @@ import java.io.File; import java.io.IOException; import java.net.ServerSocket; +import java.net.URI; import java.nio.file.Files; import java.nio.file.Path; import java.time.Duration; @@ -47,11 +48,6 @@ import org.apache.kafka.common.serialization.StringDeserializer; import org.apache.kafka.connect.json.JsonDeserializer; -import com.amazonaws.auth.AWSStaticCredentialsProvider; -import com.amazonaws.auth.BasicAWSCredentials; -import com.amazonaws.client.builder.AwsClientBuilder; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3ClientBuilder; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; @@ -61,6 +57,10 @@ import org.testcontainers.containers.Container; import org.testcontainers.containers.localstack.LocalStackContainer; import org.testcontainers.utility.DockerImageName; +import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; +import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; +import software.amazon.awssdk.regions.Region; +import software.amazon.awssdk.services.s3.S3Client; public interface IntegrationBase { String PLUGINS_S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA = "plugins/s3-source-connector-for-apache-kafka/"; @@ -101,13 +101,13 @@ static void waitForRunningContainer(final Container<?> container) { await().atMost(Duration.ofMinutes(1)).until(container::isRunning); } - static AmazonS3 createS3Client(final LocalStackContainer localStackContainer) { - return AmazonS3ClientBuilder.standard() - .withEndpointConfiguration(new AwsClientBuilder.EndpointConfiguration( - localStackContainer.getEndpointOverride(LocalStackContainer.Service.S3).toString(), - localStackContainer.getRegion())) - .withCredentials(new AWSStaticCredentialsProvider(new BasicAWSCredentials( - localStackContainer.getAccessKey(), localStackContainer.getSecretKey()))) + static S3Client createS3Client(final LocalStackContainer localStackContainer) { + return S3Client.builder() + .endpointOverride( + URI.create(localStackContainer.getEndpointOverride(LocalStackContainer.Service.S3).toString())) + .region(Region.of(localStackContainer.getRegion())) + .credentialsProvider(StaticCredentialsProvider.create(AwsBasicCredentials + .create(localStackContainer.getAccessKey(), localStackContainer.getSecretKey()))) .build(); } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 
7f96842f3..884051e30 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -33,7 +33,6 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.awaitility.Awaitility.await; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -62,9 +61,6 @@ import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.model.ObjectMetadata; -import com.amazonaws.services.s3.model.PutObjectRequest; import com.fasterxml.jackson.databind.JsonNode; import org.apache.avro.Schema; import org.apache.avro.file.DataFileWriter; @@ -83,6 +79,9 @@ import org.testcontainers.containers.localstack.LocalStackContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; +import software.amazon.awssdk.core.sync.RequestBody; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.PutObjectRequest; @Testcontainers @SuppressWarnings("PMD.ExcessiveImports") @@ -111,7 +110,7 @@ final class IntegrationTest implements IntegrationBase { private AdminClient adminClient; private ConnectRunner connectRunner; - private static AmazonS3 s3Client; + private static S3Client s3Client; @BeforeAll static void setUpAll() throws IOException, InterruptedException { @@ -263,7 +262,7 @@ void parquetTest(final TestInfo testInfo) throws IOException { final Path path = ContentUtils.getTmpFilePath(name); try { - s3Client.putObject(TEST_BUCKET_NAME, fileName, Files.newInputStream(path), null); + s3Client.putObject(PutObjectRequest.builder().bucket(TEST_BUCKET_NAME).key(fileName).build(), path); } catch (final Exception e) { // NOPMD broad exception caught LOGGER.error("Error in reading file {}", e.getMessage(), e); } finally { @@ -341,9 +340,8 @@ private static byte[] generateNextAvroMessagesStartingFromId(final int messageId private static String writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) { final String objectKey = addPrefixOrDefault("") + topicName + "-" + partitionId + "-" + System.currentTimeMillis() + ".txt"; - final PutObjectRequest request = new PutObjectRequest(TEST_BUCKET_NAME, objectKey, - new ByteArrayInputStream(testDataBytes), new ObjectMetadata()); - s3Client.putObject(request); + final PutObjectRequest request = PutObjectRequest.builder().bucket(TEST_BUCKET_NAME).key(objectKey).build(); + s3Client.putObject(request, RequestBody.fromBytes(testDataBytes)); return OBJECT_KEY + SEPARATOR + objectKey; } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index aa331b4aa..320fa19cb 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -40,10 +40,10 @@ import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; import io.aiven.kafka.connect.s3.source.utils.Version; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.model.AmazonS3Exception; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; +import software.amazon.awssdk.core.exception.SdkException; +import software.amazon.awssdk.services.s3.S3Client; /** * S3SourceTask is a Kafka Connect SourceTask implementation that reads from source-s3 buckets and generates Kafka @@ -64,7 +64,7 @@ public class S3SourceTask extends SourceTask { private static final long ERROR_BACKOFF = 1000L; private S3SourceConfig s3SourceConfig; - private AmazonS3 s3Client; + private S3Client s3Client; private Iterator<S3SourceRecord> sourceRecordIterator; private Transformer transformer; @@ -122,8 +122,8 @@ public List<SourceRecord> poll() throws InterruptedException { extractSourceRecords(results); LOGGER.info("Number of records extracted and sent: {}", results.size()); return results; - } catch (AmazonS3Exception exception) { - if (exception.isRetryable()) { + } catch (SdkException exception) { + if (exception.retryable()) { LOGGER.warn("Retryable error encountered during polling. Waiting before retrying...", exception); pollLock.wait(ERROR_BACKOFF); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java index 346ec5825..13ff4d690 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java @@ -16,45 +16,51 @@ package io.aiven.kafka.connect.s3.source.config; +import java.net.URI; +import java.time.Duration; import java.util.Objects; +import java.util.Random; import io.aiven.kafka.connect.iam.AwsCredentialProviderFactory; -import com.amazonaws.PredefinedClientConfigurations; -import com.amazonaws.client.builder.AwsClientBuilder; -import com.amazonaws.retry.PredefinedBackoffStrategies; -import com.amazonaws.retry.PredefinedRetryPolicies; -import com.amazonaws.retry.RetryPolicy; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3ClientBuilder; +import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration; +import software.amazon.awssdk.core.retry.RetryMode; +import software.amazon.awssdk.retries.api.internal.backoff.ExponentialDelayWithJitter; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.S3Configuration; public class S3ClientFactory { private final AwsCredentialProviderFactory credentialFactory = new AwsCredentialProviderFactory(); - public AmazonS3 createAmazonS3Client(final S3SourceConfig config) { - final var awsEndpointConfig = newEndpointConfiguration(config); - final var clientConfig = PredefinedClientConfigurations.defaultConfig() - .withRetryPolicy(new RetryPolicy(PredefinedRetryPolicies.DEFAULT_RETRY_CONDITION, - new PredefinedBackoffStrategies.FullJitterBackoffStrategy( - Math.toIntExact(config.getS3RetryBackoffDelayMs()), - Math.toIntExact(config.getS3RetryBackoffMaxDelayMs())), - config.getS3RetryBackoffMaxRetries(), false)); - final var s3ClientBuilder = AmazonS3ClientBuilder.standard() - .withCredentials(credentialFactory.getProvider(config.getS3ConfigFragment())) - .withClientConfiguration(clientConfig); - if (Objects.isNull(awsEndpointConfig)) { - s3ClientBuilder.withRegion(config.getAwsS3Region().getName()); - } else { - s3ClientBuilder.withEndpointConfiguration(awsEndpointConfig).withPathStyleAccessEnabled(true); - } - return s3ClientBuilder.build(); - } + public S3Client createAmazonS3Client(final S3SourceConfig 
config) { + + final ExponentialDelayWithJitter backoffStrategy = new ExponentialDelayWithJitter(Random::new, + Duration.ofMillis(Math.toIntExact(config.getS3RetryBackoffDelayMs())), + Duration.ofMillis(Math.toIntExact(config.getS3RetryBackoffMaxDelayMs()))); - private AwsClientBuilder.EndpointConfiguration newEndpointConfiguration(final S3SourceConfig config) { + final ClientOverrideConfiguration clientOverrideConfiguration = ClientOverrideConfiguration.builder() + .retryStrategy(RetryMode.STANDARD) + .build(); if (Objects.isNull(config.getAwsS3EndPoint())) { - return null; + return S3Client.builder() + .overrideConfiguration(clientOverrideConfiguration) + .overrideConfiguration(o -> o.retryStrategy( + r -> r.backoffStrategy(backoffStrategy).maxAttempts(config.getS3RetryBackoffMaxRetries()))) + .region(config.getAwsS3Region()) + .credentialsProvider(credentialFactory.getAwsV2Provider(config.getS3ConfigFragment())) + .build(); + } else { + // TODO This is definitely used for testing but not sure if customers use it. + return S3Client.builder() + .overrideConfiguration(clientOverrideConfiguration) + .region(config.getAwsS3Region()) + .credentialsProvider(credentialFactory.getAwsV2Provider(config.getS3ConfigFragment())) + .endpointOverride(URI.create(config.getAwsS3EndPoint())) + .serviceConfiguration(S3Configuration.builder().pathStyleAccessEnabled(true).build()) + .build(); } - return new AwsClientBuilder.EndpointConfiguration(config.getAwsS3EndPoint(), config.getAwsS3Region().getName()); + } + } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 68b9b2f98..23dc69e9e 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -32,12 +32,10 @@ import io.aiven.kafka.connect.iam.AwsStsEndpointConfig; import io.aiven.kafka.connect.iam.AwsStsRole; -import com.amazonaws.auth.AWSCredentialsProvider; -import com.amazonaws.auth.BasicAWSCredentials; -import com.amazonaws.client.builder.AwsClientBuilder; -import com.amazonaws.regions.Region; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; +import software.amazon.awssdk.regions.Region; final public class S3SourceConfig extends SourceCommonConfig { @@ -87,12 +85,8 @@ public AwsStsEndpointConfig getStsEndpointConfig() { return s3ConfigFragment.getStsEndpointConfig(); } - public AwsClientBuilder.EndpointConfiguration getAwsEndpointConfiguration() { - return s3ConfigFragment.getAwsEndpointConfiguration(); - } - - public BasicAWSCredentials getAwsCredentials() { - return s3ConfigFragment.getAwsCredentials(); + public AwsBasicCredentials getAwsCredentials() { + return s3ConfigFragment.getAwsCredentialsV2(); } public String getAwsS3EndPoint() { @@ -100,7 +94,7 @@ public String getAwsS3EndPoint() { } public Region getAwsS3Region() { - return s3ConfigFragment.getAwsS3Region(); + return s3ConfigFragment.getAwsS3RegionV2(); } public String getAwsS3BucketName() { @@ -131,10 +125,6 @@ public int getS3RetryBackoffMaxRetries() { return s3ConfigFragment.getS3RetryBackoffMaxRetries(); } - public AWSCredentialsProvider getCustomCredentialsProvider() { - return s3ConfigFragment.getCustomCredentialsProvider(); - } - public S3ConfigFragment getS3ConfigFragment() { return s3ConfigFragment; } 
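As background for the hunks above (illustration only, not part of the patch): the v2 SDK replaces the mutable v1 client and request objects with builders and immutable requests. A minimal, self-contained sketch of that style follows; the endpoint URL, bucket name and credentials are placeholders for a local test setup such as LocalStack.

import java.net.URI;
import java.nio.charset.StandardCharsets;

import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
import software.amazon.awssdk.core.sync.RequestBody;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.S3Configuration;
import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;
import software.amazon.awssdk.services.s3.model.PutObjectRequest;
import software.amazon.awssdk.services.s3.model.S3Object;

public final class S3ClientV2Sketch {
    public static void main(final String[] args) {
        // Endpoint and credentials are placeholders for a local test setup such as LocalStack.
        try (S3Client s3 = S3Client.builder()
                .region(Region.US_EAST_1)
                .endpointOverride(URI.create("http://localhost:4566"))
                .credentialsProvider(
                        StaticCredentialsProvider.create(AwsBasicCredentials.create("test", "test")))
                // Path-style access mirrors what the factory enables when an endpoint override is set.
                .serviceConfiguration(S3Configuration.builder().pathStyleAccessEnabled(true).build())
                .build()) {

            s3.createBucket(create -> create.bucket("test-bucket"));

            // V2 requests are immutable and built through builders instead of mutable request objects.
            s3.putObject(PutObjectRequest.builder().bucket("test-bucket").key("topic-00000-1.txt").build(),
                    RequestBody.fromBytes("hello".getBytes(StandardCharsets.UTF_8)));

            // Listing returns S3Object entries whose accessors are key(), size() etc. rather than getters.
            s3.listObjectsV2(ListObjectsV2Request.builder().bucket("test-bucket").build())
                    .contents()
                    .stream()
                    .map(S3Object::key)
                    .forEach(System.out::println);
        }
    }
}

The factory in this patch builds an equivalent client from the connector configuration, adding the retry strategy and the credentials provider shown in the hunk above.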
diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java index 1bbc477ee..44e28dfa7 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java @@ -16,6 +16,7 @@ package io.aiven.kafka.connect.s3.source.utils; +import java.io.InputStream; import java.util.HashSet; import java.util.Iterator; import java.util.Objects; @@ -26,11 +27,14 @@ import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.model.ListObjectsV2Request; -import com.amazonaws.services.s3.model.S3Object; -import com.amazonaws.services.s3.model.S3ObjectSummary; +import org.apache.commons.io.function.IOSupplier; import org.codehaus.plexus.util.StringUtils; +import software.amazon.awssdk.core.ResponseBytes; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.GetObjectRequest; +import software.amazon.awssdk.services.s3.model.GetObjectResponse; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; +import software.amazon.awssdk.services.s3.model.S3Object; /** * Called AWSV2SourceClient as this source client implements the V2 version of the aws client library. Handles all calls @@ -40,10 +44,10 @@ public class AWSV2SourceClient { public static final int PAGE_SIZE_FACTOR = 2; private final S3SourceConfig s3SourceConfig; - private final AmazonS3 s3Client; + private final S3Client s3Client; private final String bucketName; - private Predicate<S3ObjectSummary> filterPredicate = summary -> summary.getSize() > 0; + private Predicate<S3Object> filterPredicate = s3Object -> s3Object.size() > 0; private final Set<String> failedObjectKeys; /** @@ -70,7 +74,7 @@ public AWSV2SourceClient(final S3SourceConfig s3SourceConfig, final Set<String> * @param failedObjectKeys * all objectKeys which have already been tried but have been unable to process. 
*/ - AWSV2SourceClient(final AmazonS3 s3Client, final S3SourceConfig s3SourceConfig, + AWSV2SourceClient(final S3Client s3Client, final S3SourceConfig s3SourceConfig, final Set<String> failedObjectKeys) { this.s3SourceConfig = s3SourceConfig; this.s3Client = s3Client; @@ -79,46 +83,52 @@ public AWSV2SourceClient(final S3SourceConfig s3SourceConfig, final Set<String> } public Iterator<String> getListOfObjectKeys(final String startToken) { - final ListObjectsV2Request request = new ListObjectsV2Request().withBucketName(bucketName) - .withMaxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR); - - if (StringUtils.isNotBlank(startToken)) { - request.withStartAfter(startToken); - } - // Prefix is optional so only use if supplied - if (StringUtils.isNotBlank(s3SourceConfig.getAwsS3Prefix())) { - request.withPrefix(s3SourceConfig.getAwsS3Prefix()); - } + final ListObjectsV2Request request = ListObjectsV2Request.builder() + .bucket(bucketName) + .maxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR) + .prefix(optionalKey(s3SourceConfig.getAwsS3Prefix())) + .startAfter(optionalKey(startToken)) + .build(); final Stream<String> s3ObjectKeyStream = Stream .iterate(s3Client.listObjectsV2(request), Objects::nonNull, response -> { // This is called every time next() is called on the iterator. if (response.isTruncated()) { - return s3Client.listObjectsV2( - new ListObjectsV2Request().withContinuationToken(response.getNextContinuationToken())); + return s3Client.listObjectsV2(ListObjectsV2Request.builder() + .maxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR) + .continuationToken(response.nextContinuationToken()) + .build()); } else { return null; } }) - .flatMap(response -> response.getObjectSummaries() + .flatMap(response -> response.contents() .stream() .filter(filterPredicate) - .filter(objectSummary -> assignObjectToTask(objectSummary.getKey())) - .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey()))) - .map(S3ObjectSummary::getKey); + .filter(objectSummary -> assignObjectToTask(objectSummary.key())) + .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.key()))) + .map(S3Object::key); return s3ObjectKeyStream.iterator(); } + private String optionalKey(final String key) { + if (StringUtils.isNotBlank(key)) { + return key; + } + return null; + } - public S3Object getObject(final String objectKey) { - return s3Client.getObject(bucketName, objectKey); + public IOSupplier<InputStream> getObject(final String objectKey) { + final GetObjectRequest getObjectRequest = GetObjectRequest.builder().bucket(bucketName).key(objectKey).build(); + final ResponseBytes<GetObjectResponse> s3ObjectResponse = s3Client.getObjectAsBytes(getObjectRequest); + return s3ObjectResponse::asInputStream; } public void addFailedObjectKeys(final String objectKey) { this.failedObjectKeys.add(objectKey); } - public void setFilterPredicate(final Predicate<S3ObjectSummary> predicate) { + public void setFilterPredicate(final Predicate<S3Object> predicate) { filterPredicate = predicate; } @@ -130,7 +140,7 @@ private boolean assignObjectToTask(final String objectKey) { } public void shutdown() { - s3Client.shutdown(); + s3Client.close(); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index ac5a3061a..26f3c03cf 100644 --- 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -17,6 +17,7 @@ package io.aiven.kafka.connect.s3.source.utils; import java.io.IOException; +import java.io.InputStream; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; @@ -32,10 +33,10 @@ import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import com.amazonaws.AmazonClientException; -import com.amazonaws.services.s3.model.S3Object; +import org.apache.commons.io.function.IOSupplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import software.amazon.awssdk.core.exception.SdkException; /** * Iterator that processes S3 files and creates Kafka source records. Supports different output formats (Avro, JSON, @@ -91,7 +92,7 @@ private void nextS3Object() { recordIterator = createIteratorForCurrentFile(); } } catch (IOException e) { - throw new AmazonClientException(e); + throw SdkException.create(e.getMessage(), e.getCause()); } } @@ -103,20 +104,20 @@ private Iterator<S3SourceRecord> createIteratorForCurrentFile() throws IOExcepti if (fileMatcher.find()) { // TODO move this from the SourceRecordIterator so that we can decouple it from S3 and make it API agnostic - try (S3Object s3Object = sourceClient.getObject(currentObjectKey);) { - topicName = fileMatcher.group(PATTERN_TOPIC_KEY); - defaultPartitionId = Integer.parseInt(fileMatcher.group(PATTERN_PARTITION_KEY)); + final IOSupplier<InputStream> s3Object = sourceClient.getObject(currentObjectKey); + topicName = fileMatcher.group(PATTERN_TOPIC_KEY); + defaultPartitionId = Integer.parseInt(fileMatcher.group(PATTERN_PARTITION_KEY)); - final long defaultStartOffsetId = 1L; + final long defaultStartOffsetId = 1L; - final String finalTopic = topicName; - final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(topicName, defaultPartitionId, - bucketName); + final String finalTopic = topicName; + final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(topicName, defaultPartitionId, + bucketName); + + return getObjectIterator(s3Object, finalTopic, defaultPartitionId, defaultStartOffsetId, transformer, + partitionMap); - return getObjectIterator(s3Object, finalTopic, defaultPartitionId, defaultStartOffsetId, transformer, - partitionMap); - } } else { LOGGER.error("File naming doesn't match to any topic. 
{}", currentObjectKey); return Collections.emptyIterator(); @@ -124,7 +125,7 @@ private Iterator<S3SourceRecord> createIteratorForCurrentFile() throws IOExcepti } @SuppressWarnings("PMD.CognitiveComplexity") - private Iterator<S3SourceRecord> getObjectIterator(final S3Object s3Object, final String topic, + private Iterator<S3SourceRecord> getObjectIterator(final IOSupplier<InputStream> s3Object, final String topic, final int topicPartition, final long startOffset, final Transformer transformer, final Map<String, Object> partitionMap) { return new Iterator<>() { @@ -142,8 +143,9 @@ private List<S3SourceRecord> readNext() { return sourceRecords; } - try (Stream<Object> recordStream = transformer.getRecords(s3Object::getObjectContent, topic, - topicPartition, s3SourceConfig, numberOfRecsAlreadyProcessed)) { + try (Stream<Object> recordStream = transformer.getRecords(s3Object, topic, topicPartition, + s3SourceConfig, numberOfRecsAlreadyProcessed)) { + final Iterator<Object> recordIterator = recordStream.iterator(); while (recordIterator.hasNext()) { final Object record = recordIterator.next(); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index 590ad23bb..13ac66844 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -24,6 +24,8 @@ import static org.mockito.Mockito.when; import java.lang.reflect.Field; +import java.net.URI; +import java.net.URISyntaxException; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -39,15 +41,12 @@ import io.aiven.kafka.connect.common.source.input.InputFormat; import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.config.s3.S3ConfigFragment; +import io.aiven.kafka.connect.iam.AwsCredentialProviderFactory; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; -import com.amazonaws.auth.AWSStaticCredentialsProvider; -import com.amazonaws.auth.BasicAWSCredentials; -import com.amazonaws.client.builder.AwsClientBuilder; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3ClientBuilder; import io.findify.s3mock.S3Mock; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; @@ -57,6 +56,10 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration; +import software.amazon.awssdk.core.retry.RetryMode; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.S3Configuration; @ExtendWith(MockitoExtension.class) final class S3SourceTaskTest { @@ -66,9 +69,10 @@ final class S3SourceTaskTest { private static BucketAccessor testBucketAccessor; private static final String TEST_BUCKET = "test-bucket"; - + // TODO S3Mock has not been maintained in 4 years + // Adobe have an alternative we can move to. 
private static S3Mock s3Api; - private static AmazonS3 s3Client; + private static S3Client s3Client; private static Map<String, String> commonProperties; @@ -79,7 +83,7 @@ final class S3SourceTaskTest { private OffsetStorageReader mockedOffsetStorageReader; @BeforeAll - public static void setUpClass() { + public static void setUpClass() throws URISyntaxException { final int s3Port = RANDOM.nextInt(10_000) + 10_000; s3Api = new S3Mock.Builder().withPort(s3Port).withInMemoryBackend().build(); @@ -90,17 +94,19 @@ public static void setUpClass() { S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET, S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG, "http://localhost:" + s3Port, S3ConfigFragment.AWS_S3_REGION_CONFIG, "us-west-2"); - final AmazonS3ClientBuilder builder = AmazonS3ClientBuilder.standard(); - final BasicAWSCredentials awsCreds = new BasicAWSCredentials( - commonProperties.get(S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG), - commonProperties.get(S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG)); - builder.withCredentials(new AWSStaticCredentialsProvider(awsCreds)); - builder.withEndpointConfiguration(new AwsClientBuilder.EndpointConfiguration( - commonProperties.get(S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG), - commonProperties.get(S3ConfigFragment.AWS_S3_REGION_CONFIG))); - builder.withPathStyleAccessEnabled(true); + final AwsCredentialProviderFactory credentialFactory = new AwsCredentialProviderFactory(); + final S3SourceConfig config = new S3SourceConfig(commonProperties); + final ClientOverrideConfiguration clientOverrideConfiguration = ClientOverrideConfiguration.builder() + .retryStrategy(RetryMode.STANDARD) + .build(); - s3Client = builder.build(); + s3Client = S3Client.builder() + .overrideConfiguration(clientOverrideConfiguration) + .region(config.getAwsS3Region()) + .endpointOverride(URI.create(config.getAwsS3EndPoint())) + .serviceConfiguration(S3Configuration.builder().pathStyleAccessEnabled(true).build()) + .credentialsProvider(credentialFactory.getAwsV2Provider(config.getS3ConfigFragment())) + .build(); testBucketAccessor = new BucketAccessor(s3Client, TEST_BUCKET); testBucketAccessor.createBucket(); @@ -114,14 +120,14 @@ public static void tearDownClass() { @BeforeEach public void setUp() { properties = new HashMap<>(commonProperties); - s3Client.createBucket(TEST_BUCKET); + s3Client.createBucket(create -> create.bucket(TEST_BUCKET).build()); mockedSourceTaskContext = mock(SourceTaskContext.class); mockedOffsetStorageReader = mock(OffsetStorageReader.class); } @AfterEach public void tearDown() { - s3Client.deleteBucket(TEST_BUCKET); + s3Client.deleteBucket(delete -> delete.bucket(TEST_BUCKET).build()); } @Test diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java index edbe8dc98..10939c511 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java @@ -27,9 +27,8 @@ import io.aiven.kafka.connect.common.source.input.InputFormat; import io.aiven.kafka.connect.config.s3.S3ConfigFragment; -import com.amazonaws.regions.RegionUtils; -import com.amazonaws.regions.Regions; import org.junit.jupiter.api.Test; +import software.amazon.awssdk.regions.Region; final class S3SourceConfigTest { @Test @@ -42,7 +41,7 @@ void correctFullConfig() { 
props.put(S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG, "the-bucket"); props.put(S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG, "AWS_S3_ENDPOINT"); props.put(S3ConfigFragment.AWS_S3_PREFIX_CONFIG, "AWS_S3_PREFIX"); - props.put(S3ConfigFragment.AWS_S3_REGION_CONFIG, Regions.US_EAST_1.getName()); + props.put(S3ConfigFragment.AWS_S3_REGION_CONFIG, Region.US_EAST_1.id()); // record, topic specific props props.put(INPUT_FORMAT_KEY, InputFormat.AVRO.getValue()); @@ -53,11 +52,11 @@ void correctFullConfig() { final var conf = new S3SourceConfig(props); final var awsCredentials = conf.getAwsCredentials(); - assertThat(awsCredentials.getAWSAccessKeyId()).isEqualTo("AWS_ACCESS_KEY_ID"); - assertThat(awsCredentials.getAWSSecretKey()).isEqualTo("AWS_SECRET_ACCESS_KEY"); + assertThat(awsCredentials.accessKeyId()).isEqualTo("AWS_ACCESS_KEY_ID"); + assertThat(awsCredentials.secretAccessKey()).isEqualTo("AWS_SECRET_ACCESS_KEY"); assertThat(conf.getAwsS3BucketName()).isEqualTo("the-bucket"); assertThat(conf.getAwsS3EndPoint()).isEqualTo("AWS_S3_ENDPOINT"); - assertThat(conf.getAwsS3Region()).isEqualTo(RegionUtils.getRegion("us-east-1")); + assertThat(conf.getAwsS3Region()).isEqualTo(Region.of("us-east-1")); assertThat(conf.getInputFormat()).isEqualTo(InputFormat.AVRO); assertThat(conf.getTargetTopics()).isEqualTo("testtopic"); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java index 212088560..8b34f73d0 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java @@ -32,58 +32,71 @@ import io.aiven.kafka.connect.common.config.CompressionType; -import com.amazonaws.AmazonClientException; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.model.DeleteObjectsRequest; -import com.amazonaws.services.s3.model.MultiObjectDeleteException; -import com.amazonaws.services.s3.model.S3ObjectSummary; import com.github.luben.zstd.ZstdInputStream; import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xerial.snappy.SnappyInputStream; +import software.amazon.awssdk.core.exception.SdkException; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.Delete; +import software.amazon.awssdk.services.s3.model.DeleteBucketRequest; +import software.amazon.awssdk.services.s3.model.DeleteObjectsRequest; +import software.amazon.awssdk.services.s3.model.HeadObjectRequest; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; +import software.amazon.awssdk.services.s3.model.NoSuchKeyException; +import software.amazon.awssdk.services.s3.model.ObjectIdentifier; +import software.amazon.awssdk.services.s3.model.S3Exception; +import software.amazon.awssdk.services.s3.model.S3Object; public class BucketAccessor { private final String bucketName; - private final AmazonS3 s3Client; + private final S3Client s3Client; private static final Logger LOGGER = LoggerFactory.getLogger(BucketAccessor.class); @SuppressFBWarnings(value = "EI_EXPOSE_REP2", justification = "stores mutable s3Client object") - public BucketAccessor(final AmazonS3 s3Client, final String bucketName) { + public BucketAccessor(final S3Client s3Client, final String bucketName) { this.bucketName = bucketName; this.s3Client = 
s3Client; } public final void createBucket() { - s3Client.createBucket(bucketName); + s3Client.createBucket(builder -> builder.bucket(bucketName).build()); } public final void removeBucket() { - final var chunk = s3Client.listObjects(bucketName) - .getObjectSummaries() + final var deleteIds = s3Client.listObjectsV2(ListObjectsV2Request.builder().bucket(bucketName).build()) + .contents() .stream() - .map(S3ObjectSummary::getKey) - .toArray(String[]::new); + .map(S3Object::key) + .map(key -> ObjectIdentifier.builder().key(key).build()) + .collect(Collectors.toList()); - final var deleteObjectsRequest = new DeleteObjectsRequest(bucketName).withKeys(chunk); try { - s3Client.deleteObjects(deleteObjectsRequest); - } catch (final MultiObjectDeleteException e) { - for (final var err : e.getErrors()) { - LOGGER.warn(String.format("Couldn't delete object: %s. Reason: [%s] %s", err.getKey(), err.getCode(), - err.getMessage())); - } - } catch (final AmazonClientException e) { - LOGGER.error("Couldn't delete objects: {}", - Arrays.stream(chunk).reduce(" ", String::concat) + e.getMessage()); + s3Client.deleteObjects(DeleteObjectsRequest.builder() + .bucket(bucketName) + .delete(Delete.builder().objects(deleteIds).build()) + .build()); + } catch (final S3Exception e) { + LOGGER.warn( + String.format("Couldn't delete objects. Reason: [%s] %s", e.awsErrorDetails().errorMessage(), e)); + } catch (final SdkException e) { + + LOGGER.error("Couldn't delete objects: {}, Exception{} ", deleteIds, e.getMessage()); } - s3Client.deleteBucket(bucketName); + s3Client.deleteBucket(DeleteBucketRequest.builder().bucket(bucketName).build()); } + // TODO NOT Currently used public final Boolean doesObjectExist(final String objectName) { - return s3Client.doesObjectExist(bucketName, objectName); + try { + s3Client.headObject(HeadObjectRequest.builder().bucket(bucketName).key(objectName).build()); + return true; + } catch (NoSuchKeyException e) { + return false; + } } public final List<List<String>> readAndDecodeLines(final String blobName, final String compression, @@ -104,7 +117,8 @@ private List<List<String>> readAndDecodeLines0(final String blobName, final Stri public final byte[] readBytes(final String blobName, final String compression) throws IOException { Objects.requireNonNull(blobName, "blobName cannot be null"); - final byte[] blobBytes = s3Client.getObject(bucketName, blobName).getObjectContent().readAllBytes(); + final byte[] blobBytes = s3Client.getObjectAsBytes(builder -> builder.key(blobName).bucket(bucketName).build()) + .asByteArray(); try (ByteArrayInputStream bais = new ByteArrayInputStream(blobBytes); InputStream decompressedStream = getDecompressedStream(bais, compression); ByteArrayOutputStream decompressedBytes = new ByteArrayOutputStream()) { @@ -135,10 +149,11 @@ public final List<String> readLines(final String blobName, final String compress } public final List<String> listObjects() { - return s3Client.listObjects(bucketName) - .getObjectSummaries() + + return s3Client.listObjectsV2(ListObjectsV2Request.builder().bucket(bucketName).build()) + .contents() .stream() - .map(S3ObjectSummary::getKey) + .map(S3Object::key) .collect(Collectors.toList()); } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java deleted file mode 100644 index 4d33e46c5..000000000 --- 
a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/S3OutputStream.java +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright 2020 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.s3.source.testutils; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; - -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.model.AbortMultipartUploadRequest; -import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest; -import com.amazonaws.services.s3.model.InitiateMultipartUploadRequest; -import com.amazonaws.services.s3.model.ObjectMetadata; -import com.amazonaws.services.s3.model.PartETag; -import com.amazonaws.services.s3.model.UploadPartRequest; -import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class S3OutputStream extends OutputStream { - - private final Logger logger = LoggerFactory.getLogger(S3OutputStream.class); - - public static final int DEFAULT_PART_SIZE = 5 * 1024 * 1024; // 5 MB - - private final AmazonS3 client; - - private final ByteBuffer byteBuffer; - - private final String bucketName; - - private final String key; - - private MultipartUpload multipartUpload; - - private final int partSize; - - private final String serverSideEncryptionAlgorithm; - - private boolean closed; - - @SuppressFBWarnings(value = "EI_EXPOSE_REP2", justification = "AmazonS3 client is mutable") - public S3OutputStream(final String bucketName, final String key, final int partSize, final AmazonS3 client) { - this(bucketName, key, partSize, client, null); - } - - @SuppressFBWarnings(value = "EI_EXPOSE_REP2", justification = "AmazonS3 client is mutable") - public S3OutputStream(final String bucketName, final String key, final int partSize, final AmazonS3 client, - final String serverSideEncryptionAlgorithm) { - super(); - this.bucketName = bucketName; - this.key = key; - this.client = client; - this.partSize = partSize; - this.byteBuffer = ByteBuffer.allocate(partSize); - this.serverSideEncryptionAlgorithm = serverSideEncryptionAlgorithm; - } - - @Override - public void write(final int singleByte) throws IOException { - write(new byte[] { (byte) singleByte }, 0, 1); - } - - @Override - public void write(final byte[] bytes, final int off, final int len) throws IOException { - if (Objects.isNull(bytes) || bytes.length == 0) { - return; - } - if (Objects.isNull(multipartUpload)) { - multipartUpload = newMultipartUpload(); - } - final var source = ByteBuffer.wrap(bytes, off, len); - while (source.hasRemaining()) { - final var transferred = Math.min(byteBuffer.remaining(), source.remaining()); - final var offset = source.arrayOffset() + source.position(); - byteBuffer.put(source.array(), offset, transferred); - source.position(source.position() + 
transferred); - if (!byteBuffer.hasRemaining()) { - flushBuffer(0, partSize, partSize); - } - } - } - - private MultipartUpload newMultipartUpload() throws IOException { - logger.debug("Create new multipart upload request"); - final var initialRequest = new InitiateMultipartUploadRequest(bucketName, key); - initialRequest.setObjectMetadata(this.buildObjectMetadata()); - final var initiateResult = client.initiateMultipartUpload(initialRequest); - logger.debug("Upload ID: {}", initiateResult.getUploadId()); - return new MultipartUpload(initiateResult.getUploadId()); - } - - private ObjectMetadata buildObjectMetadata() { - final ObjectMetadata metadata = new ObjectMetadata(); - - if (this.serverSideEncryptionAlgorithm != null) { - metadata.setSSEAlgorithm(this.serverSideEncryptionAlgorithm); - } - - return metadata; - } - - @Override - public void close() throws IOException { - if (closed) { - return; - } - if (byteBuffer.position() > 0 && Objects.nonNull(multipartUpload)) { - flushBuffer(byteBuffer.arrayOffset(), byteBuffer.position(), byteBuffer.position()); - } - if (Objects.nonNull(multipartUpload)) { - multipartUpload.complete(); - multipartUpload = null; // NOPMD NullAssignment - } - closed = true; - super.close(); - } - - private void flushBuffer(final int offset, final int length, final int partSize) throws IOException { - try { - multipartUpload.uploadPart(new ByteArrayInputStream(byteBuffer.array(), offset, length), partSize); - byteBuffer.clear(); - } catch (final Exception e) { // NOPMD AvoidCatchingGenericException - multipartUpload.abort(); - multipartUpload = null; // NOPMD NullAssignment - throw new IOException(e); - } - } - - private class MultipartUpload { - - private final String uploadId; - - private final List<PartETag> partETags = new ArrayList<>(); - - public MultipartUpload(final String uploadId) { - this.uploadId = uploadId; - } - - public void uploadPart(final InputStream inputStream, final int partSize) throws IOException { - final var partNumber = partETags.size() + 1; - final var uploadPartRequest = new UploadPartRequest().withBucketName(bucketName) - .withKey(key) - .withUploadId(uploadId) - .withPartSize(partSize) - .withPartNumber(partNumber) - .withInputStream(inputStream); - final var uploadResult = client.uploadPart(uploadPartRequest); - partETags.add(uploadResult.getPartETag()); - } - - public void complete() { - client.completeMultipartUpload(new CompleteMultipartUploadRequest(bucketName, key, uploadId, partETags)); - } - - public void abort() { - client.abortMultipartUpload(new AbortMultipartUploadRequest(bucketName, key, uploadId)); - } - - } - -} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java index a8174a15c..beed0681c 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java @@ -34,19 +34,19 @@ import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.model.ListObjectsV2Request; -import com.amazonaws.services.s3.model.ListObjectsV2Result; -import com.amazonaws.services.s3.model.S3ObjectSummary; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.CsvSource; import 
org.mockito.ArgumentCaptor; import org.mockito.Captor; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Response; +import software.amazon.awssdk.services.s3.model.S3Object; class AWSV2SourceClientTest { - private AmazonS3 s3Client; + private S3Client s3Client; private AWSV2SourceClient awsv2SourceClient; @@ -66,8 +66,8 @@ private static Map<String, String> getConfigMap(final int maxTasks, final int ta @CsvSource({ "3, 1" }) void testFetchObjectSummariesWithNoObjects(final int maxTasks, final int taskId) { initializeWithTaskConfigs(maxTasks, taskId); - final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result(Collections.emptyList(), null); - when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); + final ListObjectsV2Response listObjectsV2Response = createListObjectsV2Response(Collections.emptyList(), null); + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Response); final Iterator<String> summaries = awsv2SourceClient.getListOfObjectKeys(null); assertThat(summaries).isExhausted(); @@ -107,8 +107,8 @@ void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdUnassigned(final @CsvSource({ "4, 3", "4, 0" }) void testFetchObjectSummariesWithZeroByteObject(final int maxTasks, final int taskId) { initializeWithTaskConfigs(maxTasks, taskId); - final ListObjectsV2Result listObjectsV2Result = getListObjectsV2Result(); - when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); + final ListObjectsV2Response listObjectsV2Response = getListObjectsV2Response(); + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Response); final Iterator<String> summaries = awsv2SourceClient.getListOfObjectKeys(null); @@ -121,13 +121,13 @@ void testFetchObjectSummariesWithZeroByteObject(final int maxTasks, final int ta @Test void testFetchObjectSummariesWithPagination() throws IOException { initializeWithTaskConfigs(4, 3); - final S3ObjectSummary object1 = createObjectSummary(1, "key1"); - final S3ObjectSummary object2 = createObjectSummary(2, "key2"); - final List<S3ObjectSummary> firstBatch = List.of(object1); - final List<S3ObjectSummary> secondBatch = List.of(object2); + final S3Object object1 = createObjectSummary(1, "key1"); + final S3Object object2 = createObjectSummary(2, "key2"); + final List<S3Object> firstBatch = List.of(object1); + final List<S3Object> secondBatch = List.of(object2); - final ListObjectsV2Result firstResult = createListObjectsV2Result(firstBatch, "nextToken"); - final ListObjectsV2Result secondResult = createListObjectsV2Result(secondBatch, null); + final ListObjectsV2Response firstResult = createListObjectsV2Response(firstBatch, "nextToken"); + final ListObjectsV2Response secondResult = createListObjectsV2Response(secondBatch, null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstResult).thenReturn(secondResult); @@ -142,14 +142,14 @@ void testFetchObjectWithPrefix() { final Map<String, String> configMap = getConfigMap(1, 0); configMap.put(AWS_S3_PREFIX_CONFIG, "test/"); final S3SourceConfig s3SourceConfig = new S3SourceConfig(configMap); - s3Client = mock(AmazonS3.class); + s3Client = mock(S3Client.class); awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig, Collections.emptySet()); requestCaptor = 
ArgumentCaptor.forClass(ListObjectsV2Request.class); - final S3ObjectSummary object1 = createObjectSummary(1, "key1"); - final S3ObjectSummary object2 = createObjectSummary(1, "key2"); + final S3Object object1 = createObjectSummary(1, "key1"); + final S3Object object2 = createObjectSummary(1, "key2"); - final ListObjectsV2Result firstResult = createListObjectsV2Result(List.of(object1), "nextToken"); - final ListObjectsV2Result secondResult = createListObjectsV2Result(List.of(object2), null); + final ListObjectsV2Response firstResult = createListObjectsV2Response(List.of(object1), "nextToken"); + final ListObjectsV2Response secondResult = createListObjectsV2Response(List.of(object2), null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstResult).thenReturn(secondResult); @@ -163,10 +163,10 @@ void testFetchObjectWithPrefix() { final List<ListObjectsV2Request> allRequests = requestCaptor.getAllValues(); assertThat(summaries).isExhausted(); - assertThat(allRequests.get(0).getPrefix()).isEqualTo(s3SourceConfig.getAwsS3Prefix()); + assertThat(allRequests.get(0).prefix()).isEqualTo(s3SourceConfig.getAwsS3Prefix()); // Not required with continuation token - assertThat(allRequests.get(1).getPrefix()).isNull(); - assertThat(allRequests.get(1).getContinuationToken()).isEqualTo("nextToken"); + assertThat(allRequests.get(1).prefix()).isNull(); + assertThat(allRequests.get(1).continuationToken()).isEqualTo("nextToken"); } @@ -175,14 +175,14 @@ void testFetchObjectWithInitialStartAfter() { final Map<String, String> configMap = getConfigMap(1, 0); final String startAfter = "file-option-1-12000.txt"; final S3SourceConfig s3SourceConfig = new S3SourceConfig(configMap); - s3Client = mock(AmazonS3.class); + s3Client = mock(S3Client.class); awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig, Collections.emptySet()); requestCaptor = ArgumentCaptor.forClass(ListObjectsV2Request.class); - final S3ObjectSummary object1 = createObjectSummary(1, "key1"); - final S3ObjectSummary object2 = createObjectSummary(1, "key2"); + final S3Object object1 = createObjectSummary(1, "key1"); + final S3Object object2 = createObjectSummary(1, "key2"); - final ListObjectsV2Result firstResult = createListObjectsV2Result(List.of(object1), "nextToken"); - final ListObjectsV2Result secondResult = createListObjectsV2Result(List.of(object2), null); + final ListObjectsV2Response firstResult = createListObjectsV2Response(List.of(object1), "nextToken"); + final ListObjectsV2Response secondResult = createListObjectsV2Response(List.of(object2), null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstResult).thenReturn(secondResult); @@ -196,32 +196,31 @@ void testFetchObjectWithInitialStartAfter() { final List<ListObjectsV2Request> allRequests = requestCaptor.getAllValues(); assertThat(summaries).isExhausted(); - assertThat(allRequests.get(0).getStartAfter()).isEqualTo(startAfter); + assertThat(allRequests.get(0).startAfter()).isEqualTo(startAfter); // Not required with continuation token - assertThat(allRequests.get(1).getStartAfter()).isNull(); - assertThat(allRequests.get(1).getContinuationToken()).isEqualTo("nextToken"); + assertThat(allRequests.get(1).startAfter()).isNull(); + assertThat(allRequests.get(1).continuationToken()).isEqualTo("nextToken"); } - private ListObjectsV2Result createListObjectsV2Result(final List<S3ObjectSummary> summaries, - final String nextToken) { - final ListObjectsV2Result result = mock(ListObjectsV2Result.class); - 
when(result.getObjectSummaries()).thenReturn(summaries); - when(result.getNextContinuationToken()).thenReturn(nextToken); + private ListObjectsV2Response createListObjectsV2Response(final List<S3Object> summaries, final String nextToken) { + final ListObjectsV2Response result = mock(ListObjectsV2Response.class); + when(result.contents()).thenReturn(summaries); + when(result.nextContinuationToken()).thenReturn(nextToken); when(result.isTruncated()).thenReturn(nextToken != null); return result; } - private S3ObjectSummary createObjectSummary(final long sizeOfObject, final String objectKey) { - final S3ObjectSummary summary = mock(S3ObjectSummary.class); - when(summary.getSize()).thenReturn(sizeOfObject); - when(summary.getKey()).thenReturn(objectKey); + private S3Object createObjectSummary(final long sizeOfObject, final String objectKey) { + final S3Object summary = mock(S3Object.class); + when(summary.size()).thenReturn(sizeOfObject); + when(summary.key()).thenReturn(objectKey); return summary; } private Iterator<String> getS3ObjectKeysIterator(final String objectKey) { - final S3ObjectSummary objectSummary = createObjectSummary(1, objectKey); - final ListObjectsV2Result listObjectsV2Result = createListObjectsV2Result( + final S3Object objectSummary = createObjectSummary(1, objectKey); + final ListObjectsV2Response listObjectsV2Result = createListObjectsV2Response( Collections.singletonList(objectSummary), null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); @@ -231,15 +230,15 @@ private Iterator<String> getS3ObjectKeysIterator(final String objectKey) { public void initializeWithTaskConfigs(final int maxTasks, final int taskId) { final Map<String, String> configMap = getConfigMap(maxTasks, taskId); final S3SourceConfig s3SourceConfig = new S3SourceConfig(configMap); - s3Client = mock(AmazonS3.class); + s3Client = mock(S3Client.class); awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig, Collections.emptySet()); } - private ListObjectsV2Result getListObjectsV2Result() { - final S3ObjectSummary zeroByteObject = createObjectSummary(0, "key1"); - final S3ObjectSummary nonZeroByteObject1 = createObjectSummary(1, "key2"); - final S3ObjectSummary nonZeroByteObject2 = createObjectSummary(1, "key3"); - return createListObjectsV2Result(List.of(zeroByteObject, nonZeroByteObject1, nonZeroByteObject2), null); + private ListObjectsV2Response getListObjectsV2Response() { + final S3Object zeroByteObject = createObjectSummary(0, "key1"); + final S3Object nonZeroByteObject1 = createObjectSummary(1, "key2"); + final S3Object nonZeroByteObject2 = createObjectSummary(1, "key3"); + return createListObjectsV2Response(List.of(zeroByteObject, nonZeroByteObject1, nonZeroByteObject2), null); } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index 61d8170f7..b701ea85d 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -30,6 +30,7 @@ import static org.mockito.Mockito.when; import java.io.ByteArrayInputStream; +import java.io.InputStream; import java.util.Collections; import java.util.stream.Stream; @@ -38,8 +39,6 @@ import io.aiven.kafka.connect.common.source.input.Transformer; import 
io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import com.amazonaws.services.s3.model.S3Object; -import com.amazonaws.services.s3.model.S3ObjectInputStream; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -64,12 +63,9 @@ void testIteratorProcessesS3Objects() throws Exception { final String key = "topic-00001-abc123.txt"; - // Mock S3Object and InputStream - try (S3Object mockS3Object = mock(S3Object.class); - S3ObjectInputStream mockInputStream = new S3ObjectInputStream(new ByteArrayInputStream(new byte[] {}), - null);) { - when(mockSourceApiClient.getObject(anyString())).thenReturn(mockS3Object); - when(mockS3Object.getObjectContent()).thenReturn(mockInputStream); + // Mock InputStream + try (InputStream mockInputStream = new ByteArrayInputStream(new byte[] {})) { + when(mockSourceApiClient.getObject(anyString())).thenReturn(() -> mockInputStream); when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) .thenReturn(Stream.of(new Object())); @@ -98,12 +94,9 @@ void testIteratorProcessesS3ObjectsForByteArrayTransformer() throws Exception { final String key = "topic-00001-abc123.txt"; - // Mock S3Object and InputStream - try (S3Object mockS3Object = mock(S3Object.class); - S3ObjectInputStream mockInputStream = new S3ObjectInputStream(new ByteArrayInputStream(new byte[] {}), - null);) { - when(mockSourceApiClient.getObject(anyString())).thenReturn(mockS3Object); - when(mockS3Object.getObjectContent()).thenReturn(mockInputStream); + // Mock InputStream + try (InputStream mockInputStream = new ByteArrayInputStream(new byte[] {})) { + when(mockSourceApiClient.getObject(anyString())).thenReturn(() -> mockInputStream); // With ByteArrayTransformer mockTransformer = mock(ByteArrayTransformer.class); From f6d30874ebe9eea58f80c2a094760c9beb689552 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aindri=C3=BA=20Lavelle?= <121855584+aindriu-aiven@users.noreply.github.com> Date: Fri, 3 Jan 2025 12:19:24 +0000 Subject: [PATCH 85/90] Improve S3 Source Integration tests (#382) Increase performance of the S3 integration tests by allowing the polling time to depend on the test and by taking into account whether there are more records left to be retrieved before waiting to collect the next batch. This lets messages be consumed faster when additional messages are already waiting, and it gives individual tests finer control over how long to wait before timing out (previously every test waited the maximum 5 minutes even when only waiting for 5 messages from Kafka). This should save 2-3 minutes per integration test run and between 12-15 minutes on a failing run.
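Illustration only, not part of the patch: the consume-until-drained idea can be sketched as below, assuming a pre-configured KafkaConsumer with AssertJ and Awaitility on the classpath. This sketch drains until a poll comes back empty, whereas the actual change keeps draining while each poll returns more than a small threshold of records.

import static org.assertj.core.api.Assertions.assertThat;
import static org.awaitility.Awaitility.await;

import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

final class DrainingConsumeSketch {
    // Poll until the expected count arrives, draining any backlog inside a single Awaitility
    // iteration instead of sleeping between every poll; the deadline is supplied per test.
    static <K, V> List<V> consume(final KafkaConsumer<K, V> consumer, final String topic,
            final int expectedCount, final Duration maxWait) {
        consumer.subscribe(Collections.singletonList(topic));
        final List<V> values = new ArrayList<>();
        await().atMost(maxWait).pollInterval(Duration.ofSeconds(1)).untilAsserted(() -> {
            ConsumerRecords<K, V> batch;
            do {
                batch = consumer.poll(Duration.ofMillis(500L));
                for (final ConsumerRecord<K, V> record : batch) {
                    values.add(record.value());
                }
            } while (!batch.isEmpty()); // keep reading while records are still waiting
            assertThat(values).hasSize(expectedCount);
        });
        return values;
    }
}

Passing the deadline in as a parameter is what lets the parquet test wait only a minute while the 25k-record Avro test waits up to three.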
--------- Signed-off-by: Aindriu Lavelle <aindriu.lavelle@aiven.io> --- .../connect/s3/source/IntegrationBase.java | 34 +++++++++++++------ .../connect/s3/source/IntegrationTest.java | 8 +++-- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index 6b505b996..442993bfc 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -135,41 +135,53 @@ static List<String> consumeByteMessages(final String topic, final int expectedMe String bootstrapServers) { final Properties consumerProperties = getConsumerProperties(bootstrapServers, ByteArrayDeserializer.class, ByteArrayDeserializer.class); - final List<byte[]> objects = consumeMessages(topic, expectedMessageCount, consumerProperties); + final List<byte[]> objects = consumeMessages(topic, expectedMessageCount, Duration.ofSeconds(60), + consumerProperties); return objects.stream().map(String::new).collect(Collectors.toList()); } static List<GenericRecord> consumeAvroMessages(final String topic, final int expectedMessageCount, - final String bootstrapServers, final String schemaRegistryUrl) { + final Duration expectedMaxDuration, final String bootstrapServers, final String schemaRegistryUrl) { final Properties consumerProperties = getConsumerProperties(bootstrapServers, StringDeserializer.class, KafkaAvroDeserializer.class, schemaRegistryUrl); - return consumeMessages(topic, expectedMessageCount, consumerProperties); + return consumeMessages(topic, expectedMessageCount, expectedMaxDuration, consumerProperties); } static List<JsonNode> consumeJsonMessages(final String topic, final int expectedMessageCount, final String bootstrapServers) { final Properties consumerProperties = getConsumerProperties(bootstrapServers, StringDeserializer.class, JsonDeserializer.class); - return consumeMessages(topic, expectedMessageCount, consumerProperties); + return consumeMessages(topic, expectedMessageCount, Duration.ofSeconds(60), consumerProperties); } static <K, V> List<V> consumeMessages(final String topic, final int expectedMessageCount, - final Properties consumerProperties) { + final Duration expectedMaxDuration, final Properties consumerProperties) { try (KafkaConsumer<K, V> consumer = new KafkaConsumer<>(consumerProperties)) { consumer.subscribe(Collections.singletonList(topic)); final List<V> recordValues = new ArrayList<>(); - await().atMost(Duration.ofMinutes(5)).pollInterval(Duration.ofSeconds(5)).untilAsserted(() -> { - final ConsumerRecords<K, V> records = consumer.poll(Duration.ofMillis(500L)); - for (final ConsumerRecord<K, V> record : records) { - recordValues.add(record.value()); - } - assertThat(recordValues).hasSize(expectedMessageCount); + await().atMost(expectedMaxDuration).pollInterval(Duration.ofSeconds(1)).untilAsserted(() -> { + assertThat(consumeRecordsInProgress(consumer, recordValues)).hasSize(expectedMessageCount); }); return recordValues; } } + private static <K, V> List<V> consumeRecordsInProgress(KafkaConsumer<K, V> consumer, List<V> recordValues) { + int recordsRetrieved; + do { + final ConsumerRecords<K, V> records = consumer.poll(Duration.ofMillis(500L)); + recordsRetrieved = records.count(); + for (final ConsumerRecord<K, V> record : records) { + 
recordValues.add(record.value()); + } + // Choosing 10 records as it allows for integration tests with a smaller max poll to be added + // while maintaining efficiency, a slightly larger number could be added but this is slightly more efficient + // than larger numbers. + } while (recordsRetrieved > 10); + return recordValues; + } + static Map<String, Object> consumeOffsetMessages(KafkaConsumer<byte[], byte[]> consumer) throws IOException { // Poll messages from the topic final Map<String, Object> messages = new HashMap<>(); diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 884051e30..5a573395e 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -230,9 +230,10 @@ void avroTest(final TestInfo testInfo) throws IOException { assertThat(testBucketAccessor.listObjects()).hasSize(5); // Poll Avro messages from the Kafka topic and deserialize them + // Waiting for 25k kafka records in this test so a longer Duration is added. final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, numOfRecsFactor * 5, - connectRunner.getBootstrapServers(), schemaRegistry.getSchemaRegistryUrl()); // Ensure this method - // deserializes Avro + Duration.ofMinutes(3), connectRunner.getBootstrapServers(), schemaRegistry.getSchemaRegistryUrl()); + // Ensure this method deserializes Avro // Verify that the correct data is read from the S3 bucket and pushed to Kafka assertThat(records).map(record -> entry(record.get("id"), String.valueOf(record.get("message")))) @@ -269,7 +270,8 @@ void parquetTest(final TestInfo testInfo) throws IOException { Files.delete(path); } - final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 100, + // Waiting for a small number of messages so using a smaller Duration of a minute + final List<GenericRecord> records = IntegrationBase.consumeAvroMessages(topicName, 100, Duration.ofSeconds(60), connectRunner.getBootstrapServers(), schemaRegistry.getSchemaRegistryUrl()); final List<String> expectedRecordNames = IntStream.range(0, 100) .mapToObj(i -> name + i) From c4604f4eb562e9da10f366312084dc73f8c279eb Mon Sep 17 00:00:00 2001 From: Claude Warren <claude.warren@aiven.io> Date: Thu, 9 Jan 2025 14:08:47 +0000 Subject: [PATCH 86/90] Polling efficiency (#378) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes for KCON-26 - Backoff when no data available. Fixes for KCON-28 - Improve poll method Creates an AbstractSourceTask in commons to handle response to poll and backoff calculations as well as start, stop. Implementations need to implement an Iterator that poll will call to retrieve data. Private classes Timer and Backoff are created in AbstractSourceTask and may be moved out at a later date if needed elsewhere. Changes made to configurations to support configuration extraction in AbstractSourceTask. Modifications to S3SourceTask to operate under AbstractSourceTask. 
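The concrete Backoff and Timer classes are added to AbstractSourceTask below. As an illustration only, the "exponential backoff with jitter" behaviour described above could look roughly like the sketch that follows; the constants and the sleep-based delay are assumptions, and only the reset()/cleanDelay() method names follow the calls made by AbstractSourceTask.

import java.time.Duration;
import java.util.Random;

final class BackoffSketch {
    private final Random random = new Random();
    private final long maxDelayMs;
    private int emptyPolls;

    BackoffSketch(final Duration maxDelay) {
        this.maxDelayMs = maxDelay.toMillis();
    }

    // Reset as soon as a record is read so the next quiet period starts from a short wait again.
    void reset() {
        emptyPolls = 0;
    }

    // Each consecutive empty poll roughly doubles the wait; a random jitter spreads tasks apart
    // and the total is capped so a single delay never exceeds the configured maximum.
    void cleanDelay() throws InterruptedException {
        final long exponentialMs = 100L * (1L << Math.min(emptyPolls, 10)); // 100, 200, 400, ... ms
        final long jitterMs = random.nextInt(100);
        emptyPolls++;
        Thread.sleep(Math.min(maxDelayMs, exponentialMs + jitterMs));
    }
}

In the poll loop, cleanDelay() runs each time tryAdd() finds no record and reset() runs as soon as one is read, so a quiet bucket is polled progressively less often while new data is still picked up quickly.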
Additional tests added --------- Co-authored-by: ¨Claude <¨claude.warren@aiven.io¨> Co-authored-by: Jarkko Jaakola <91882676+jjaakola-aiven@users.noreply.github.com> Co-authored-by: Murali Basani <murali.basani@gmail.com> --- commons/build.gradle.kts | 1 + .../common/config/SourceCommonConfig.java | 8 +- .../common/config/SourceConfigFragment.java | 4 +- .../common/source/AbstractSourceTask.java | 511 ++++++++++++++++++ .../common/source/input/AvroTransformer.java | 19 +- .../source/input/ByteArrayTransformer.java | 19 +- .../common/source/input/JsonTransformer.java | 19 +- .../source/input/ParquetTransformer.java | 19 +- .../common/source/input/Transformer.java | 33 +- .../source/input/TransformerFactory.java | 34 +- .../common/source/AbstractSourceTaskTest.java | 145 +++++ .../source/input/AvroTransformerTest.java | 36 +- .../input/ByteArrayTransformerTest.java | 19 +- .../source/input/JsonTransformerTest.java | 70 +-- .../source/input/ParquetTransformerTest.java | 45 +- .../input/TransformerStreamingTest.java | 52 +- s3-source-connector/build.gradle.kts | 1 + .../connect/s3/source/AwsIntegrationTest.java | 310 +++++++++++ .../connect/s3/source/IntegrationBase.java | 73 +++ .../connect/s3/source/IntegrationTest.java | 85 +-- .../kafka/connect/s3/source/S3SourceTask.java | 192 +++---- .../s3/source/utils/AWSV2SourceClient.java | 153 ++++-- .../connect/s3/source/utils/ConnectUtils.java | 2 +- .../s3/source/utils/OffsetManager.java | 36 +- .../s3/source/utils/RecordProcessor.java | 32 +- .../s3/source/utils/S3SourceRecord.java | 28 +- .../s3/source/utils/SourceRecordIterator.java | 246 ++++----- .../connect/s3/source/S3SourceTaskTest.java | 327 ++++++++--- .../s3/source/utils/RecordProcessorTest.java | 100 ++-- .../utils/SourceRecordIteratorTest.java | 61 ++- settings.gradle.kts | 4 + 31 files changed, 1932 insertions(+), 752 deletions(-) create mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/AbstractSourceTask.java create mode 100644 commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java create mode 100644 s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java diff --git a/commons/build.gradle.kts b/commons/build.gradle.kts index 232404466..101ef8db9 100644 --- a/commons/build.gradle.kts +++ b/commons/build.gradle.kts @@ -87,6 +87,7 @@ dependencies { testImplementation(jackson.databind) testImplementation(testinglibs.mockito.core) testImplementation(testinglibs.assertj.core) + testImplementation(testinglibs.awaitility) testImplementation(testFixtures(project(":commons"))) testImplementation(testinglibs.woodstox.stax2.api) testImplementation(apache.hadoop.mapreduce.client.core) diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java index 7fb8cd9b2..954c9151d 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java @@ -22,6 +22,8 @@ import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; import io.aiven.kafka.connect.common.source.input.InputFormat; +import io.aiven.kafka.connect.common.source.input.Transformer; +import io.aiven.kafka.connect.common.source.input.TransformerFactory; public class SourceCommonConfig extends CommonConfig { @@ -64,11 +66,15 @@ public String getTargetTopicPartitions() { } public ErrorsTolerance 
getErrorsTolerance() { - return ErrorsTolerance.forName(sourceConfigFragment.getErrorsTolerance()); + return sourceConfigFragment.getErrorsTolerance(); } public int getMaxPollRecords() { return sourceConfigFragment.getMaxPollRecords(); } + public Transformer getTransformer() { + return TransformerFactory.getTransformer(schemaRegistryFragment.getInputFormat()); + } + } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java index c62431dcb..58befa60e 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java @@ -88,8 +88,8 @@ public int getExpectedMaxMessageBytes() { return cfg.getInt(EXPECTED_MAX_MESSAGE_BYTES); } - public String getErrorsTolerance() { - return cfg.getString(ERRORS_TOLERANCE); + public ErrorsTolerance getErrorsTolerance() { + return ErrorsTolerance.forName(cfg.getString(ERRORS_TOLERANCE)); } private static class ErrorsToleranceValidator implements ConfigDef.Validator { diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/AbstractSourceTask.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/AbstractSourceTask.java new file mode 100644 index 000000000..f55257f46 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/AbstractSourceTask.java @@ -0,0 +1,511 @@ +/* + * Copyright 2024-2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.apache.kafka.connect.source.SourceRecord; +import org.apache.kafka.connect.source.SourceTask; + +import io.aiven.kafka.connect.common.config.SourceCommonConfig; +import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; + +import org.apache.commons.lang3.time.StopWatch; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class handles extracting records from an iterator and returning them to Kafka. It uses an exponential backoff + * with jitter to reduce the number of calls to the backend when there is no data. 
This solution: + * <ul> + * <li>When polled this implementation moves available records from the SourceRecord iterator to the return array.</li> + * <li>if there are no records + * <ul> + * <li>{@link #poll()} will return null.</li> + * <li>The poll will delay no more than approx 5 seconds.</li> + * </ul> + * </li> + * <li>Upto {@link #maxPollRecords} will be sent in a single poll request</li> + * <li>When the connector is stopped any collected records are returned to kafka before stopping.</li> + * </ul> + * + * + */ +public abstract class AbstractSourceTask extends SourceTask { + + public static final List<SourceRecord> NULL_RESULT = null; + + /** + * The maximum time to spend polling. This is set to 5 seconds as that is the time that is allotted to a system for + * shutdown. + */ + public static final Duration MAX_POLL_TIME = Duration.ofSeconds(5); + /** + * The boolean that indicates the connector is stopped. + */ + private final AtomicBoolean connectorStopped; + + /** + * The logger to use. Set from the class implementing AbstractSourceTask. + */ + private final Logger logger; + + /** + * The maximum number of records to put in a poll. Specified in the configuration. + */ + private int maxPollRecords; + + /** + * The Backoff implementation that executes the delay in the poll loop. + */ + private final Backoff backoff; + + private final Timer timer; + + /** + * The configuration + */ + private SourceCommonConfig config; + + private Iterator<SourceRecord> sourceRecordIterator; + + /** + * Constructor. + * + * @param logger + * the logger to use. + */ + protected AbstractSourceTask(final Logger logger) { + super(); + this.logger = logger; + connectorStopped = new AtomicBoolean(); + timer = new Timer(MAX_POLL_TIME); + backoff = new Backoff(timer.getBackoffConfig()); + } + + /** + * Gets the iterator of SourceRecords. The iterator that SourceRecords are extracted from during a poll event. When + * this iterator runs out of records it should attempt to reset and read more records from the backend on the next + * {@code hasNext()} call. In this way it should detect when new data has been added to the backend and continue + * processing. + * <p> + * This method should handle any backend exception that can be retried. Any runtime exceptions that are thrown when + * this iterator executes may cause the task to abort. + * </p> + * + * @param config + * the configuration for the Backoff. + * @return The iterator of SourceRecords. + */ + abstract protected Iterator<SourceRecord> getIterator(BackoffConfig config); + + /** + * Called by {@link #start} to allows the concrete implementation to configure itself based on properties. + * + * @param props + * the properties to use for configuration. + */ + abstract protected SourceCommonConfig configure(Map<String, String> props); + + @Override + public final void start(final Map<String, String> props) { + logger.debug("Starting"); + config = configure(props); + maxPollRecords = config.getMaxPollRecords(); + sourceRecordIterator = getIterator(timer.getBackoffConfig()); + } + + /** + * Try to add a SourceRecord to the results. + * + * @param results + * the result to add the record to. + * @param sourceRecordIterator + * the source record iterator. + * @return true if successful, false if the iterator is empty. 
+ */ + private boolean tryAdd(final List<SourceRecord> results, final Iterator<SourceRecord> sourceRecordIterator) { + if (sourceRecordIterator.hasNext()) { + backoff.reset(); + final SourceRecord sourceRecord = sourceRecordIterator.next(); + if (logger.isDebugEnabled()) { + logger.debug("tryAdd() : read record {}", sourceRecord.sourceOffset()); + } + results.add(sourceRecord); + return true; + } + logger.info("No records found in tryAdd call"); + return false; + } + + /** + * Returns {@code true} if the connector is not stopped and the timer has not expired. + * + * @return {@code true} if the connector is not stopped and the timer has not expired. + */ + protected boolean stillPolling() { + final boolean result = !connectorStopped.get() && !timer.isExpired(); + logger.debug("Still polling: {}", result); + return result; + } + + @Override + public final List<SourceRecord> poll() { + logger.debug("Polling"); + if (connectorStopped.get()) { + logger.info("Stopping"); + closeResources(); + return NULL_RESULT; + } else { + timer.start(); + try { + final List<SourceRecord> result = populateList(); + if (logger.isDebugEnabled()) { + logger.debug("Poll() returning {} SourceRecords.", result == null ? null : result.size()); + } + return result; + } finally { + timer.stop(); + timer.reset(); + } + } + } + + /** + * Attempts to populate the return list. Will read as many records into the list as it can until the timer expires + * or the task is shut down. + * + * @return A list SourceRecords or {@code null} if the system hit a runtime exception. + */ + private List<SourceRecord> populateList() { + final List<SourceRecord> results = new ArrayList<>(); + try { + while (stillPolling() && results.size() < maxPollRecords) { + if (!tryAdd(results, sourceRecordIterator)) { + if (!results.isEmpty()) { + logger.debug("tryAdd() did not add to the list, returning current results."); + // if we could not get a record and the results are not empty return them + break; + } + logger.debug("Attempting {}", backoff); + backoff.cleanDelay(); + } + } + + } catch (RuntimeException e) { // NOPMD must catch runtime here. + logger.error("Error during poll(): {}", e.getMessage(), e); + if (config.getErrorsTolerance() == ErrorsTolerance.NONE) { + logger.error("Stopping Task"); + throw e; + } + } + return results.isEmpty() ? NULL_RESULT : results; + } + + @Override + public final void stop() { + logger.debug("Stopping"); + connectorStopped.set(true); + } + + /** + * Returns the running state of the task. + * + * @return {@code true} if the connector is running, {@code false} otherwise. + */ + public final boolean isRunning() { + return !connectorStopped.get(); + } + + /** + * Close any resources the source has open. Called by the IteratorRunnable when it is stopping. + */ + abstract protected void closeResources(); + + /** + * Calculates elapsed time and flags when expired. + */ + protected static class Timer extends StopWatch { + /** + * The length of time that the timer should run. + */ + private final long duration; + + /** + * The flag that indicates the timer has been aborted. + */ + private boolean hasAborted; + + /** + * Constructor. + * + * @param duration + * the length of time the timer should run. + */ + Timer(final Duration duration) { + super(); + this.duration = duration.toMillis(); + } + + /** + * Gets the maximum duration for this timer. + * + * @return the maximum duration for the timer. + */ + public long millisecondsRemaining() { + return super.isStarted() ? 
duration - super.getTime() : duration; + } + + /** + * Returns {@code true} if the timer has expired. + * + * @return {@code true} if the timer has expired. + */ + public boolean isExpired() { + return hasAborted || super.getTime() >= duration; + } + + /** + * Aborts the timer. Timer will report that it has expired until reset is called. + */ + public void abort() { + hasAborted = true; + } + + @Override + public void start() { + try { + hasAborted = false; + super.start(); + } catch (IllegalStateException e) { + throw new IllegalStateException("Timer: " + e.getMessage()); + } + } + + @Override + public void stop() { + try { + super.stop(); + } catch (IllegalStateException e) { + throw new IllegalStateException("Timer: " + e.getMessage()); + } + } + + @Override + public void reset() { + try { + hasAborted = false; + super.reset(); + } catch (IllegalStateException e) { + throw new IllegalStateException("Timer: " + e.getMessage()); + } + } + + /** + * Gets a Backoff Config for this timer. + * + * @return a backoff Configuration. + */ + public BackoffConfig getBackoffConfig() { + return new BackoffConfig() { + + @Override + public SupplierOfLong getSupplierOfTimeRemaining() { + return Timer.this::millisecondsRemaining; + } + + @Override + public AbortTrigger getAbortTrigger() { + return Timer.this::abort; + } + }; + } + } + + /** + * Performs a delay based on the number of successive {@link #delay()} or {@link #cleanDelay()} calls without a + * {@link #reset()}. Delay increases exponentially but never exceeds the time remaining by more than 0.512 seconds. + */ + public static class Backoff { + /** The logger to write to */ + private static final Logger LOGGER = LoggerFactory.getLogger(Backoff.class); + /** + * The maximum jitter random number. Should be a power of 2 for speed. + */ + public static final int MAX_JITTER = 1024; + + public static final int JITTER_SUBTRAHEND = MAX_JITTER / 2; + /** + * A supplier of the time remaining (in milliseconds) on the overriding timer. + */ + private final SupplierOfLong timeRemaining; + + /** + * A function to call to abort the timer. + */ + private final AbortTrigger abortTrigger; + + /** + * The maximum number of times {@link #delay()} will be called before maxWait is reached. + */ + private int maxCount; + /** + * The number of times {@link #delay()} has been called. + */ + private int waitCount; + + /** + * A random number generator to construct jitter. + */ + Random random = new Random(); + + /** + * Constructor. + * + * @param config + * The configuration for the backoff. + */ + public Backoff(final BackoffConfig config) { + this.timeRemaining = config.getSupplierOfTimeRemaining(); + this.abortTrigger = config.getAbortTrigger(); + reset(); + } + + /** + * Reset the backoff time so that delay is again at the minimum. + */ + public final void reset() { + // if the reminaing time is 0 or negative the maxCount will be infinity + // so make sure that it is 0 in that case. + final long remainingTime = timeRemaining.get(); + maxCount = remainingTime < 1L ? 0 : (int) (Math.log10(remainingTime) / Math.log10(2)); + waitCount = 0; + LOGGER.debug("Reset {}", this); + } + + /** + * Handle adjustment when maxCount could not be set. + * + * @return the corrected maxCount + */ + private int getMaxCount() { + if (maxCount == 0) { + reset(); + } + return maxCount; + } + + /** + * Calculates the delay wihtout jitter. + * + * @return the number of milliseconds the delay will be. 
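+ * While still backing off this works out to min(time remaining, 2^(waitCount + 1)) milliseconds; once the maximum wait count is reached it is simply the time remaining, and it is never negative.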
+ */ + public long estimatedDelay() { + long sleepTime = timeRemaining.get(); + if (sleepTime > 0 && waitCount < maxCount) { + sleepTime = (long) Math.min(sleepTime, Math.pow(2, waitCount + 1)); + } + return sleepTime < 0 ? 0 : sleepTime; + } + + /** + * Calculates the range of jitter in milliseconds. + * + * @return the maximum jitter in milliseconds. jitter is +/- maximum jitter. + */ + public int getMaxJitter() { + return MAX_JITTER - JITTER_SUBTRAHEND; + } + + private long timeWithJitter() { + // generate approx +/- 0.512 seconds of jitter + final int jitter = random.nextInt(MAX_JITTER) - JITTER_SUBTRAHEND; + return (long) Math.pow(2, waitCount) + jitter; + } + + /** + * Delay execution based on the number of times this method has been called. + * + * @throws InterruptedException + * If any thread interrupts this thread. + */ + public void delay() throws InterruptedException { + final long sleepTime = timeRemaining.get(); + if (sleepTime > 0 && waitCount < (maxCount == 0 ? getMaxCount() : maxCount)) { + waitCount++; + final long nextSleep = timeWithJitter(); + // don't sleep negative time. Jitter can introduce negative tme. + if (nextSleep > 0) { + if (nextSleep >= sleepTime) { + LOGGER.debug("Backoff aborting timer"); + abortTrigger.apply(); + } else { + LOGGER.debug("Backoff sleepiing {}", nextSleep); + Thread.sleep(nextSleep); + } + } + } + } + + /** + * Like {@link #delay} but swallows the {@link InterruptedException}. + */ + public void cleanDelay() { + try { + delay(); + } catch (InterruptedException exception) { + // do nothing return results below + } + } + + @Override + public String toString() { + return String.format("Backoff %s/%s, %s milliseconds remaining.", waitCount, maxCount, timeRemaining.get()); + } + } + + /** + * A functional interface to return long values. + */ + @FunctionalInterface + public interface SupplierOfLong { + long get(); + } + + /** + * A functional interface that will abort the timer. After being called timer will indicate that it is expired, + * until it is reset. + */ + @FunctionalInterface + public interface AbortTrigger { + void apply(); + } + + /** + * An interface to define the Backoff configuration. Used for convenience with Timer. 
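+ * Implementations supply the remaining-time supplier and the abort trigger that the backoff delay logic relies on.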
+ */ + public interface BackoffConfig { + SupplierOfLong getSupplierOfTimeRemaining(); + AbortTrigger getAbortTrigger(); + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java index de770cbc2..760d074d2 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java @@ -37,7 +37,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class AvroTransformer extends Transformer<GenericRecord> { +public class AvroTransformer extends Transformer { private final AvroData avroData; @@ -54,9 +54,9 @@ public void configureValueConverter(final Map<String, String> config, final Abst } @Override - public StreamSpliterator<GenericRecord> createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, - final String topic, final int topicPartition, final AbstractConfig sourceConfig) { - return new StreamSpliterator<>(LOGGER, inputStreamIOSupplier) { + public StreamSpliterator createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, + final int topicPartition, final AbstractConfig sourceConfig) { + return new StreamSpliterator(LOGGER, inputStreamIOSupplier) { private DataFileStream<GenericRecord> dataFileStream; private final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); @@ -78,9 +78,10 @@ public void doClose() { } @Override - protected boolean doAdvance(final Consumer<? super GenericRecord> action) { + protected boolean doAdvance(final Consumer<? super SchemaAndValue> action) { if (dataFileStream.hasNext()) { - action.accept(dataFileStream.next()); + final GenericRecord record = dataFileStream.next(); + action.accept(avroData.toConnectData(record.getSchema(), record)); return true; } return false; @@ -88,12 +89,6 @@ protected boolean doAdvance(final Consumer<? 
super GenericRecord> action) { }; } - @Override - public SchemaAndValue getValueData(final GenericRecord record, final String topic, - final AbstractConfig sourceConfig) { - return avroData.toConnectData(record.getSchema(), record); - } - @Override public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, final AbstractConfig sourceConfig) { diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java index f571062d9..232aaef24 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java @@ -31,7 +31,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class ByteArrayTransformer extends Transformer<byte[]> { +public class ByteArrayTransformer extends Transformer { private static final Logger LOGGER = LoggerFactory.getLogger(ByteArrayTransformer.class); private static final int MAX_BUFFER_SIZE = 4096; @@ -42,9 +42,9 @@ public void configureValueConverter(final Map<String, String> config, final Abst } @Override - public StreamSpliterator<byte[]> createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, - final String topic, final int topicPartition, final AbstractConfig sourceConfig) { - return new StreamSpliterator<byte[]>(LOGGER, inputStreamIOSupplier) { + public StreamSpliterator createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, + final int topicPartition, final AbstractConfig sourceConfig) { + return new StreamSpliterator(LOGGER, inputStreamIOSupplier) { @Override protected InputStream inputOpened(final InputStream input) { return input; @@ -56,7 +56,7 @@ protected void doClose() { } @Override - protected boolean doAdvance(final Consumer<? super byte[]> action) { + protected boolean doAdvance(final Consumer<? super SchemaAndValue> action) { final byte[] buffer = new byte[MAX_BUFFER_SIZE]; try { final int bytesRead = IOUtils.read(inputStream, buffer); @@ -64,9 +64,9 @@ protected boolean doAdvance(final Consumer<? super byte[]> action) { return false; } if (bytesRead < MAX_BUFFER_SIZE) { - action.accept(Arrays.copyOf(buffer, bytesRead)); + action.accept(new SchemaAndValue(null, Arrays.copyOf(buffer, bytesRead))); } else { - action.accept(buffer); + action.accept(new SchemaAndValue(null, buffer)); } return true; } catch (IOException e) { @@ -77,11 +77,6 @@ protected boolean doAdvance(final Consumer<? 
super byte[]> action) { }; } - @Override - public SchemaAndValue getValueData(final byte[] record, final String topic, final AbstractConfig sourceConfig) { - return new SchemaAndValue(null, record); - } - @Override public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, final AbstractConfig sourceConfig) { diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java index 4ff0f1a24..c6aea0e82 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java @@ -34,7 +34,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class JsonTransformer extends Transformer<byte[]> { +public class JsonTransformer extends Transformer { private final JsonConverter jsonConverter; @@ -52,9 +52,9 @@ public void configureValueConverter(final Map<String, String> config, final Abst } @Override - public StreamSpliterator<byte[]> createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, - final String topic, final int topicPartition, final AbstractConfig sourceConfig) { - final StreamSpliterator<byte[]> spliterator = new StreamSpliterator<>(LOGGER, inputStreamIOSupplier) { + public StreamSpliterator createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, + final int topicPartition, final AbstractConfig sourceConfig) { + return new StreamSpliterator(LOGGER, inputStreamIOSupplier) { BufferedReader reader; @Override @@ -75,7 +75,7 @@ public void doClose() { } @Override - public boolean doAdvance(final Consumer<? super byte[]> action) { + public boolean doAdvance(final Consumer<? super SchemaAndValue> action) { String line = null; try { // remove blank and empty lines. @@ -87,7 +87,7 @@ public boolean doAdvance(final Consumer<? super byte[]> action) { } } line = line.trim(); - action.accept(line.getBytes(StandardCharsets.UTF_8)); + action.accept(jsonConverter.toConnectData(topic, line.getBytes(StandardCharsets.UTF_8))); return true; } catch (IOException e) { LOGGER.error("Error reading input stream: {}", e.getMessage(), e); @@ -95,13 +95,6 @@ public boolean doAdvance(final Consumer<? 
super byte[]> action) { } } }; - - return spliterator; - } - - @Override - public SchemaAndValue getValueData(final byte[] record, final String topic, final AbstractConfig sourceConfig) { - return jsonConverter.toConnectData(topic, record); } @Override diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java index 7da61c412..2c47d5103 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java @@ -43,7 +43,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class ParquetTransformer extends Transformer<GenericRecord> { +public class ParquetTransformer extends Transformer { private final AvroData avroData; @@ -59,12 +59,6 @@ public void configureValueConverter(final Map<String, String> config, final Abst config.put(SCHEMA_REGISTRY_URL, sourceConfig.getString(SCHEMA_REGISTRY_URL)); } - @Override - public SchemaAndValue getValueData(final GenericRecord record, final String topic, - final AbstractConfig sourceConfig) { - return avroData.toConnectData(record.getSchema(), record); - } - @Override public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, final AbstractConfig sourceConfig) { @@ -72,10 +66,10 @@ public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topi } @Override - public StreamSpliterator<GenericRecord> createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, - final String topic, final int topicPartition, final AbstractConfig sourceConfig) { + public StreamSpliterator createSpliterator(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, + final int topicPartition, final AbstractConfig sourceConfig) { - final StreamSpliterator<GenericRecord> spliterator = new StreamSpliterator<>(LOGGER, inputStreamIOSupplier) { + return new StreamSpliterator(LOGGER, inputStreamIOSupplier) { private ParquetReader<GenericRecord> reader; private File parquetFile; @@ -114,11 +108,11 @@ protected void doClose() { } @Override - protected boolean doAdvance(final Consumer<? super GenericRecord> action) { + protected boolean doAdvance(final Consumer<? super SchemaAndValue> action) { try { final GenericRecord record = reader.read(); if (record != null) { - action.accept(record); // Pass record to the stream + action.accept(avroData.toConnectData(record.getSchema(), record)); // Pass record to the stream return true; } } catch (IOException e) { @@ -127,7 +121,6 @@ protected boolean doAdvance(final Consumer<? 
super GenericRecord> action) { return false; } }; - return spliterator; } static void deleteTmpFile(final Path parquetFile) { diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java index 196d9ae3c..09e8c0ca5 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java @@ -30,14 +30,14 @@ import org.apache.commons.io.function.IOSupplier; import org.slf4j.Logger; -public abstract class Transformer<T> { +public abstract class Transformer { public abstract void configureValueConverter(Map<String, String> config, AbstractConfig sourceConfig); - public final Stream<T> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, final String topic, - final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { + public final Stream<SchemaAndValue> getRecords(final IOSupplier<InputStream> inputStreamIOSupplier, + final String topic, final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { - final StreamSpliterator<T> spliterator = createSpliterator(inputStreamIOSupplier, topic, topicPartition, + final StreamSpliterator spliterator = createSpliterator(inputStreamIOSupplier, topic, topicPartition, sourceConfig); return StreamSupport.stream(spliterator, false).onClose(spliterator::close).skip(skipRecords); } @@ -55,20 +55,15 @@ public final Stream<T> getRecords(final IOSupplier<InputStream> inputStreamIOSup * the source configuraiton. * @return a StreamSpliterator instance. */ - protected abstract StreamSpliterator<T> createSpliterator(IOSupplier<InputStream> inputStreamIOSupplier, - String topic, int topicPartition, AbstractConfig sourceConfig); - - public abstract SchemaAndValue getValueData(T record, String topic, AbstractConfig sourceConfig); + protected abstract StreamSpliterator createSpliterator(IOSupplier<InputStream> inputStreamIOSupplier, String topic, + int topicPartition, AbstractConfig sourceConfig); public abstract SchemaAndValue getKeyData(Object cloudStorageKey, String topic, AbstractConfig sourceConfig); /** * A Spliterator that performs various checks on the opening/closing of the input stream. - * - * @param <T> - * the type of item created by this Spliterator. */ - protected abstract static class StreamSpliterator<T> implements Spliterator<T> { + protected abstract static class StreamSpliterator implements Spliterator<SchemaAndValue> { /** * The input stream supplier. */ @@ -109,7 +104,7 @@ protected StreamSpliterator(final Logger logger, final IOSupplier<InputStream> i * the Consumer to call if record is created. * @return {@code true} if a record was processed, {@code false} otherwise. */ - abstract protected boolean doAdvance(Consumer<? super T> action); + abstract protected boolean doAdvance(Consumer<? super SchemaAndValue> action); /** * Method to close additional inputs if needed. @@ -121,6 +116,7 @@ public final void close() { try { if (inputStream != null) { inputStream.close(); + inputStream = null; // NOPMD setting null to release resources closed = true; } } catch (IOException e) { @@ -143,15 +139,16 @@ public final void close() { abstract protected InputStream inputOpened(InputStream input) throws IOException; @Override - public final boolean tryAdvance(final Consumer<? super T> action) { - boolean result = false; + public final boolean tryAdvance(final Consumer<? 
super SchemaAndValue> action) { if (closed) { - logger.error("Attempt to advance after closed"); + return false; } + boolean result = false; try { if (inputStream == null) { try { - inputStream = inputOpened(inputStreamIOSupplier.get()); + inputStream = inputStreamIOSupplier.get(); + inputOpened(inputStream); } catch (IOException e) { logger.error("Error trying to open inputStream: {}", e.getMessage(), e); close(); @@ -169,7 +166,7 @@ public final boolean tryAdvance(final Consumer<? super T> action) { } @Override - public final Spliterator<T> trySplit() { // NOPMD returning null is reqruied by API + public final Spliterator<SchemaAndValue> trySplit() { // NOPMD returning null is reqruied by API return null; } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java index 43a1b0ef7..574604306 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java @@ -16,48 +16,46 @@ package io.aiven.kafka.connect.common.source.input; -import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMAS_ENABLE; -import java.util.HashMap; import java.util.Map; import org.apache.kafka.connect.json.JsonConverter; -import io.aiven.kafka.connect.common.config.SchemaRegistryFragment; -import io.aiven.kafka.connect.common.config.SourceCommonConfig; - import io.confluent.connect.avro.AvroData; +/** + * A factory to create Transformers. + */ public final class TransformerFactory { - + /** The cache size for systems that read Avro data */ public static final int CACHE_SIZE = 100; private TransformerFactory() { // hidden } - public static Transformer getTransformer(final SourceCommonConfig sourceConfig) { - final InputFormat inputFormatEnum = new SchemaRegistryFragment(sourceConfig).getInputFormat(); - switch (inputFormatEnum) { + + /** + * Gets a configured Transformer. + * + * @param inputFormat + * The input format for the transformer. + * @return the Transformer for the specified input format. 
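+ * @throws IllegalArgumentException if the input format is not one of the supported formats.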
+ */ + public static Transformer getTransformer(final InputFormat inputFormat) { + switch (inputFormat) { case AVRO : return new AvroTransformer(new AvroData(CACHE_SIZE)); case PARQUET : return new ParquetTransformer(new AvroData(CACHE_SIZE)); case JSONL : final JsonConverter jsonConverter = new JsonConverter(); - configureJsonConverter(jsonConverter); + jsonConverter.configure(Map.of(SCHEMAS_ENABLE, "false"), false); return new JsonTransformer(jsonConverter); case BYTES : return new ByteArrayTransformer(); default : - throw new IllegalArgumentException( - "Unknown input format in configuration: " + sourceConfig.getString(INPUT_FORMAT_KEY)); + throw new IllegalArgumentException("Unknown input format in configuration: " + inputFormat); } } - - private static void configureJsonConverter(final JsonConverter jsonConverter) { - final Map<String, String> config = new HashMap<>(); - config.put(SCHEMAS_ENABLE, "false"); - jsonConverter.configure(config, false); - } } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java new file mode 100644 index 000000000..9b3a581eb --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java @@ -0,0 +1,145 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; +import static org.awaitility.Awaitility.await; + +import java.time.Duration; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.apache.commons.lang3.time.StopWatch; +import org.junit.jupiter.api.Test; + +class AbstractSourceTaskTest { + + @Test + void timerTest() { + final AbstractSourceTask.Timer timer = new AbstractSourceTask.Timer(Duration.ofSeconds(1)); + assertThat(timer.millisecondsRemaining()).isEqualTo(Duration.ofSeconds(1).toMillis()); + timer.start(); + await().atMost(Duration.ofSeconds(2)).until(timer::isExpired); + assertThat(timer.millisecondsRemaining()).isLessThan(0); + timer.stop(); + assertThat(timer.millisecondsRemaining()).isEqualTo(Duration.ofSeconds(1).toMillis()); + } + + @Test + void timerSequenceTest() { + final AbstractSourceTask.Timer timer = new AbstractSourceTask.Timer(Duration.ofSeconds(1)); + // stopped state does not allow stop + assertThatExceptionOfType(IllegalStateException.class).as("stop while not running") + .isThrownBy(timer::stop) + .withMessageStartingWith("Timer: "); + timer.reset(); // verify that an exception is not thrown. + + // started state does not allow start + timer.start(); + assertThatExceptionOfType(IllegalStateException.class).as("start while running") + .isThrownBy(timer::start) + .withMessageStartingWith("Timer: "); + timer.reset(); + timer.start(); // restart the timer. 
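+ // The timer is running again at this point, so the stop() below is the one legal transition; the assertions that follow exercise the illegal stop()/start() transitions around the stopped and reset states.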
+ timer.stop(); + + // stopped state does not allow stop or start + assertThatExceptionOfType(IllegalStateException.class).as("stop after stop") + .isThrownBy(timer::stop) + .withMessageStartingWith("Timer: "); + assertThatExceptionOfType(IllegalStateException.class).as("start after stop") + .isThrownBy(timer::start) + .withMessageStartingWith("Timer: "); + timer.reset(); + + // stopped + reset does not allow stop. + assertThatExceptionOfType(IllegalStateException.class).as("stop after reset (1)") + .isThrownBy(timer::stop) + .withMessageStartingWith("Timer: "); + timer.start(); + timer.reset(); + + // started + reset does not allow stop; + assertThatExceptionOfType(IllegalStateException.class).as("stop after reset (2)") + .isThrownBy(timer::stop) + .withMessageStartingWith("Timer: "); + } + + @Test + void backoffTest() throws InterruptedException { + final AbstractSourceTask.Timer timer = new AbstractSourceTask.Timer(Duration.ofSeconds(1)); + final AbstractSourceTask.Backoff backoff = new AbstractSourceTask.Backoff(timer.getBackoffConfig()); + final long estimatedDelay = backoff.estimatedDelay(); + assertThat(estimatedDelay).isLessThan(500); + + // execute delay without timer running. + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + backoff.delay(); + stopWatch.stop(); + assertThat(stopWatch.getTime()).as("Result without timer running") + .isBetween(estimatedDelay - backoff.getMaxJitter(), estimatedDelay + backoff.getMaxJitter()); + + timer.start(); + for (int i = 0; i < 9; i++) { + stopWatch.reset(); + timer.reset(); + timer.start(); + stopWatch.start(); + await().atMost(Duration.ofSeconds(2)).until(() -> { + backoff.delay(); + return backoff.estimatedDelay() == 0 || timer.isExpired(); + }); + stopWatch.stop(); + timer.stop(); + final int step = i; + if (!timer.isExpired()) { + assertThat(stopWatch.getTime()).as(() -> String.format("Result with timer running at step %s", step)) + .isBetween(Duration.ofSeconds(1).toMillis() - backoff.getMaxJitter(), + Duration.ofSeconds(1).toMillis() + backoff.getMaxJitter()); + } + } + } + + @Test + void backoffIncrementalTimeTest() throws InterruptedException { + final AtomicBoolean abortTrigger = new AtomicBoolean(); + // delay increases in powers of 2. 
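+ // e.g. with 1000 ms remaining the estimated delay should step through 2, 4, 8, ... 512 and then settle at the full 1000 ms, which is what the loop below asserts.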
+ final long maxDelay = 1000; // not a power of 2 + final AbstractSourceTask.BackoffConfig config = new AbstractSourceTask.BackoffConfig() { + @Override + public AbstractSourceTask.SupplierOfLong getSupplierOfTimeRemaining() { + return () -> maxDelay; + } + + @Override + public AbstractSourceTask.AbortTrigger getAbortTrigger() { + return () -> abortTrigger.set(true); + } + }; + + final AbstractSourceTask.Backoff backoff = new AbstractSourceTask.Backoff(config); + long expected = 2; + while (backoff.estimatedDelay() < maxDelay) { + assertThat(backoff.estimatedDelay()).isEqualTo(expected); + backoff.delay(); + expected *= 2; + } + assertThat(backoff.estimatedDelay()).isEqualTo(maxDelay); + assertThat(abortTrigger).isFalse(); + } +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java index 50e54a284..617dd290a 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java @@ -32,6 +32,9 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.data.Struct; + import io.aiven.kafka.connect.common.config.SourceCommonConfig; import io.confluent.connect.avro.AvroData; @@ -75,7 +78,7 @@ void testConfigureValueConverter() { void testReadAvroRecordsInvalidData() { final InputStream inputStream = new ByteArrayInputStream("mock-avro-data".getBytes(StandardCharsets.UTF_8)); - final Stream<GenericRecord> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + final Stream<SchemaAndValue> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 0); final List<Object> recs = records.collect(Collectors.toList()); @@ -87,11 +90,17 @@ void testReadAvroRecords() throws Exception { final ByteArrayOutputStream avroData = generateMockAvroData(25); final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); - final Stream<GenericRecord> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + final List<String> expected = new ArrayList<>(); + for (int i = 0; i < 25; i++) { + expected.add("Hello, Kafka Connect S3 Source! object " + i); + } + + final Stream<SchemaAndValue> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 0); - final List<Object> recs = records.collect(Collectors.toList()); - assertThat(recs).hasSize(25); + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Struct) sv).getString("message")) + .containsExactlyElementsOf(expected); } @Test @@ -99,14 +108,16 @@ void testReadAvroRecordsSkipFew() throws Exception { final ByteArrayOutputStream avroData = generateMockAvroData(20); final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); - final Stream<GenericRecord> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + final List<String> expected = new ArrayList<>(); + for (int i = 5; i < 20; i++) { + expected.add("Hello, Kafka Connect S3 Source! 
object " + i); + } + final Stream<SchemaAndValue> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 5); - final List<Object> recs = records.collect(Collectors.toList()); - assertThat(recs).hasSize(15); - // get first rec - assertThat(((GenericRecord) recs.get(0)).get("message").toString()) - .isEqualTo("Hello, Kafka Connect S3 Source! object 5"); + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Struct) sv).getString("message")) + .containsExactlyElementsOf(expected); } @Test @@ -114,11 +125,10 @@ void testReadAvroRecordsSkipMoreRecordsThanExist() throws Exception { final ByteArrayOutputStream avroData = generateMockAvroData(20); final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); - final Stream<GenericRecord> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + final Stream<SchemaAndValue> records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, 25); - final List<Object> recs = records.collect(Collectors.toList()); - assertThat(recs).hasSize(0); + assertThat(records).isEmpty(); } static ByteArrayOutputStream generateMockAvroData(final int numRecs) throws IOException { diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java index ee6b76001..80820e13b 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java @@ -24,6 +24,8 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.kafka.connect.data.SchemaAndValue; + import io.aiven.kafka.connect.common.config.SourceCommonConfig; import org.apache.commons.io.function.IOSupplier; @@ -53,12 +55,12 @@ void testGetRecordsSingleChunk() { final InputStream inputStream = new ByteArrayInputStream(data); final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; - final Stream<byte[]> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, + final Stream<SchemaAndValue> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, sourceCommonConfig, 0); - final List<Object> recs = records.collect(Collectors.toList()); + final List<SchemaAndValue> recs = records.collect(Collectors.toList()); assertThat(recs).hasSize(1); - assertThat((byte[]) recs.get(0)).isEqualTo(data); + assertThat(recs.get(0).value()).isEqualTo(data); } @Test @@ -67,18 +69,9 @@ void testGetRecordsEmptyInputStream() { final IOSupplier<InputStream> inputStreamIOSupplier = () -> inputStream; - final Stream<byte[]> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, + final Stream<SchemaAndValue> records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, sourceCommonConfig, 0); assertThat(records).hasSize(0); } - - @Test - void testGetValueBytes() { - final byte[] record = { 1, 2, 3 }; - final byte[] result = (byte[]) byteArrayTransformer.getValueData(record, TEST_TOPIC, sourceCommonConfig) - .value(); - - assertThat(result).containsExactlyInAnyOrder(record); - } } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java index a38a2bc8a..e482fd61c 100644 --- 
a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java @@ -18,7 +18,6 @@ import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMAS_ENABLE; import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -26,13 +25,13 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.stream.Collectors; import java.util.stream.Stream; -import org.apache.kafka.connect.errors.DataException; +import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.kafka.connect.json.JsonConverter; import io.aiven.kafka.connect.common.config.SourceCommonConfig; @@ -77,31 +76,38 @@ void destroy() { @Test void testHandleValueDataWithValidJson() { final InputStream validJsonInputStream = new ByteArrayInputStream( - "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); - final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; - final Stream<byte[]> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + getJsonRecs(100).getBytes(StandardCharsets.UTF_8)); + + final List<String> expected = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + expected.add("value" + i); + } + + final Stream<SchemaAndValue> records = jsonTransformer.getRecords(() -> validJsonInputStream, TESTTOPIC, 1, sourceCommonConfig, 0); - assertThat(jsonNodes).hasSize(1); + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Map) sv).get("key")) + .containsExactlyElementsOf(expected); } @Test void testHandleValueDataWithValidJsonSkipFew() { final InputStream validJsonInputStream = new ByteArrayInputStream( getJsonRecs(100).getBytes(StandardCharsets.UTF_8)); - final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; - final Stream<byte[]> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + + final List<String> expected = new ArrayList<>(); + for (int i = 25; i < 100; i++) { + expected.add("value" + i); + } + + final Stream<SchemaAndValue> records = jsonTransformer.getRecords(() -> validJsonInputStream, TESTTOPIC, 1, sourceCommonConfig, 25L); - final List<byte[]> recs = jsonNodes.collect(Collectors.toList()); - assertThat(recs).hasSize(75); - assertThat(recs).extracting(record -> ((Map) jsonTransformer.getValueData(record, "", null).value()).get("key")) - .doesNotContain("value1") - .doesNotContain("value2") - .doesNotContain("value25") - .contains("value26") - .contains("value27") - .contains("value100"); + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Map) sv).get("key")) + .containsExactlyElementsOf(expected); + } @Test @@ -110,35 +116,17 @@ void testHandleValueDataWithInvalidJson() { "invalid-json".getBytes(StandardCharsets.UTF_8)); final IOSupplier<InputStream> inputStreamIOSupplier = () -> invalidJsonInputStream; - final Stream<byte[]> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + final Stream<SchemaAndValue> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, sourceCommonConfig, 0); - assertThatThrownBy(() -> jsonTransformer.getValueData(jsonNodes.findAny().get(), "", null)) 
- .isInstanceOf(DataException.class) - .hasMessage("Converting byte[] to Kafka Connect data failed due to serialization error: "); - } + assertThat(jsonNodes).isEmpty(); - @Test - void testSerializeJsonDataValid() throws IOException { - final InputStream validJsonInputStream = new ByteArrayInputStream( - "{\"key\":\"value\"}".getBytes(StandardCharsets.UTF_8)); - final IOSupplier<InputStream> inputStreamIOSupplier = () -> validJsonInputStream; - final Stream<byte[]> jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, - sourceCommonConfig, 0); - final Object serializedData = jsonTransformer - .getValueData( - jsonNodes.findFirst().orElseThrow(() -> new AssertionError("No records found in stream!")), - TESTTOPIC, sourceCommonConfig) - .value(); - - // Assert: Verify the serialized data - assertThat(serializedData).isInstanceOf(Map.class).extracting("key").isEqualTo("value"); } @Test void testGetRecordsWithIOException() throws IOException { when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException")); - final Stream<byte[]> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); + final Stream<SchemaAndValue> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); assertThat(resultStream).isEmpty(); } @@ -146,14 +134,14 @@ void testGetRecordsWithIOException() throws IOException { @Test void testCustomSpliteratorWithIOExceptionDuringInitialization() throws IOException { when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException during initialization")); - final Stream<byte[]> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); + final Stream<SchemaAndValue> resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); assertThat(resultStream).isEmpty(); } static String getJsonRecs(final int recordCount) { final StringBuilder jsonRecords = new StringBuilder(); - for (int i = 1; i <= recordCount; i++) { + for (int i = 0; i < recordCount; i++) { jsonRecords.append(String.format("{\"key\":\"value%d\"}", i)); if (i < recordCount) { jsonRecords.append("\n"); // NOPMD AppendCharacterWithChar diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java index 154baf45a..2f7a405fe 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java @@ -29,14 +29,17 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.data.Struct; + import io.aiven.kafka.connect.common.config.SourceCommonConfig; import io.confluent.connect.avro.AvroData; -import org.apache.avro.generic.GenericRecord; import org.apache.commons.io.IOUtils; import org.apache.commons.io.function.IOSupplier; import org.junit.jupiter.api.BeforeEach; @@ -63,7 +66,7 @@ void testHandleValueDataWithZeroBytes() { final String topic = "test-topic"; final int topicPartition = 0; - final Stream<GenericRecord> recs = parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, + final Stream<SchemaAndValue> recs = 
parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig, 0L); assertThat(recs).isEmpty(); @@ -78,15 +81,17 @@ void testGetRecordsWithValidData() throws Exception { final String topic = "test-topic"; final int topicPartition = 0; - - final List<Object> records = parquetTransformer + final List<String> expected = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + expected.add("name" + i); + } + final List<SchemaAndValue> records = parquetTransformer .getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig, 0L) .collect(Collectors.toList()); - assertThat(records).hasSize(100); - assertThat(records).extracting(record -> ((GenericRecord) record).get("name").toString()) - .contains("name1") - .contains("name2"); + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Struct) sv).getString("name")) + .containsExactlyElementsOf(expected); } @Test @@ -99,18 +104,18 @@ void testGetRecordsWithValidDataSkipFew() throws Exception { final String topic = "test-topic"; final int topicPartition = 0; - final List<Object> records = parquetTransformer + final List<String> expected = new ArrayList<>(); + for (int i = 25; i < 100; i++) { + expected.add("name" + i); + } + + final List<SchemaAndValue> records = parquetTransformer .getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig, 25L) .collect(Collectors.toList()); - assertThat(records).hasSize(75); - assertThat(records).extracting(record -> ((GenericRecord) record).get("name").toString()) - .doesNotContain("name1") - .doesNotContain("name2") - .doesNotContain("name24") - .contains("name25") - .contains("name26") - .contains("name99"); + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Struct) sv).getString("name")) + .containsExactlyElementsOf(expected); } @Test @@ -124,7 +129,7 @@ void testGetRecordsWithInvalidData() { final String topic = "test-topic"; final int topicPartition = 0; - final Stream<GenericRecord> records = parquetTransformer.getRecords(inputStreamIOSupplier, topic, + final Stream<SchemaAndValue> records = parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig, 0L); assertThat(records).isEmpty(); } @@ -150,7 +155,7 @@ void testIOExceptionCreatingTempFile() { .thenThrow(new IOException("Test IOException for temp file")); final IOSupplier<InputStream> inputStreamSupplier = mock(IOSupplier.class); - final Stream<GenericRecord> resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", + final Stream<SchemaAndValue> resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", 1, null, 0L); assertThat(resultStream).isEmpty(); @@ -163,7 +168,7 @@ void testIOExceptionDuringDataCopy() throws IOException { when(inputStreamMock.read(any(byte[].class))).thenThrow(new IOException("Test IOException during copy")); final IOSupplier<InputStream> inputStreamSupplier = () -> inputStreamMock; - final Stream<GenericRecord> resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", + final Stream<SchemaAndValue> resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", 1, null, 0L); assertThat(resultStream).isEmpty(); diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java index f61dd9423..73b27b01f 100644 --- 
a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java @@ -17,6 +17,8 @@ package io.aiven.kafka.connect.common.source.input; import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -32,11 +34,10 @@ import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; -import org.apache.kafka.connect.json.JsonConverter; +import org.apache.kafka.connect.data.SchemaAndValue; import io.aiven.kafka.connect.common.config.CommonConfig; -import io.confluent.connect.avro.AvroData; import org.apache.commons.io.function.IOSupplier; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; @@ -49,8 +50,8 @@ class TransformerStreamingTest { @ParameterizedTest @MethodSource("testData") - void verifyExceptionDuringIOOpen(final Transformer<?> transformer, final byte[] testData, - final AbstractConfig config, final int expectedCount) throws IOException { + void verifyExceptionDuringIOOpen(final Transformer transformer, final byte[] testData, final AbstractConfig config, + final int expectedCount) throws IOException { final IOSupplier<InputStream> ioSupplier = mock(IOSupplier.class); when(ioSupplier.get()).thenThrow(new IOException("Test IOException during initialization")); final Stream<?> objStream = transformer.getRecords(ioSupplier, "topic", 1, config, 0); @@ -59,7 +60,28 @@ void verifyExceptionDuringIOOpen(final Transformer<?> transformer, final byte[] @ParameterizedTest @MethodSource("testData") - void verifyCloseCalledAtEnd(final Transformer<?> transformer, final byte[] testData, final AbstractConfig config, + void verifyExceptionDuringRead(final Transformer transformer, final byte[] testData, final AbstractConfig config, + final int expectedCount) throws IOException { + try (InputStream inputStream = mock(InputStream.class)) { + when(inputStream.read()).thenThrow(new IOException("Test IOException during read")); + when(inputStream.read(any())).thenThrow(new IOException("Test IOException during read")); + when(inputStream.read(any(), anyInt(), anyInt())) + .thenThrow(new IOException("Test IOException during read")); + when(inputStream.readNBytes(any(), anyInt(), anyInt())) + .thenThrow(new IOException("Test IOException during read")); + when(inputStream.readNBytes(anyInt())).thenThrow(new IOException("Test IOException during read")); + when(inputStream.readAllBytes()).thenThrow(new IOException("Test IOException during read")); + try (CloseTrackingStream stream = new CloseTrackingStream(inputStream)) { + final Stream<?> objStream = transformer.getRecords(() -> stream, "topic", 1, config, 0); + assertThat(objStream).isEmpty(); + assertThat(stream.closeCount).isGreaterThan(0); + } + } + } + + @ParameterizedTest + @MethodSource("testData") + void verifyCloseCalledAtEnd(final Transformer transformer, final byte[] testData, final AbstractConfig config, final int expectedCount) throws IOException { final CloseTrackingStream stream = new CloseTrackingStream(new ByteArrayInputStream(testData)); final Stream<?> objStream = transformer.getRecords(() -> stream, "topic", 1, config, 0); @@ -70,11 +92,11 @@ void verifyCloseCalledAtEnd(final Transformer<?> transformer, final byte[] testD @ParameterizedTest 
@MethodSource("testData") - void verifyCloseCalledAtIteratorEnd(final Transformer<?> transformer, final byte[] testData, + void verifyCloseCalledAtIteratorEnd(final Transformer transformer, final byte[] testData, final AbstractConfig config, final int expectedCount) throws IOException { final CloseTrackingStream stream = new CloseTrackingStream(new ByteArrayInputStream(testData)); - final Stream<?> objStream = transformer.getRecords(() -> stream, "topic", 1, config, 0); - final Iterator<?> iter = objStream.iterator(); + final Stream<SchemaAndValue> objStream = transformer.getRecords(() -> stream, "topic", 1, config, 0); + final Iterator<SchemaAndValue> iter = objStream.iterator(); long count = 0L; while (iter.hasNext()) { count += 1; @@ -86,19 +108,19 @@ void verifyCloseCalledAtIteratorEnd(final Transformer<?> transformer, final byte static Stream<Arguments> testData() throws IOException { final List<Arguments> lst = new ArrayList<>(); - final AvroData avroData = new AvroData(100); - lst.add(Arguments.of(new AvroTransformer(avroData), AvroTransformerTest.generateMockAvroData(100).toByteArray(), + lst.add(Arguments.of(TransformerFactory.getTransformer(InputFormat.AVRO), + AvroTransformerTest.generateMockAvroData(100).toByteArray(), new CommonConfig(new ConfigDef(), new HashMap<>()) { }, 100)); - lst.add(Arguments.of(new ByteArrayTransformer(), "Hello World".getBytes(StandardCharsets.UTF_8), - new CommonConfig(new ConfigDef(), new HashMap<>()) { + lst.add(Arguments.of(TransformerFactory.getTransformer(InputFormat.BYTES), + "Hello World".getBytes(StandardCharsets.UTF_8), new CommonConfig(new ConfigDef(), new HashMap<>()) { }, 1)); - lst.add(Arguments.of(new JsonTransformer(new JsonConverter()), + lst.add(Arguments.of(TransformerFactory.getTransformer(InputFormat.JSONL), JsonTransformerTest.getJsonRecs(100).getBytes(StandardCharsets.UTF_8), new CommonConfig(new ConfigDef(), new HashMap<>()) { }, 100)); - lst.add(Arguments.of(new ParquetTransformer(avroData), ParquetTransformerTest.generateMockParquetData(), - new CommonConfig(new ConfigDef(), new HashMap<>()) { + lst.add(Arguments.of(TransformerFactory.getTransformer(InputFormat.PARQUET), + ParquetTransformerTest.generateMockParquetData(), new CommonConfig(new ConfigDef(), new HashMap<>()) { }, 100)); return lst.stream(); } diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts index 20d5a3b82..db1b4a7d0 100644 --- a/s3-source-connector/build.gradle.kts +++ b/s3-source-connector/build.gradle.kts @@ -65,6 +65,7 @@ dependencies { compileOnly(apache.kafka.connect.api) compileOnly(apache.kafka.connect.runtime) + implementation(apache.commons.collection4) implementation(project(":commons")) implementation(project(":s3-commons")) implementation("software.amazon.awssdk:s3:$amazonS3Version") diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java new file mode 100644 index 000000000..42d10aad7 --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java @@ -0,0 +1,310 @@ +/* + * Copyright 2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source; + +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.AVRO_VALUE_SERIALIZER; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_PREFIX_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; +import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.kafka.connect.source.SourceTaskContext; +import org.apache.kafka.connect.storage.OffsetStorageReader; + +import io.aiven.kafka.connect.common.source.input.InputFormat; +import io.aiven.kafka.connect.common.source.input.TransformerFactory; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; +import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; +import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; +import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; + +import org.apache.avro.Schema; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; +import org.testcontainers.containers.localstack.LocalStackContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.S3Object; + +@Testcontainers +class AwsIntegrationTest implements IntegrationBase { + + private static final String COMMON_PREFIX = "s3-source-connector-for-apache-kafka-AWS-test-"; + + @Container + public static final LocalStackContainer LOCALSTACK = IntegrationBase.createS3Container(); + + private static String s3Prefix; + + private S3Client s3Client; + private String s3Endpoint; + + private 
BucketAccessor testBucketAccessor; + + @Override + public String getS3Prefix() { + return s3Prefix; + } + + @Override + public S3Client getS3Client() { + return s3Client; + } + + @BeforeAll + static void setUpAll() { + s3Prefix = COMMON_PREFIX + ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + "/"; + } + + @BeforeEach + void setupAWS() { + s3Client = IntegrationBase.createS3Client(LOCALSTACK); + s3Endpoint = LOCALSTACK.getEndpoint().toString(); + testBucketAccessor = new BucketAccessor(s3Client, TEST_BUCKET_NAME); + testBucketAccessor.createBucket(); + } + + @AfterEach + void tearDownAWS() { + testBucketAccessor.removeBucket(); + s3Client.close(); + } + + private Map<String, String> getConfig(final String topics, final int maxTasks) { + final Map<String, String> config = new HashMap<>(); + config.put(AWS_ACCESS_KEY_ID_CONFIG, S3_ACCESS_KEY_ID); + config.put(AWS_SECRET_ACCESS_KEY_CONFIG, S3_SECRET_ACCESS_KEY); + config.put(AWS_S3_ENDPOINT_CONFIG, s3Endpoint); + config.put(AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET_NAME); + config.put(AWS_S3_PREFIX_CONFIG, getS3Prefix()); + config.put(TARGET_TOPIC_PARTITIONS, "0,1"); + config.put(TARGET_TOPICS, topics); + config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + config.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.converters.ByteArrayConverter"); + config.put("tasks.max", String.valueOf(maxTasks)); + return config; + } + + /** + * Test the integration with the Amazon connector + * + * @param testInfo + * The testing configuration. + */ + @Test + void sourceRecordIteratorBytesTest(final TestInfo testInfo) { + final var topicName = IntegrationBase.topicName(testInfo); + final Map<String, String> configData = getConfig(topicName, 1); + + configData.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); + + final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; + final String testData2 = "Hello, Kafka Connect S3 Source! object 2"; + + final List<String> offsetKeys = new ArrayList<>(); + final List<String> expectedKeys = new ArrayList<>(); + // write 2 objects to s3 + expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000")); + expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000")); + expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00001")); + expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00001")); + + // we don't expext the empty one. 
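The zero-byte object written below is expected to be skipped because AWSV2SourceClient filters listings through a size predicate (shown later in this patch). A minimal, standalone sketch of that filtering behaviour, assuming only the AWS SDK v2 `S3Object` model; the class name is illustrative and not part of the patch:

```java
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import software.amazon.awssdk.services.s3.model.S3Object;

// Illustration only: the same size-based predicate the source client applies by default,
// which is why the empty object written in the test never reaches the record iterator.
public class EmptyObjectFilterSketch {
    public static void main(final String[] args) {
        final Predicate<S3Object> nonEmpty = s3Object -> s3Object.size() > 0;
        final List<S3Object> listed = Stream.of(
                S3Object.builder().key("topic-00000-1.txt").size(41L).build(),
                S3Object.builder().key("topic-00003-1.txt").size(0L).build())
            .filter(nonEmpty)
            .collect(Collectors.toList());
        System.out.println(listed); // only the non-empty object survives the listing filter
    }
}
```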
+ offsetKeys.addAll(expectedKeys); + offsetKeys.add(writeToS3(topicName, new byte[0], "00003")); + + assertThat(testBucketAccessor.listObjects()).hasSize(5); + + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configData); + final SourceTaskContext context = mock(SourceTaskContext.class); + final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); + when(context.offsetStorageReader()).thenReturn(offsetStorageReader); + when(offsetStorageReader.offsets(any())).thenReturn(new HashMap<>()); + + final OffsetManager offsetManager = new OffsetManager(context, s3SourceConfig); + + final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig, new HashSet<>()); + + final Iterator<S3SourceRecord> sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, + TransformerFactory.getTransformer(InputFormat.BYTES), sourceClient); + + final HashSet<String> seenKeys = new HashSet<>(); + while (sourceRecordIterator.hasNext()) { + final S3SourceRecord s3SourceRecord = sourceRecordIterator.next(); + final String key = OBJECT_KEY + SEPARATOR + s3SourceRecord.getObjectKey(); + assertThat(offsetKeys).contains(key); + seenKeys.add(key); + } + assertThat(seenKeys).containsAll(expectedKeys); + } + + @Test + void sourceRecordIteratorAvroTest(final TestInfo testInfo) throws IOException { + final var topicName = IntegrationBase.topicName(testInfo); + + final Map<String, String> configData = getConfig(topicName, 1); + + configData.put(INPUT_FORMAT_KEY, InputFormat.AVRO.getValue()); + configData.put(VALUE_CONVERTER_KEY, "io.confluent.connect.avro.AvroConverter"); + configData.put(AVRO_VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); + + // Define Avro schema + final String schemaJson = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" + + " \"fields\": [\n" + " {\"name\": \"message\", \"type\": \"string\"},\n" + + " {\"name\": \"id\", \"type\": \"int\"}\n" + " ]\n" + "}"; + final Schema.Parser parser = new Schema.Parser(); + final Schema schema = parser.parse(schemaJson); + + final int numOfRecsFactor = 5000; + + final byte[] outputStream1 = IntegrationBase.generateNextAvroMessagesStartingFromId(1, numOfRecsFactor, schema); + final byte[] outputStream2 = IntegrationBase.generateNextAvroMessagesStartingFromId(numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream3 = IntegrationBase.generateNextAvroMessagesStartingFromId(2 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream4 = IntegrationBase.generateNextAvroMessagesStartingFromId(3 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream5 = IntegrationBase.generateNextAvroMessagesStartingFromId(4 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + + final Set<String> offsetKeys = new HashSet<>(); + + offsetKeys.add(writeToS3(topicName, outputStream1, "00001")); + offsetKeys.add(writeToS3(topicName, outputStream2, "00001")); + + offsetKeys.add(writeToS3(topicName, outputStream3, "00002")); + offsetKeys.add(writeToS3(topicName, outputStream4, "00002")); + offsetKeys.add(writeToS3(topicName, outputStream5, "00002")); + + assertThat(testBucketAccessor.listObjects()).hasSize(5); + + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configData); + final SourceTaskContext context = mock(SourceTaskContext.class); + final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); + when(context.offsetStorageReader()).thenReturn(offsetStorageReader); + 
when(offsetStorageReader.offsets(any())).thenReturn(new HashMap<>()); + + final OffsetManager offsetManager = new OffsetManager(context, s3SourceConfig); + + final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig, new HashSet<>()); + + final Iterator<S3SourceRecord> sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, + TransformerFactory.getTransformer(InputFormat.AVRO), sourceClient); + + final HashSet<String> seenKeys = new HashSet<>(); + final Map<String, List<Long>> seenRecords = new HashMap<>(); + while (sourceRecordIterator.hasNext()) { + final S3SourceRecord s3SourceRecord = sourceRecordIterator.next(); + final String key = OBJECT_KEY + SEPARATOR + s3SourceRecord.getObjectKey(); + seenRecords.compute(key, (k, v) -> { + final List<Long> lst = v == null ? new ArrayList<>() : v; // NOPMD new object inside loop + lst.add(s3SourceRecord.getRecordNumber()); + return lst; + }); + assertThat(offsetKeys).contains(key); + seenKeys.add(key); + } + assertThat(seenKeys).containsAll(offsetKeys); + assertThat(seenRecords).hasSize(5); + final List<Long> expected = new ArrayList<>(); + for (long l = 0; l < numOfRecsFactor; l++) { + expected.add(l + 1); + } + for (final String key : offsetKeys) { + final List<Long> seen = seenRecords.get(key); + assertThat(seen).as("Count for " + key).containsExactlyInAnyOrderElementsOf(expected); + } + } + + @Test + void verifyIteratorRehydration(final TestInfo testInfo) { + // create 2 files. + final var topicName = IntegrationBase.topicName(testInfo); + final Map<String, String> configData = getConfig(topicName, 1); + + configData.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); + + final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; + final String testData2 = "Hello, Kafka Connect S3 Source! object 2"; + final String testData3 = "Hello, Kafka Connect S3 Source! 
object 3"; + + final List<String> expectedKeys = new ArrayList<>(); + + final List<String> actualKeys = new ArrayList<>(); + + // write 2 objects to s3 + expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000") + .substring((OBJECT_KEY + SEPARATOR).length())); + expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000") + .substring((OBJECT_KEY + SEPARATOR).length())); + + assertThat(testBucketAccessor.listObjects()).hasSize(2); + + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configData); + final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig, new HashSet<>()); + final Iterator<S3Object> iter = sourceClient.getS3ObjectIterator(null); + + assertThat(iter).hasNext(); + S3Object object = iter.next(); + actualKeys.add(object.key()); + assertThat(iter).hasNext(); + object = iter.next(); + actualKeys.add(object.key()); + assertThat(iter).isExhausted(); + assertThat(actualKeys).containsAll(expectedKeys); + + // write 3rd object to s3 + expectedKeys.add(writeToS3(topicName, testData3.getBytes(StandardCharsets.UTF_8), "00000") + .substring((OBJECT_KEY + SEPARATOR).length())); + assertThat(testBucketAccessor.listObjects()).hasSize(3); + + assertThat(iter).hasNext(); + object = iter.next(); + actualKeys.add(object.key()); + assertThat(iter).isExhausted(); + assertThat(actualKeys).containsAll(expectedKeys); + + } +} diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index 442993bfc..a8b91a197 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -16,9 +16,12 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; +import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; import static org.assertj.core.api.Assertions.assertThat; import static org.awaitility.Awaitility.await; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.net.ServerSocket; @@ -52,20 +55,90 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import io.confluent.kafka.serializers.KafkaAvroDeserializer; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumWriter; import org.junit.jupiter.api.TestInfo; import org.testcontainers.containers.Container; import org.testcontainers.containers.localstack.LocalStackContainer; import org.testcontainers.utility.DockerImageName; import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; +import software.amazon.awssdk.core.sync.RequestBody; import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.PutObjectRequest; +@SuppressWarnings("PMD.ExcessiveImports") public interface IntegrationBase { String PLUGINS_S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA = "plugins/s3-source-connector-for-apache-kafka/"; String 
S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA_TEST = "s3-source-connector-for-apache-kafka-test-"; ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + String TEST_BUCKET_NAME = "test-bucket0"; + String S3_ACCESS_KEY_ID = "test-key-id0"; + String VALUE_CONVERTER_KEY = "value.converter"; + String S3_SECRET_ACCESS_KEY = "test_secret_key0"; + + static byte[] generateNextAvroMessagesStartingFromId(final int messageId, final int noOfAvroRecs, + final Schema schema) throws IOException { + final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema); + try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) { + dataFileWriter.create(schema, outputStream); + for (int i = messageId; i < messageId + noOfAvroRecs; i++) { + final GenericRecord avroRecord = new GenericData.Record(schema); // NOPMD + avroRecord.put("message", "Hello, Kafka Connect S3 Source! object " + i); + avroRecord.put("id", i); + dataFileWriter.append(avroRecord); + } + + dataFileWriter.flush(); + return outputStream.toByteArray(); + } + } + + S3Client getS3Client(); + + String getS3Prefix(); + + /** + * Write file to s3 with the specified key and data. + * + * @param objectKey + * the key. + * @param testDataBytes + * the data. + */ + default void writeToS3WithKey(final String objectKey, final byte[] testDataBytes) { + final PutObjectRequest request = PutObjectRequest.builder() + .bucket(IntegrationTest.TEST_BUCKET_NAME) + .key(objectKey) + .build(); + getS3Client().putObject(request, RequestBody.fromBytes(testDataBytes)); + + } + + /** + * Writes to S3 using a key of the form {@code [prefix]topicName-partitionId-systemTime.txt}. + * + * @param topicName + * the topic name to use + * @param testDataBytes + * the data. + * @param partitionId + * the partition id. 
+ * @return the key prefixed by {@link S3SourceTask#OBJECT_KEY} and + * {@link io.aiven.kafka.connect.s3.source.utils.OffsetManager#SEPARATOR} + */ + default String writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) { + final String objectKey = org.apache.commons.lang3.StringUtils.defaultIfBlank(getS3Prefix(), "") + topicName + + "-" + partitionId + "-" + System.currentTimeMillis() + ".txt"; + writeToS3WithKey(objectKey, testDataBytes); + return OBJECT_KEY + SEPARATOR + objectKey; + } default AdminClient newAdminClient(final String bootstrapServers) { final Properties adminClientConfig = new Properties(); diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 5a573395e..083d8627e 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -27,13 +27,10 @@ import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG; import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG; -import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; -import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; import static java.util.Map.entry; import static org.assertj.core.api.Assertions.assertThat; import static org.awaitility.Awaitility.await; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -63,23 +60,17 @@ import com.fasterxml.jackson.databind.JsonNode; import org.apache.avro.Schema; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.DatumWriter; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; -import org.junit.platform.commons.util.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testcontainers.containers.localstack.LocalStackContainer; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; -import software.amazon.awssdk.core.sync.RequestBody; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.model.PutObjectRequest; @@ -92,13 +83,6 @@ final class IntegrationTest implements IntegrationBase { private static final String COMMON_PREFIX = "s3-source-connector-for-apache-kafka-test-"; private static final int OFFSET_FLUSH_INTERVAL_MS = 500; - private static final String S3_ACCESS_KEY_ID = "test-key-id0"; - private static final String S3_SECRET_ACCESS_KEY = "test_secret_key0"; - - private static final String VALUE_CONVERTER_KEY = "value.converter"; - - private static final String TEST_BUCKET_NAME = "test-bucket0"; - private static String s3Endpoint; private static String s3Prefix; private static BucketAccessor testBucketAccessor; @@ -112,8 +96,19 @@ final class IntegrationTest implements IntegrationBase { private static S3Client s3Client; - @BeforeAll - 
static void setUpAll() throws IOException, InterruptedException { + @Override + public S3Client getS3Client() { + return s3Client; + } + + @Override + public String getS3Prefix() { + return s3Prefix; + } + + public + + @BeforeAll static void setUpAll() throws IOException, InterruptedException { s3Prefix = COMMON_PREFIX + ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + "/"; s3Client = IntegrationBase.createS3Client(LOCALSTACK); @@ -159,7 +154,7 @@ void tearDown() { @Test void bytesTest(final TestInfo testInfo) { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 2); + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); @@ -208,15 +203,15 @@ void avroTest(final TestInfo testInfo) throws IOException { final int numOfRecsFactor = 5000; - final byte[] outputStream1 = generateNextAvroMessagesStartingFromId(1, numOfRecsFactor, schema); - final byte[] outputStream2 = generateNextAvroMessagesStartingFromId(numOfRecsFactor + 1, numOfRecsFactor, - schema); - final byte[] outputStream3 = generateNextAvroMessagesStartingFromId(2 * numOfRecsFactor + 1, numOfRecsFactor, - schema); - final byte[] outputStream4 = generateNextAvroMessagesStartingFromId(3 * numOfRecsFactor + 1, numOfRecsFactor, - schema); - final byte[] outputStream5 = generateNextAvroMessagesStartingFromId(4 * numOfRecsFactor + 1, numOfRecsFactor, - schema); + final byte[] outputStream1 = IntegrationBase.generateNextAvroMessagesStartingFromId(1, numOfRecsFactor, schema); + final byte[] outputStream2 = IntegrationBase.generateNextAvroMessagesStartingFromId(numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream3 = IntegrationBase.generateNextAvroMessagesStartingFromId(2 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream4 = IntegrationBase.generateNextAvroMessagesStartingFromId(3 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream5 = IntegrationBase.generateNextAvroMessagesStartingFromId(4 * numOfRecsFactor + 1, + numOfRecsFactor, schema); final Set<String> offsetKeys = new HashSet<>(); @@ -254,8 +249,8 @@ void parquetTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); final String partition = "00000"; - final String fileName = addPrefixOrDefault("") + topicName + "-" + partition + "-" + System.currentTimeMillis() - + ".txt"; + final String fileName = org.apache.commons.lang3.StringUtils.defaultIfBlank(getS3Prefix(), "") + topicName + "-" + + partition + "-" + System.currentTimeMillis() + ".txt"; final String name = "testuser"; final Map<String, String> connectorConfig = getAvroConfig(topicName, InputFormat.PARQUET); @@ -321,36 +316,6 @@ void jsonTest(final TestInfo testInfo) { verifyOffsetPositions(Map.of(offsetKey, 500), connectRunner.getBootstrapServers()); } - private static byte[] generateNextAvroMessagesStartingFromId(final int messageId, final int noOfAvroRecs, - final Schema schema) throws IOException { - final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema); - try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter); - ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) { - dataFileWriter.create(schema, outputStream); - for (int i = messageId; i < messageId + 
noOfAvroRecs; i++) { - final GenericRecord avroRecord = new GenericData.Record(schema); // NOPMD - avroRecord.put("message", "Hello, Kafka Connect S3 Source! object " + i); - avroRecord.put("id", i); - dataFileWriter.append(avroRecord); - } - - dataFileWriter.flush(); - return outputStream.toByteArray(); - } - } - - private static String writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) { - final String objectKey = addPrefixOrDefault("") + topicName + "-" + partitionId + "-" - + System.currentTimeMillis() + ".txt"; - final PutObjectRequest request = PutObjectRequest.builder().bucket(TEST_BUCKET_NAME).key(objectKey).build(); - s3Client.putObject(request, RequestBody.fromBytes(testDataBytes)); - return OBJECT_KEY + SEPARATOR + objectKey; - } - - private static String addPrefixOrDefault(final String defaultValue) { - return StringUtils.isNotBlank(s3Prefix) ? s3Prefix : defaultValue; - } - private Map<String, String> getConfig(final String connectorName, final String topics, final int maxTasks) { final Map<String, String> config = new HashMap<>(basicS3ConnectorConfig()); config.put("name", connectorName); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 320fa19cb..1bfc55580 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -16,22 +16,17 @@ package io.aiven.kafka.connect.s3.source; -import static io.aiven.kafka.connect.common.config.SourceConfigFragment.MAX_POLL_RECORDS; - -import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; -import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; -import java.util.concurrent.atomic.AtomicBoolean; -import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.source.SourceRecord; -import org.apache.kafka.connect.source.SourceTask; +import io.aiven.kafka.connect.common.config.SourceCommonConfig; +import io.aiven.kafka.connect.common.source.AbstractSourceTask; import io.aiven.kafka.connect.common.source.input.Transformer; -import io.aiven.kafka.connect.common.source.input.TransformerFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; @@ -40,18 +35,17 @@ import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; import io.aiven.kafka.connect.s3.source.utils.Version; +import org.apache.commons.collections4.IteratorUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import software.amazon.awssdk.core.exception.SdkException; -import software.amazon.awssdk.services.s3.S3Client; /** * S3SourceTask is a Kafka Connect SourceTask implementation that reads from source-s3 buckets and generates Kafka * Connect records. 
*/ -@SuppressWarnings({ "PMD.TooManyMethods", "PMD.ExcessiveImports" }) -public class S3SourceTask extends SourceTask { - +public class S3SourceTask extends AbstractSourceTask { + /** The logger to write to */ private static final Logger LOGGER = LoggerFactory.getLogger(S3SourceTask.class); public static final String BUCKET = "bucket"; @@ -60,29 +54,23 @@ public class S3SourceTask extends SourceTask { public static final String OBJECT_KEY = "object_key"; public static final String PARTITION = "topicPartition"; - private static final long S_3_POLL_INTERVAL_MS = 10_000L; - private static final long ERROR_BACKOFF = 1000L; - - private S3SourceConfig s3SourceConfig; - private S3Client s3Client; - - private Iterator<S3SourceRecord> sourceRecordIterator; + /** An iterator or S3SourceRecords */ + private Iterator<S3SourceRecord> s3SourceRecordIterator; + /** + * The transformer that we are using TODO move this to AbstractSourceTask + */ private Transformer transformer; + /** The AWS Source client */ - private boolean taskInitialized; - - private final AtomicBoolean connectorStopped = new AtomicBoolean(); - - private final Object pollLock = new Object(); private AWSV2SourceClient awsv2SourceClient; + /** The list of failed object keys */ private final Set<String> failedObjectKeys = new HashSet<>(); - private final Set<String> inProcessObjectKeys = new HashSet<>(); - + /** The offset manager this task uses */ private OffsetManager offsetManager; + private S3SourceConfig s3SourceConfig; - @SuppressWarnings("PMD.UnnecessaryConstructor") public S3SourceTask() { - super(); + super(LOGGER); } @Override @@ -91,100 +79,98 @@ public String version() { } @Override - public void start(final Map<String, String> props) { + protected Iterator<SourceRecord> getIterator(BackoffConfig config) { // NOPMD cognitive complexity + final Iterator<SourceRecord> inner = new Iterator<>() { + /** + * The backoff for Amazon retryable exceptions + */ + final Backoff backoff = new Backoff(config); + + @Override + public boolean hasNext() { + while (stillPolling()) { + try { + return s3SourceRecordIterator.hasNext(); + } catch (SdkException exception) { + if (exception.retryable()) { + LOGGER.warn("Retryable error encountered during polling. Waiting before retrying...", + exception); + try { + backoff.delay(); + } catch (InterruptedException e) { + LOGGER.warn("Backoff delay was interrupted. Throwing original exception: {}", + exception.getMessage()); + throw exception; + } + } else { + // TODO validate that the iterator does not lose an S3Object. Add test to + // S3ObjectIterator. 
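For context on the branch above: hasNext() only backs off when the SDK marks the failure as retryable; otherwise the exception is rethrown immediately by the `throw exception;` that follows. A rough sketch of that retry-on-retryable pattern in isolation, assuming a fixed sleep in place of the AbstractSourceTask backoff; the class and method names are illustrative:

```java
import java.util.Iterator;

import software.amazon.awssdk.core.exception.SdkException;

// Standalone sketch of the retry-on-retryable pattern: retryable SdkExceptions trigger a wait
// and another attempt, non-retryable ones surface immediately to the caller.
public final class RetryingHasNext {
    private RetryingHasNext() {
    }

    public static <T> boolean hasNextWithRetry(final Iterator<T> delegate, final int maxAttempts,
            final long sleepMillis) throws InterruptedException {
        for (int attempt = 0; attempt < maxAttempts; attempt++) {
            try {
                return delegate.hasNext();
            } catch (SdkException exception) {
                if (!exception.retryable()) {
                    throw exception; // non-retryable errors are not worth waiting for
                }
                Thread.sleep(sleepMillis); // stand-in for the Backoff#delay() used in the task
            }
        }
        return false;
    }
}
```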
+ throw exception; + } + } + } + return false; + } + + @Override + public SourceRecord next() { + final S3SourceRecord s3SourceRecord = s3SourceRecordIterator.next(); + offsetManager.updateAndReturnCurrentOffsets(s3SourceRecord.getPartitionMap(), + s3SourceRecord.getObjectKey(), s3SourceRecord.getRecordNumber()); + return RecordProcessor.createSourceRecord(s3SourceRecord, s3SourceConfig, awsv2SourceClient, + offsetManager); + } + }; + return IteratorUtils.filteredIterator(inner, Objects::nonNull); + } + + @Override + protected SourceCommonConfig configure(final Map<String, String> props) { LOGGER.info("S3 Source task started."); - s3SourceConfig = new S3SourceConfig(props); - this.transformer = TransformerFactory.getTransformer(s3SourceConfig); + this.s3SourceConfig = new S3SourceConfig(props); + this.transformer = s3SourceConfig.getTransformer(); offsetManager = new OffsetManager(context, s3SourceConfig); awsv2SourceClient = new AWSV2SourceClient(s3SourceConfig, failedObjectKeys); - prepareReaderFromOffsetStorageReader(); - this.taskInitialized = true; - } - - private void prepareReaderFromOffsetStorageReader() { - sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, this.transformer, - awsv2SourceClient); + setS3SourceRecordIterator( + new SourceRecordIterator(s3SourceConfig, offsetManager, this.transformer, awsv2SourceClient)); + return s3SourceConfig; } @Override - public List<SourceRecord> poll() throws InterruptedException { - LOGGER.info("Polling for new records..."); - synchronized (pollLock) { - final List<SourceRecord> results = new ArrayList<>(s3SourceConfig.getInt(MAX_POLL_RECORDS)); - - if (connectorStopped.get()) { - LOGGER.info("Connector has been stopped. Returning empty result list."); - return results; - } - - while (!connectorStopped.get()) { - try { - extractSourceRecords(results); - LOGGER.info("Number of records extracted and sent: {}", results.size()); - return results; - } catch (SdkException exception) { - if (exception.retryable()) { - LOGGER.warn("Retryable error encountered during polling. Waiting before retrying...", - exception); - pollLock.wait(ERROR_BACKOFF); - - prepareReaderFromOffsetStorageReader(); - } else { - LOGGER.warn("Non-retryable AmazonS3Exception occurred. Stopping polling.", exception); - return null; // NOPMD - } - } catch (DataException exception) { - LOGGER.warn("DataException occurred during polling. No retries will be attempted.", exception); - } catch (final Throwable t) { // NOPMD - LOGGER.error("Unexpected error encountered. 
Closing resources and stopping task.", t); - closeResources(); - throw t; - } - } - return results; - } + public void commit() { + LOGGER.info("Committed all records through last poll()"); } - private List<SourceRecord> extractSourceRecords(final List<SourceRecord> results) throws InterruptedException { - waitForObjects(); - if (connectorStopped.get()) { - return results; + @Override + public void commitRecord(final SourceRecord record) { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("Committed individual record {} committed", (Map<String, Object>) record.sourceOffset()); } - return RecordProcessor.processRecords(sourceRecordIterator, results, s3SourceConfig, connectorStopped, - awsv2SourceClient, offsetManager); } - private void waitForObjects() throws InterruptedException { - while (!sourceRecordIterator.hasNext() && !connectorStopped.get()) { - LOGGER.debug("Blocking until new S3 files are available."); - Thread.sleep(S_3_POLL_INTERVAL_MS); - prepareReaderFromOffsetStorageReader(); - } + /** + * Set the S3 source record iterator that this task is using. Protected to be overridden in testing implementation. + * + * @param iterator + * The S3SourceRecord iterator to use. + */ + protected void setS3SourceRecordIterator(final Iterator<S3SourceRecord> iterator) { + s3SourceRecordIterator = iterator; } @Override - public void stop() { - this.taskInitialized = false; - this.connectorStopped.set(true); - synchronized (pollLock) { - closeResources(); - } - } - - private void closeResources() { + protected void closeResources() { awsv2SourceClient.shutdown(); } // below for visibility in tests + + /** + * Get the transformer that we are using. + * + * @return the transformer that we are using. + */ public Transformer getTransformer() { return transformer; } - - public boolean isTaskInitialized() { - return taskInitialized; - } - - public AtomicBoolean getConnectorStopped() { - return new AtomicBoolean(connectorStopped.get()); - } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java index 44e28dfa7..ed460a500 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java @@ -28,7 +28,9 @@ import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import org.apache.commons.io.function.IOSupplier; -import org.codehaus.plexus.util.StringUtils; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import software.amazon.awssdk.core.ResponseBytes; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.model.GetObjectRequest; @@ -42,6 +44,7 @@ */ public class AWSV2SourceClient { + private static final Logger LOGGER = LoggerFactory.getLogger(AWSV2SourceClient.class); public static final int PAGE_SIZE_FACTOR = 2; private final S3SourceConfig s3SourceConfig; private final S3Client s3Client; @@ -50,6 +53,9 @@ public class AWSV2SourceClient { private Predicate<S3Object> filterPredicate = s3Object -> s3Object.size() > 0; private final Set<String> failedObjectKeys; + private final int taskId; + private final int maxTasks; + /** * @param s3SourceConfig * configuration for Source connector @@ -57,11 +63,7 @@ public class AWSV2SourceClient { * all objectKeys which have already been tried but have been unable to process. 
*/ public AWSV2SourceClient(final S3SourceConfig s3SourceConfig, final Set<String> failedObjectKeys) { - this.s3SourceConfig = s3SourceConfig; - final S3ClientFactory s3ClientFactory = new S3ClientFactory(); - this.s3Client = s3ClientFactory.createAmazonS3Client(s3SourceConfig); - this.bucketName = s3SourceConfig.getAwsS3BucketName(); - this.failedObjectKeys = new HashSet<>(failedObjectKeys); + this(new S3ClientFactory().createAmazonS3Client(s3SourceConfig), s3SourceConfig, failedObjectKeys); } /** @@ -80,42 +82,96 @@ public AWSV2SourceClient(final S3SourceConfig s3SourceConfig, final Set<String> this.s3Client = s3Client; this.bucketName = s3SourceConfig.getAwsS3BucketName(); this.failedObjectKeys = new HashSet<>(failedObjectKeys); + + // TODO the code below should be configured in some sort of taks assignement method/process/call. + int maxTasks; + try { + final Object value = s3SourceConfig.originals().get("tasks.max"); + if (value == null) { + LOGGER.info("Setting tasks.max to 1"); + maxTasks = 1; + } else { + maxTasks = Integer.parseInt(value.toString()); + } + } catch (NumberFormatException e) { // NOPMD catch null pointer + LOGGER.warn("Invalid tasks.max: {}", e.getMessage()); + LOGGER.info("Setting tasks.max to 1"); + maxTasks = 1; + } + this.maxTasks = maxTasks; + int taskId; + try { + final Object value = s3SourceConfig.originals().get("task.id"); + if (value == null) { + LOGGER.info("Setting task.id to 0"); + taskId = 0; + } else { + taskId = Integer.parseInt(value.toString()) % maxTasks; + } + } catch (NumberFormatException e) { // NOPMD catch null pointer + LOGGER.warn("Invalid task.id: {}", e.getMessage()); + LOGGER.info("Setting task.id to 0"); + taskId = 0; + } + this.taskId = taskId; } - public Iterator<String> getListOfObjectKeys(final String startToken) { + /** + * Creates a stream from which we will create an iterator. + * + * @param startToken + * the beginning key, or {@code null} to start at the beginning. + * @return a Stream of S3Objects for the current state of the S3 storage. + */ + private Stream<S3Object> getS3ObjectStream(final String startToken) { final ListObjectsV2Request request = ListObjectsV2Request.builder() .bucket(bucketName) .maxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR) - .prefix(optionalKey(s3SourceConfig.getAwsS3Prefix())) - .startAfter(optionalKey(startToken)) + .prefix(StringUtils.defaultIfBlank(s3SourceConfig.getAwsS3Prefix(), null)) + .startAfter(StringUtils.defaultIfBlank(startToken, null)) .build(); - final Stream<String> s3ObjectKeyStream = Stream - .iterate(s3Client.listObjectsV2(request), Objects::nonNull, response -> { - // This is called every time next() is called on the iterator. - if (response.isTruncated()) { - return s3Client.listObjectsV2(ListObjectsV2Request.builder() - .maxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR) - .continuationToken(response.nextContinuationToken()) - .build()); - } else { - return null; - } - - }) + return Stream.iterate(s3Client.listObjectsV2(request), Objects::nonNull, response -> { + // This is called every time next() is called on the iterator. 
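The tasks.max and task.id values parsed above feed the hash-based object-to-task assignment used later in this class (`Math.floorMod(objectKey.hashCode(), maxTasks) == taskId`). A small sketch of how that sharding behaves, with an illustrative class name and a made-up object key:

```java
// Each object key hashes to exactly one task id, so concurrent tasks sharing a bucket
// never emit records for the same object.
public class TaskAssignmentSketch {
    static boolean assignedToTask(final String objectKey, final int taskId, final int maxTasks) {
        return Math.floorMod(objectKey.hashCode(), maxTasks) == taskId;
    }

    public static void main(final String[] args) {
        final String key = "topic-00001-1700000000000.txt"; // hypothetical object key
        for (int taskId = 0; taskId < 3; taskId++) {
            System.out.printf("task %d handles %s: %s%n", taskId, key, assignedToTask(key, taskId, 3));
        }
    }
}
```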
+ if (response.isTruncated()) { + return s3Client.listObjectsV2(ListObjectsV2Request.builder() + .maxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR) + .continuationToken(response.nextContinuationToken()) + .build()); + } else { + return null; + } + + }) .flatMap(response -> response.contents() .stream() .filter(filterPredicate) .filter(objectSummary -> assignObjectToTask(objectSummary.key())) - .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.key()))) - .map(S3Object::key); - return s3ObjectKeyStream.iterator(); + .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.key()))); } - private String optionalKey(final String key) { - if (StringUtils.isNotBlank(key)) { - return key; - } - return null; + + /** + * Creates an S3Object iterator that will return the objects from the current objects in S3 storage and then try to + * refresh on every {@code hasNext()} that returns false. This should pick up new files as they are dropped on the + * file system. + * + * @param startToken + * the beginning key, or {@code null} to start at the beginning. + * @return an Iterator on the S3Objects. + */ + public Iterator<S3Object> getS3ObjectIterator(final String startToken) { + return new S3ObjectIterator(startToken); + } + + /** + * Gets an iterator of keys from the current S3 storage. + * + * @param startToken + * the beginning key, or {@code null} to start at the beginning. + * @return an Iterator on the keys of the current S3Objects. + */ + public Iterator<String> getListOfObjectKeys(final String startToken) { + return getS3ObjectStream(startToken).map(S3Object::key).iterator(); } public IOSupplier<InputStream> getObject(final String objectKey) { @@ -133,8 +189,6 @@ public void setFilterPredicate(final Predicate<S3Object> predicate) { } private boolean assignObjectToTask(final String objectKey) { - final int maxTasks = Integer.parseInt(s3SourceConfig.originals().get("tasks.max").toString()); - final int taskId = Integer.parseInt(s3SourceConfig.originals().get("task.id").toString()) % maxTasks; final int taskAssignment = Math.floorMod(objectKey.hashCode(), maxTasks); return taskAssignment == taskId; } @@ -143,4 +197,39 @@ public void shutdown() { s3Client.close(); } + /** + * An iterator that reads from + */ + public class S3ObjectIterator implements Iterator<S3Object> { + + /** The current iterator. */ + private Iterator<S3Object> inner; + /** The last object key that was seen. 
*/ + private String lastSeenObjectKey; + + private S3ObjectIterator(final String initialKey) { + lastSeenObjectKey = initialKey; + inner = getS3ObjectStream(lastSeenObjectKey).iterator(); + } + @Override + public boolean hasNext() { + if (!inner.hasNext()) { + inner = getS3ObjectStream(lastSeenObjectKey).iterator(); + } + return inner.hasNext(); + } + + @Override + public S3Object next() { + final S3Object result = inner.next(); + lastSeenObjectKey = result.key(); + return result; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + } + } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java index f401c4e1f..6c60bb8ed 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java @@ -28,7 +28,7 @@ final public class ConnectUtils { private ConnectUtils() { // hidden } - static Map<String, Object> getPartitionMap(final String topicName, final Integer defaultPartitionId, + public static Map<String, Object> getPartitionMap(final String topicName, final Integer defaultPartitionId, final String bucketName) { final Map<String, Object> partitionMap = new HashMap<>(); partitionMap.put(BUCKET, bucketName); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java index 1b52d8d83..95bc4053d 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -23,6 +23,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.Hashtable; import java.util.List; import java.util.Map; import java.util.Set; @@ -81,7 +82,17 @@ public long incrementAndUpdateOffsetMap(final Map<String, Object> partitionMap, return startOffset; } - public String getObjectMapKey(final String currentObjectKey) { + public Map<String, Object> updateAndReturnCurrentOffsets(final Map<String, Object> partitionMap, + final String currentObjectKey, final long offset) { + final Map<String, Object> offsetMap = offsets.compute(partitionMap, (k, v) -> { + final Map<String, Object> map = v == null ? 
new Hashtable<>() : v; + map.put(getObjectMapKey(currentObjectKey), offset); + return map; + }); + return new HashMap<>(offsetMap); + } + + public static String getObjectMapKey(final String currentObjectKey) { return OBJECT_KEY + SEPARATOR + currentObjectKey; } @@ -92,29 +103,6 @@ public long recordsProcessedForObjectKey(final Map<String, Object> partitionMap, return 0L; } - public void createNewOffsetMap(final Map<String, Object> partitionMap, final String objectKey, - final long offsetId) { - final Map<String, Object> offsetMap = getOffsetValueMap(objectKey, offsetId); - offsets.put(partitionMap, offsetMap); - } - - public Map<String, Object> getOffsetValueMap(final String currentObjectKey, final long offsetId) { - final Map<String, Object> offsetMap = new HashMap<>(); - offsetMap.put(getObjectMapKey(currentObjectKey), offsetId); - - return offsetMap; - } - - void updateCurrentOffsets(final Map<String, Object> partitionMap, final Map<String, Object> offsetValueMap) { - if (offsets.containsKey(partitionMap)) { - final Map<String, Object> offsetMap = new HashMap<>(offsets.get(partitionMap)); - offsetMap.putAll(offsetValueMap); - offsets.put(partitionMap, offsetMap); - } else { - offsets.put(partitionMap, offsetValueMap); - } - } - private static Set<Integer> parsePartitions(final S3SourceConfig s3SourceConfig) { final String partitionString = s3SourceConfig.getTargetTopicPartitions(); return Arrays.stream(partitionString.split(",")).map(Integer::parseInt).collect(Collectors.toSet()); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index bdf265338..e945c2565 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -16,10 +16,6 @@ package io.aiven.kafka.connect.s3.source.utils; -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.atomic.AtomicBoolean; - import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.source.SourceRecord; @@ -35,33 +31,13 @@ public final class RecordProcessor { private static final Logger LOGGER = LoggerFactory.getLogger(RecordProcessor.class); private RecordProcessor() { - - } - - public static List<SourceRecord> processRecords(final Iterator<S3SourceRecord> sourceRecordIterator, - final List<SourceRecord> results, final S3SourceConfig s3SourceConfig, final AtomicBoolean connectorStopped, - final AWSV2SourceClient sourceClient, final OffsetManager offsetManager) { - - final int maxPollRecords = s3SourceConfig.getMaxPollRecords(); - - for (int i = 0; sourceRecordIterator.hasNext() && i < maxPollRecords && !connectorStopped.get(); i++) { - final S3SourceRecord s3SourceRecord = sourceRecordIterator.next(); - if (s3SourceRecord != null) { - final SourceRecord sourceRecord = createSourceRecord(s3SourceRecord, s3SourceConfig, sourceClient, - offsetManager); - results.add(sourceRecord); - } - } - - return results; } - static SourceRecord createSourceRecord(final S3SourceRecord s3SourceRecord, final S3SourceConfig s3SourceConfig, - final AWSV2SourceClient sourceClient, final OffsetManager offsetManager) { + public static SourceRecord createSourceRecord(final S3SourceRecord s3SourceRecord, + final S3SourceConfig s3SourceConfig, final AWSV2SourceClient 
sourceClient, + final OffsetManager offsetManager) { try { - offsetManager.updateCurrentOffsets(s3SourceRecord.getPartitionMap(), s3SourceRecord.getOffsetMap()); - s3SourceRecord.setOffsetMap(offsetManager.getOffsets().get(s3SourceRecord.getPartitionMap())); - return s3SourceRecord.getSourceRecord(); + return s3SourceRecord.getSourceRecord(offsetManager); } catch (DataException e) { if (ErrorsTolerance.NONE.equals(s3SourceConfig.getErrorsTolerance())) { throw new ConnectException("Data Exception caught during S3 record to source record transformation", e); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java index c4be50217..05ca02ba4 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java @@ -25,7 +25,7 @@ public class S3SourceRecord { private final Map<String, Object> partitionMap; - private Map<String, Object> offsetMap; + private final long recordNumber; private final String topic; private final Integer topicPartition; private final SchemaAndValue keyData; @@ -34,11 +34,11 @@ public class S3SourceRecord { private final String objectKey; - public S3SourceRecord(final Map<String, Object> partitionMap, final Map<String, Object> offsetMap, - final String topic, final Integer topicPartition, final String objectKey, final SchemaAndValue keyData, + public S3SourceRecord(final Map<String, Object> partitionMap, final long recordNumber, final String topic, + final Integer topicPartition, final String objectKey, final SchemaAndValue keyData, final SchemaAndValue valueData) { this.partitionMap = new HashMap<>(partitionMap); - this.offsetMap = new HashMap<>(offsetMap); + this.recordNumber = recordNumber; this.topic = topic; this.topicPartition = topicPartition; this.keyData = keyData; @@ -50,8 +50,8 @@ public Map<String, Object> getPartitionMap() { return Collections.unmodifiableMap(partitionMap); } - public Map<String, Object> getOffsetMap() { - return Collections.unmodifiableMap(offsetMap); + public long getRecordNumber() { + return recordNumber; } public String getTopic() { @@ -66,12 +66,18 @@ public String getObjectKey() { return objectKey; } - public void setOffsetMap(final Map<String, Object> offsetMap) { - this.offsetMap = new HashMap<>(offsetMap); + public SchemaAndValue getKey() { + return new SchemaAndValue(keyData.schema(), keyData.value()); } - public SourceRecord getSourceRecord() { - return new SourceRecord(getPartitionMap(), getOffsetMap(), topic, partition(), keyData.schema(), - keyData.value(), valueData.schema(), valueData.value()); + public SchemaAndValue getValue() { + return new SchemaAndValue(valueData.schema(), valueData.value()); + } + + public SourceRecord getSourceRecord(final OffsetManager offsetManager) { + final Map<String, Object> offsetMap = offsetManager.updateAndReturnCurrentOffsets(getPartitionMap(), + getObjectKey(), getRecordNumber()); + return new SourceRecord(getPartitionMap(), offsetMap, topic, partition(), keyData.schema(), keyData.value(), + valueData.schema(), valueData.value()); } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 26f3c03cf..bded51d1b 100644 --- 
a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -16,13 +16,11 @@ package io.aiven.kafka.connect.s3.source.utils; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; -import java.util.List; import java.util.Map; +import java.util.function.Function; +import java.util.function.Predicate; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Stream; @@ -33,27 +31,20 @@ import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import org.apache.commons.io.function.IOSupplier; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import software.amazon.awssdk.core.exception.SdkException; +import org.apache.commons.collections4.IteratorUtils; +import software.amazon.awssdk.services.s3.model.S3Object; /** * Iterator that processes S3 files and creates Kafka source records. Supports different output formats (Avro, JSON, * Parquet). */ public final class SourceRecordIterator implements Iterator<S3SourceRecord> { - private static final Logger LOGGER = LoggerFactory.getLogger(SourceRecordIterator.class); public static final String PATTERN_TOPIC_KEY = "topicName"; public static final String PATTERN_PARTITION_KEY = "partitionId"; public static final Pattern FILE_DEFAULT_PATTERN = Pattern.compile("(?<topicName>[^/]+?)-" + "(?<partitionId>\\d{5})-" + "(?<uniqueId>[a-zA-Z0-9]+)" + "\\.(?<fileExtension>[^.]+)$"); // topic-00001.txt public static final long BYTES_TRANSFORMATION_NUM_OF_RECS = 1L; - private String currentObjectKey; - - private Iterator<String> objectListIterator; - private Iterator<S3SourceRecord> recordIterator = Collections.emptyIterator(); private final OffsetManager offsetManager; @@ -63,165 +54,55 @@ public final class SourceRecordIterator implements Iterator<S3SourceRecord> { private final Transformer transformer; // Once we decouple the S3Object from the Source Iterator we can change this to be the SourceApiClient // At which point it will work for al our integrations. - private final AWSV2SourceClient sourceClient; // NOPMD - - public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final OffsetManager offsetManager, - final Transformer transformer, final AWSV2SourceClient sourceClient) { - this.s3SourceConfig = s3SourceConfig; - this.offsetManager = offsetManager; + private final AWSV2SourceClient sourceClient; - this.bucketName = s3SourceConfig.getAwsS3BucketName(); - this.transformer = transformer; - this.sourceClient = sourceClient; - objectListIterator = sourceClient.getListOfObjectKeys(null); - } + private String topic; + private int partitionId; - private void nextS3Object() { - if (!objectListIterator.hasNext()) { - // Start after the object Key we have just finished with. 
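FILE_DEFAULT_PATTERN above is what the new fileNamePredicate relies on to recover the topic and partition from an object key. A quick standalone check of that pattern, with an illustrative class name and example key:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// The pattern pulls the topic and the five-digit partition out of keys shaped like
// "<topic>-<partition>-<uniqueId>.<ext>".
public class FileNamePatternSketch {
    private static final Pattern FILE_DEFAULT_PATTERN = Pattern.compile(
            "(?<topicName>[^/]+?)-(?<partitionId>\\d{5})-(?<uniqueId>[a-zA-Z0-9]+)\\.(?<fileExtension>[^.]+)$");

    public static void main(final String[] args) {
        final Matcher matcher = FILE_DEFAULT_PATTERN.matcher("my-topic-00001-1700000000000.txt");
        if (matcher.find()) {
            System.out.println(matcher.group("topicName"));                      // my-topic
            System.out.println(Integer.parseInt(matcher.group("partitionId"))); // 1
        }
    }
}
```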
- objectListIterator = sourceClient.getListOfObjectKeys(currentObjectKey); - if (!objectListIterator.hasNext()) { - recordIterator = Collections.emptyIterator(); - return; - } - } + private final Iterator<S3Object> inner; - try { - currentObjectKey = objectListIterator.next(); - if (currentObjectKey != null) { - recordIterator = createIteratorForCurrentFile(); - } - } catch (IOException e) { - throw SdkException.create(e.getMessage(), e.getCause()); - } - } + private Iterator<S3SourceRecord> outer; - private Iterator<S3SourceRecord> createIteratorForCurrentFile() throws IOException { + private final Predicate<S3Object> fileNamePredicate = s3Object -> { - final Matcher fileMatcher = FILE_DEFAULT_PATTERN.matcher(currentObjectKey); - String topicName; - int defaultPartitionId; + final Matcher fileMatcher = FILE_DEFAULT_PATTERN.matcher(s3Object.key()); if (fileMatcher.find()) { // TODO move this from the SourceRecordIterator so that we can decouple it from S3 and make it API agnostic + topic = fileMatcher.group(PATTERN_TOPIC_KEY); + partitionId = Integer.parseInt(fileMatcher.group(PATTERN_PARTITION_KEY)); + return true; + } + return false; + }; - final IOSupplier<InputStream> s3Object = sourceClient.getObject(currentObjectKey); - topicName = fileMatcher.group(PATTERN_TOPIC_KEY); - defaultPartitionId = Integer.parseInt(fileMatcher.group(PATTERN_PARTITION_KEY)); - - final long defaultStartOffsetId = 1L; - - final String finalTopic = topicName; - final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(topicName, defaultPartitionId, - bucketName); - - return getObjectIterator(s3Object, finalTopic, defaultPartitionId, defaultStartOffsetId, transformer, - partitionMap); + public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final OffsetManager offsetManager, + final Transformer transformer, final AWSV2SourceClient sourceClient) { + super(); + this.s3SourceConfig = s3SourceConfig; + this.offsetManager = offsetManager; - } else { - LOGGER.error("File naming doesn't match to any topic. {}", currentObjectKey); - return Collections.emptyIterator(); - } - } + this.bucketName = s3SourceConfig.getAwsS3BucketName(); + this.transformer = transformer; + this.sourceClient = sourceClient; - @SuppressWarnings("PMD.CognitiveComplexity") - private Iterator<S3SourceRecord> getObjectIterator(final IOSupplier<InputStream> s3Object, final String topic, - final int topicPartition, final long startOffset, final Transformer transformer, - final Map<String, Object> partitionMap) { - return new Iterator<>() { - private final Iterator<S3SourceRecord> internalIterator = readNext().iterator(); - - private List<S3SourceRecord> readNext() { - - final List<S3SourceRecord> sourceRecords = new ArrayList<>(); - - final long numberOfRecsAlreadyProcessed = offsetManager.recordsProcessedForObjectKey(partitionMap, - currentObjectKey); - - // Optimizing without reading stream again. 
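The rewritten iterator below replaces this per-file bookkeeping with an inner iterator of S3 objects and an outer iterator of records that is refilled lazily in hasNext(). A stripped-down sketch of that inner/outer pattern using plain strings, assuming nothing beyond the JDK; the names are illustrative:

```java
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

// Minimal version of the nested-iterator pattern: each "object" from the inner iterator is
// expanded lazily into an outer iterator of records, so nothing beyond the current object
// is buffered in memory.
public class NestedIteratorSketch implements Iterator<String> {
    private final Iterator<List<String>> inner;
    private Iterator<String> outer = Collections.emptyIterator();

    public NestedIteratorSketch(final Iterator<List<String>> inner) {
        this.inner = inner;
    }

    @Override
    public boolean hasNext() {
        while (!outer.hasNext() && inner.hasNext()) {
            outer = inner.next().iterator();
        }
        return outer.hasNext();
    }

    @Override
    public String next() {
        // callers are expected to check hasNext() first, as forEachRemaining does
        return outer.next();
    }

    public static void main(final String[] args) {
        final Iterator<String> records = new NestedIteratorSketch(Arrays.asList(
                Arrays.asList("r1", "r2"), Collections.<String>emptyList(), Arrays.asList("r3")).iterator());
        records.forEachRemaining(System.out::println); // prints r1, r2, r3
    }
}
```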
- if (checkBytesTransformation(transformer, numberOfRecsAlreadyProcessed)) { - return sourceRecords; - } - - try (Stream<Object> recordStream = transformer.getRecords(s3Object, topic, topicPartition, - s3SourceConfig, numberOfRecsAlreadyProcessed)) { - - final Iterator<Object> recordIterator = recordStream.iterator(); - while (recordIterator.hasNext()) { - final Object record = recordIterator.next(); - - sourceRecords.add(getSourceRecord(topic, topicPartition, offsetManager, startOffset, - partitionMap, transformer.getValueData(record, topic, s3SourceConfig), - transformer.getKeyData(currentObjectKey, topic, s3SourceConfig))); - - // Break if we have reached the max records per poll - if (sourceRecords.size() >= s3SourceConfig.getMaxPollRecords()) { - break; - } - } - } - - return sourceRecords; - } - - // For bytes transformation, read whole file as 1 record - private boolean checkBytesTransformation(final Transformer transformer, - final long numberOfRecsAlreadyProcessed) { - return transformer instanceof ByteArrayTransformer - && numberOfRecsAlreadyProcessed == BYTES_TRANSFORMATION_NUM_OF_RECS; - } - - private S3SourceRecord getSourceRecord(final String topic, final int topicPartition, - final OffsetManager offsetManager, final long startOffset, final Map<String, Object> partitionMap, - final SchemaAndValue valueData, final SchemaAndValue keyData) { - - long currentOffset; - - if (offsetManager.getOffsets().containsKey(partitionMap)) { - LOGGER.info("***** offsetManager.getOffsets() ***** {}", offsetManager.getOffsets()); - currentOffset = offsetManager.incrementAndUpdateOffsetMap(partitionMap, currentObjectKey, - startOffset); - } else { - LOGGER.info("Into else block ..."); - currentOffset = startOffset; - offsetManager.createNewOffsetMap(partitionMap, currentObjectKey, currentOffset); - } - - final Map<String, Object> offsetMap = offsetManager.getOffsetValueMap(currentObjectKey, currentOffset); - - return new S3SourceRecord(partitionMap, offsetMap, topic, topicPartition, currentObjectKey, keyData, - valueData); - } - - @Override - public boolean hasNext() { - return internalIterator.hasNext(); - } - - @Override - public S3SourceRecord next() { - return internalIterator.next(); - } - }; + // call filters out bad file names and extracts topic/partition + inner = IteratorUtils.filteredIterator(sourceClient.getS3ObjectIterator(null), + s3Object -> this.fileNamePredicate.test(s3Object)); + outer = Collections.emptyIterator(); } @Override public boolean hasNext() { - return recordIterator.hasNext() || objectListIterator.hasNext(); + while (!outer.hasNext() && inner.hasNext()) { + outer = convert(inner.next()).iterator(); + } + return outer.hasNext(); } @Override public S3SourceRecord next() { - if (!recordIterator.hasNext()) { - nextS3Object(); - } - - if (!recordIterator.hasNext()) { - // If there are still no records, return null or throw an exception - return null; // Or throw new NoSuchElementException(); - } - - return recordIterator.next(); + return outer.next(); } @Override @@ -229,4 +110,63 @@ public void remove() { throw new UnsupportedOperationException("This iterator is unmodifiable"); } + /** + * Converts the S3Object into stream of S3SourceRecords. + * + * @param s3Object + * the S3Object to read data from. + * @return a stream of S3SourceRecords created from the input stream of the S3Object. 
+ */ + private Stream<S3SourceRecord> convert(final S3Object s3Object) { + + final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(topic, partitionId, bucketName); + final long recordCount = offsetManager.recordsProcessedForObjectKey(partitionMap, s3Object.key()); + + // Optimizing without reading stream again. + if (transformer instanceof ByteArrayTransformer && recordCount > 0) { + return Stream.empty(); + } + + final SchemaAndValue keyData = transformer.getKeyData(s3Object.key(), topic, s3SourceConfig); + + return transformer + .getRecords(sourceClient.getObject(s3Object.key()), topic, partitionId, s3SourceConfig, recordCount) + .map(new Mapper(partitionMap, recordCount, keyData, s3Object.key())); + } + + /** + * maps the data from the @{link Transformer} stream to an S3SourceRecord given all the additional data required. + */ + class Mapper implements Function<SchemaAndValue, S3SourceRecord> { + /** + * The partition map + */ + private final Map<String, Object> partitionMap; + /** + * The record number for the record being created. + */ + private long recordCount; + /** + * The schema and value for the key + */ + private final SchemaAndValue keyData; + /** + * The object key from S3 + */ + private final String objectKey; + + public Mapper(final Map<String, Object> partitionMap, final long recordCount, final SchemaAndValue keyData, + final String objectKey) { + this.partitionMap = partitionMap; + this.recordCount = recordCount; + this.keyData = keyData; + this.objectKey = objectKey; + } + + @Override + public S3SourceRecord apply(final SchemaAndValue valueData) { + recordCount++; + return new S3SourceRecord(partitionMap, recordCount, topic, partitionId, objectKey, keyData, valueData); + } + } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index 13ac66844..944ccbfdf 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -23,10 +23,14 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -import java.lang.reflect.Field; import java.net.URI; import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Random; @@ -37,38 +41,47 @@ import org.apache.kafka.connect.source.SourceTaskContext; import org.apache.kafka.connect.storage.OffsetStorageReader; +import io.aiven.kafka.connect.common.config.SourceConfigFragment; +import io.aiven.kafka.connect.common.source.AbstractSourceTask; import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; import io.aiven.kafka.connect.common.source.input.InputFormat; -import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.config.s3.S3ConfigFragment; import io.aiven.kafka.connect.iam.AwsCredentialProviderFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; +import io.aiven.kafka.connect.s3.source.utils.ConnectUtils; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; -import 
io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; import io.findify.s3mock.S3Mock; +import org.apache.commons.lang3.time.StopWatch; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; -import org.mockito.Mock; -import org.mockito.junit.jupiter.MockitoExtension; import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration; import software.amazon.awssdk.core.retry.RetryMode; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.S3Configuration; -@ExtendWith(MockitoExtension.class) final class S3SourceTaskTest { + /** + * The amount of extra time that we will allow for timing errors. + */ + private static final long TIMING_DELTA = 500; + private static final Random RANDOM = new Random(); private Map<String, String> properties; - private static BucketAccessor testBucketAccessor; private static final String TEST_BUCKET = "test-bucket"; + + private static final String TOPIC = "TOPIC1"; + + private static final int PARTITION = 1; + + private static final String OBJECT_KEY = "object_key"; + // TODO S3Mock has not been maintained in 4 years // Adobe have an alternative we can move to. private static S3Mock s3Api; @@ -76,12 +89,6 @@ final class S3SourceTaskTest { private static Map<String, String> commonProperties; - @Mock - private SourceTaskContext mockedSourceTaskContext; - - @Mock - private OffsetStorageReader mockedOffsetStorageReader; - @BeforeAll public static void setUpClass() throws URISyntaxException { final int s3Port = RANDOM.nextInt(10_000) + 10_000; @@ -107,9 +114,6 @@ public static void setUpClass() throws URISyntaxException { .serviceConfiguration(S3Configuration.builder().pathStyleAccessEnabled(true).build()) .credentialsProvider(credentialFactory.getAwsV2Provider(config.getS3ConfigFragment())) .build(); - - testBucketAccessor = new BucketAccessor(s3Client, TEST_BUCKET); - testBucketAccessor.createBucket(); } @AfterAll @@ -121,8 +125,6 @@ public static void tearDownClass() { public void setUp() { properties = new HashMap<>(commonProperties); s3Client.createBucket(create -> create.bucket(TEST_BUCKET).build()); - mockedSourceTaskContext = mock(SourceTaskContext.class); - mockedOffsetStorageReader = mock(OffsetStorageReader.class); } @AfterEach @@ -135,74 +137,271 @@ void testS3SourceTaskInitialization() { final S3SourceTask s3SourceTask = new S3SourceTask(); startSourceTask(s3SourceTask); - final Transformer transformer = s3SourceTask.getTransformer(); - assertThat(transformer).isInstanceOf(ByteArrayTransformer.class); + assertThat(s3SourceTask.getTransformer()).isInstanceOf(ByteArrayTransformer.class); - final boolean taskInitialized = s3SourceTask.isTaskInitialized(); - assertThat(taskInitialized).isTrue(); + assertThat(s3SourceTask.isRunning()).isTrue(); } @Test - void testPoll() throws Exception { + void testStop() { final S3SourceTask s3SourceTask = new S3SourceTask(); startSourceTask(s3SourceTask); + s3SourceTask.stop(); + + assertThat(s3SourceTask.isRunning()).isFalse(); + } - SourceRecordIterator mockSourceRecordIterator; + private static S3SourceRecord createS3SourceRecord(final String topicName, final Integer defaultPartitionId, + final String bucketName, final String objectKey, final byte[] key, final byte[] value) { + return new S3SourceRecord(ConnectUtils.getPartitionMap(topicName, defaultPartitionId, bucketName), 
0L, + topicName, defaultPartitionId, objectKey, new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, key), + new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, value)); + } + + private void startSourceTask(final S3SourceTask s3SourceTask) { + final SourceTaskContext mockedSourceTaskContext = mock(SourceTaskContext.class); + final OffsetStorageReader mockedOffsetStorageReader = mock(OffsetStorageReader.class); + when(mockedSourceTaskContext.offsetStorageReader()).thenReturn(mockedOffsetStorageReader); + s3SourceTask.initialize(mockedSourceTaskContext); - mockSourceRecordIterator = mock(SourceRecordIterator.class); - setPrivateField(s3SourceTask, "sourceRecordIterator", mockSourceRecordIterator); - when(mockSourceRecordIterator.hasNext()).thenReturn(true).thenReturn(true).thenReturn(false); + setBasicProperties(); + s3SourceTask.start(properties); + } - final S3SourceRecord s3SourceRecordList = getAivenS3SourceRecord(); - when(mockSourceRecordIterator.next()).thenReturn(s3SourceRecordList); + private void setBasicProperties() { + properties.putIfAbsent(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); + properties.putIfAbsent("name", "test_source_connector"); + properties.putIfAbsent("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + properties.putIfAbsent("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + properties.putIfAbsent("tasks.max", "1"); + properties.putIfAbsent("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); + properties.putIfAbsent(TARGET_TOPIC_PARTITIONS, "0,1"); + properties.putIfAbsent(TARGET_TOPICS, "testtopic"); - final List<SourceRecord> sourceRecordList = s3SourceTask.poll(); - assertThat(sourceRecordList).isNotEmpty(); } @Test - void testStop() { - final S3SourceTask s3SourceTask = new S3SourceTask(); + void testPollWithNoDataReturned() { + final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); + when(s3SourceConfig.getMaxPollRecords()).thenReturn(5); + final Iterator<S3SourceRecord> sourceRecordIterator = Collections.emptyIterator(); + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + startSourceTask(s3SourceTask); - s3SourceTask.stop(); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + final List<SourceRecord> results = s3SourceTask.poll(); + stopWatch.stop(); + assertThat(results).isNull(); + assertThat(stopWatch.getTime()).isLessThan(AbstractSourceTask.MAX_POLL_TIME.toMillis() + TIMING_DELTA); + } + + private void assertEquals(final S3SourceRecord s3Record, final SourceRecord sourceRecord) { + assertThat(sourceRecord).isNotNull(); + assertThat(sourceRecord.sourcePartition()).isEqualTo(s3Record.getPartitionMap()); + final Map<String, Object> map = (Map<String, Object>) sourceRecord.sourceOffset(); + + assertThat(map.get(OffsetManager.getObjectMapKey(s3Record.getObjectKey()))) + .isEqualTo(s3Record.getRecordNumber()); + assertThat(sourceRecord.key()).isEqualTo(s3Record.getKey().value()); + assertThat(sourceRecord.value()).isEqualTo(s3Record.getValue().value()); + } + + @Test + void testPollsWithRecords() { + final List<S3SourceRecord> lst = createS3SourceRecords(2); + final Iterator<S3SourceRecord> sourceRecordIterator = lst.iterator(); + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); - final boolean taskInitialized = s3SourceTask.isTaskInitialized(); - assertThat(taskInitialized).isFalse(); - assertThat(s3SourceTask.getConnectorStopped()).isTrue(); + startSourceTask(s3SourceTask); + final StopWatch 
stopWatch = new StopWatch(); + stopWatch.start(); + final List<SourceRecord> results = s3SourceTask.poll(); + stopWatch.stop(); + + assertThat(results).hasSize(2); + assertEquals(lst.get(0), results.get(0)); + assertEquals(lst.get(1), results.get(1)); + assertThat(stopWatch.getTime()).isLessThan(AbstractSourceTask.MAX_POLL_TIME.toMillis()); } - private static S3SourceRecord getAivenS3SourceRecord() { - return new S3SourceRecord(new HashMap<>(), new HashMap<>(), "testtopic", 0, "", - new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, new byte[0]), - new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, new byte[0])); + private List<S3SourceRecord> createS3SourceRecords(final int count) { + final List<S3SourceRecord> lst = new ArrayList<>(); + if (count > 0) { + lst.add(createS3SourceRecord(TOPIC, PARTITION, TEST_BUCKET, OBJECT_KEY, + "Hello".getBytes(StandardCharsets.UTF_8), "Hello World".getBytes(StandardCharsets.UTF_8))); + for (int i = 1; i < count; i++) { + lst.add(createS3SourceRecord(TOPIC, PARTITION, TEST_BUCKET, OBJECT_KEY + i, + "Goodbye".getBytes(StandardCharsets.UTF_8), + String.format("Goodbye cruel World (%s)", i).getBytes(StandardCharsets.UTF_8))); + } + } + return lst; } - @SuppressWarnings("PMD.AvoidAccessibilityAlteration") - private void setPrivateField(final Object object, final String fieldName, final Object value) - throws NoSuchFieldException, IllegalAccessException { - Field field; - field = object.getClass().getDeclaredField(fieldName); - field.setAccessible(true); - field.set(object, value); + @Test + void testPollWithInterruptedIterator() { + final List<S3SourceRecord> lst = createS3SourceRecords(3); + + final Iterator<S3SourceRecord> inner1 = lst.subList(0, 2).iterator(); + final Iterator<S3SourceRecord> inner2 = lst.subList(2, 3).iterator(); + final Iterator<S3SourceRecord> sourceRecordIterator = new Iterator<>() { + Iterator<S3SourceRecord> inner = inner1; + @Override + public boolean hasNext() { + if (inner == null) { + inner = inner2; + return false; + } + return inner.hasNext(); + } + + @Override + public S3SourceRecord next() { + final S3SourceRecord result = inner.next(); + if (!inner.hasNext()) { + inner = null; // NOPMD null assignment + } + return result; + } + }; + + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + startSourceTask(s3SourceTask); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + List<SourceRecord> results = s3SourceTask.poll(); + stopWatch.stop(); + + assertThat(results).hasSize(2); + assertEquals(lst.get(0), results.get(0)); + assertEquals(lst.get(1), results.get(1)); + + results = s3SourceTask.poll(); + assertThat(results).hasSize(1); + + assertThat(stopWatch.getTime()).isLessThan(AbstractSourceTask.MAX_POLL_TIME.toMillis()); + } - private void startSourceTask(final S3SourceTask s3SourceTask) { - s3SourceTask.initialize(mockedSourceTaskContext); - when(mockedSourceTaskContext.offsetStorageReader()).thenReturn(mockedOffsetStorageReader); + @Test + void testPollWithSlowProducer() { + final List<S3SourceRecord> lst = createS3SourceRecords(3); + + final Iterator<S3SourceRecord> sourceRecordIterator = new Iterator<>() { + final Iterator<S3SourceRecord> inner = lst.iterator(); + @Override + public boolean hasNext() { + return inner.hasNext(); + } + + @Override + public S3SourceRecord next() { + try { + Thread.sleep(Duration.ofSeconds(6).toMillis()); + } catch (InterruptedException e) { + // do nothing. 
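The timing assertions above revolve around poll() handing back whatever it has collected once AbstractSourceTask.MAX_POLL_TIME (plus a small delta) has elapsed. The sketch below is a simplified stand-in for that behaviour, not the actual AbstractSourceTask implementation; TimeBudgetedPoll and its poll method are illustrative names introduced here only to show the record-count and time-budget cut-offs the tests exercise.

import java.time.Duration;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

final class TimeBudgetedPoll {

    // Drain up to maxRecords items, stopping early once the time budget is spent.
    // Returning null for an empty batch mirrors the SourceTask.poll() convention the tests assert on.
    static <T> List<T> poll(final Iterator<T> records, final int maxRecords, final Duration budget) {
        final long deadlineNanos = System.nanoTime() + budget.toNanos();
        final List<T> batch = new ArrayList<>();
        while (batch.size() < maxRecords && System.nanoTime() < deadlineNanos && records.hasNext()) {
            batch.add(records.next());
        }
        return batch.isEmpty() ? null : batch;
    }
}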
+ } + return inner.next(); + } + }; + + final List<SourceRecord> results = new ArrayList<>(); + // since the polling is returning data at or near the time limit the 3 record may be returned as follows + // Record 1 may be returned in Poll1 or Poll2 + // Record 2 may be returned in Poll2 or Poll2 + // Record 3 may be returned in Poll3 or Poll4 + + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + startSourceTask(s3SourceTask); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + // poll 1 + List<SourceRecord> pollResult = s3SourceTask.poll(); + stopWatch.stop(); + if (pollResult != null) { + results.addAll(pollResult); + } + assertThat(results).hasSizeLessThanOrEqualTo(1); + // poll 2 + stopWatch.reset(); + stopWatch.start(); + pollResult = s3SourceTask.poll(); + stopWatch.stop(); + if (pollResult != null) { + results.addAll(pollResult); + } + assertThat(results).hasSizeLessThanOrEqualTo(2); + // poll 3 + stopWatch.reset(); + stopWatch.start(); + pollResult = s3SourceTask.poll(); + stopWatch.stop(); + if (pollResult != null) { + results.addAll(pollResult); + } + assertThat(results).hasSizeLessThanOrEqualTo(3); + // poll 4 + stopWatch.reset(); + stopWatch.start(); + pollResult = s3SourceTask.poll(); + stopWatch.stop(); + if (results.size() == lst.size()) { + assertThat(pollResult).isNull(); + } else { + results.addAll(pollResult); + } + assertThat(results).hasSize(3); + } - setBasicProperties(); - s3SourceTask.start(properties); + @Test + void testPollsWithExcessRecords() { + // test that multiple polls to get all records succeeds. + properties.put(SourceConfigFragment.MAX_POLL_RECORDS, "2"); + + final List<S3SourceRecord> lst = createS3SourceRecords(3); + + final Iterator<S3SourceRecord> sourceRecordIterator = lst.iterator(); + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + + startSourceTask(s3SourceTask); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + List<SourceRecord> results = s3SourceTask.poll(); + assertThat(results).hasSize(2); + results = s3SourceTask.poll(); + assertThat(results).hasSize(1); + stopWatch.stop(); + assertThat(stopWatch.getTime()).isLessThan(AbstractSourceTask.MAX_POLL_TIME.toMillis() * 2); } - private void setBasicProperties() { - properties.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); - properties.put("name", "test_source_connector"); - properties.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); - properties.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); - properties.put("tasks.max", "1"); - properties.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); - properties.put(TARGET_TOPIC_PARTITIONS, "0,1"); - properties.put(TARGET_TOPICS, "testtopic"); + @Test + void testPollWhenConnectorStopped() { + final List<S3SourceRecord> lst = createS3SourceRecords(3); + final Iterator<S3SourceRecord> sourceRecordIterator = lst.iterator(); + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + + startSourceTask(s3SourceTask); + s3SourceTask.stop(); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + final List<SourceRecord> results = s3SourceTask.poll(); + stopWatch.stop(); + assertThat(results).isNull(); + assertThat(stopWatch.getTime()).isLessThan(TIMING_DELTA); + + } + + private static class TestingS3SourceTask extends S3SourceTask { // NOPMD not a test class + + TestingS3SourceTask(final Iterator<S3SourceRecord> realIterator) 
{ + super(); + super.setS3SourceRecordIterator(realIterator); + } + @Override + protected void setS3SourceRecordIterator(final Iterator<S3SourceRecord> iterator) { + // do nothing. + } } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java index e02135d18..cc9db65cd 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -17,19 +17,17 @@ package io.aiven.kafka.connect.s3.source.utils; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; import static org.mockito.internal.verification.VerificationModeFactory.times; -import java.net.ConnectException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Supplier; +import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.DataException; import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.storage.Converter; @@ -38,7 +36,6 @@ import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; @@ -61,75 +58,46 @@ class RecordProcessorTest { @Mock private AWSV2SourceClient sourceClient; - private AtomicBoolean connectorStopped; - private Iterator<S3SourceRecord> sourceRecordIterator; - - @BeforeEach - void setUp() { - connectorStopped = new AtomicBoolean(false); - sourceRecordIterator = mock(Iterator.class); - } + private static final Supplier<Boolean> TRUE = () -> true; + private static final Supplier<Boolean> FALSE = () -> false; @Test - void testProcessRecordsNoRecords() { - when(s3SourceConfig.getMaxPollRecords()).thenReturn(5); - when(sourceRecordIterator.hasNext()).thenReturn(false); - - final List<SourceRecord> results = new ArrayList<>(); - final List<SourceRecord> processedRecords = RecordProcessor.processRecords( - sourceRecordIterator, - results, - s3SourceConfig, - connectorStopped, - sourceClient, offsetManager - ); - - assertThat(processedRecords).as("Processed records should be empty when there are no records.").isEmpty(); + void testCreateSourceRecord() { + + final SourceRecord mockSourceRecord = mock(SourceRecord.class); + final S3SourceRecord mockRecord = mock(S3SourceRecord.class); + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenReturn(mockSourceRecord); + + final SourceRecord result = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, + offsetManager); + + verify(mockRecord, times(1)).getSourceRecord(any()); + assertThat(result).isEqualTo(mockSourceRecord); + } @Test - void testProcessRecordsWithRecords() throws ConnectException { - when(s3SourceConfig.getMaxPollRecords()).thenReturn(5); - 
when(sourceRecordIterator.hasNext()).thenReturn(true, false); // One iteration with records + void testCreateSourceRecordWithDataError() { final S3SourceRecord mockRecord = mock(S3SourceRecord.class); - when(sourceRecordIterator.next()).thenReturn(mockRecord); - - final List<SourceRecord> results = new ArrayList<>(); - RecordProcessor.processRecords( - sourceRecordIterator, - results, - s3SourceConfig, - connectorStopped, - sourceClient, offsetManager - ); - - assertThat(results).hasSize(1); - verify(sourceRecordIterator, times(1)).next(); - } + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenThrow(new DataException("Testing exception")); - @Test - void testProcessRecordsConnectorStopped() { - when(s3SourceConfig.getMaxPollRecords()).thenReturn(5); - connectorStopped.set(true); // Simulate connector stopped - - final List<SourceRecord> results = new ArrayList<>(); - final List<SourceRecord> processedRecords = RecordProcessor.processRecords( - sourceRecordIterator, - results, - s3SourceConfig, - connectorStopped, - sourceClient, offsetManager - ); - - assertThat(processedRecords).as("Processed records should be empty when connector is stopped.").isEmpty(); - verify(sourceRecordIterator, never()).next(); + when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.NONE); + + assertThatExceptionOfType(ConnectException.class).as("Errors tolerance: NONE") + .isThrownBy(() -> RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, + offsetManager)); + + when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.ALL); + final SourceRecord result = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, + offsetManager); + assertThat(result).isNull(); } @Test void testCreateSourceRecords() { final S3SourceRecord mockRecord = mock(S3SourceRecord.class); - when(mockRecord.getSourceRecord()).thenReturn(mock(SourceRecord.class)); + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenReturn(mock(SourceRecord.class)); final SourceRecord sourceRecords = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, offsetManager); @@ -140,13 +108,13 @@ void testCreateSourceRecords() { @Test void errorToleranceOnNONE() { final S3SourceRecord mockRecord = mock(S3SourceRecord.class); - when(mockRecord.getSourceRecord()).thenThrow(new DataException("generic issue")); + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenThrow(new DataException("generic issue")); when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.NONE); assertThatThrownBy( () -> RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, offsetManager)) - .isInstanceOf(org.apache.kafka.connect.errors.ConnectException.class) + .isInstanceOf(ConnectException.class) .hasMessage("Data Exception caught during S3 record to source record transformation"); } @@ -154,7 +122,7 @@ void errorToleranceOnNONE() { @Test void errorToleranceOnALL() { final S3SourceRecord mockRecord = mock(S3SourceRecord.class); - when(mockRecord.getSourceRecord()).thenThrow(new DataException("generic issue")); + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenThrow(new DataException("generic issue")); when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.ALL); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index 
b701ea85d..af9b679fa 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -31,16 +31,24 @@ import java.io.ByteArrayInputStream; import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; import java.util.Collections; +import java.util.Iterator; import java.util.stream.Stream; +import org.apache.kafka.connect.data.SchemaAndValue; + import io.aiven.kafka.connect.common.source.input.AvroTransformer; import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; +import io.aiven.kafka.connect.common.source.input.InputFormat; import io.aiven.kafka.connect.common.source.input.Transformer; +import io.aiven.kafka.connect.common.source.input.TransformerFactory; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import software.amazon.awssdk.services.s3.model.S3Object; final class SourceRecordIteratorTest { @@ -67,25 +75,26 @@ void testIteratorProcessesS3Objects() throws Exception { try (InputStream mockInputStream = new ByteArrayInputStream(new byte[] {})) { when(mockSourceApiClient.getObject(anyString())).thenReturn(() -> mockInputStream); - when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) - .thenReturn(Stream.of(new Object())); + mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); - when(mockSourceApiClient.getListOfObjectKeys(any())).thenReturn(Collections.emptyIterator()); - SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, + when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Collections.emptyIterator()); + Iterator<S3SourceRecord> iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, mockSourceApiClient); assertThat(iterator.hasNext()).isFalse(); - assertThat(iterator.next()).isNull(); - when(mockSourceApiClient.getListOfObjectKeys(any())) - .thenReturn(Collections.singletonList(key).listIterator()); + final S3Object obj = S3Object.builder().key(key).build(); + final ByteArrayInputStream bais = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8)); + when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(obj).iterator()); + when(mockSourceApiClient.getObject(any())).thenReturn(() -> bais); iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, mockSourceApiClient); - assertThat(iterator.hasNext()).isTrue(); + assertThat(iterator).hasNext(); assertThat(iterator.next()).isNotNull(); + assertThat(iterator).isExhausted(); } } @@ -93,15 +102,17 @@ void testIteratorProcessesS3Objects() throws Exception { void testIteratorProcessesS3ObjectsForByteArrayTransformer() throws Exception { final String key = "topic-00001-abc123.txt"; + final S3Object s3Object = S3Object.builder().key(key).build(); - // Mock InputStream - try (InputStream mockInputStream = new ByteArrayInputStream(new byte[] {})) { - when(mockSourceApiClient.getObject(anyString())).thenReturn(() -> mockInputStream); + // With ByteArrayTransformer + try (InputStream inputStream = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8))) { + when(mockSourceApiClient.getObject(key)).thenReturn(() -> inputStream); + + 
when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(s3Object).iterator()); - // With ByteArrayTransformer mockTransformer = mock(ByteArrayTransformer.class); when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) - .thenReturn(Stream.of(new Object())); + .thenReturn(Stream.of(SchemaAndValue.NULL)); when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); @@ -110,24 +121,36 @@ void testIteratorProcessesS3ObjectsForByteArrayTransformer() throws Exception { when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); - SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, - mockSourceApiClient); - assertThat(iterator.hasNext()).isTrue(); - iterator.next(); + // should skip if any records were produced by source record iterator. + final Iterator<S3SourceRecord> iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, mockSourceApiClient); + assertThat(iterator.hasNext()).isFalse(); + verify(mockSourceApiClient, never()).getObject(any()); verify(mockTransformer, never()).getRecords(any(), anyString(), anyInt(), any(), anyLong()); + } - // With AvroTransformer + // With AvroTransformer + try (InputStream inputStream = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8))) { + when(mockSourceApiClient.getObject(key)).thenReturn(() -> inputStream); + when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(s3Object).iterator()); mockTransformer = mock(AvroTransformer.class); when(mockSourceApiClient.getListOfObjectKeys(any())) .thenReturn(Collections.singletonList(key).listIterator()); + when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); - iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, mockSourceApiClient); + when(mockTransformer.getKeyData(anyString(), anyString(), any())).thenReturn(SchemaAndValue.NULL); + when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) + .thenReturn(Arrays.asList(SchemaAndValue.NULL).stream()); + + final Iterator<S3SourceRecord> iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, mockSourceApiClient); assertThat(iterator.hasNext()).isTrue(); iterator.next(); verify(mockTransformer, times(1)).getRecords(any(), anyString(), anyInt(), any(), anyLong()); } } + } diff --git a/settings.gradle.kts b/settings.gradle.kts index 21aca87b9..a4451cb5e 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -6,6 +6,7 @@ val avroConverterVersion by extra("7.2.2") val avroDataVersion by extra("7.2.2") val awaitilityVersion by extra("4.2.1") val commonsTextVersion by extra("1.11.0") +val commonsCollections4Version by extra("4.4") val hadoopVersion by extra("3.4.0") val hamcrestVersion by extra("2.2") val jacksonVersion by extra("2.15.3") @@ -30,6 +31,9 @@ dependencyResolutionManagement { create("apache") { library("avro", "org.apache.avro:avro:$avroVersion") library("commons-text", "org.apache.commons:commons-text:$commonsTextVersion") + library( + "commons-collection4", + "org.apache.commons:commons-collections4:$commonsCollections4Version") library("kafka-connect-api", "org.apache.kafka:connect-api:$kafkaVersion") library("kafka-connect-json", "org.apache.kafka:connect-json:$kafkaVersion") library("kafka-connect-runtime", 
"org.apache.kafka:connect-runtime:$kafkaVersion") From 6b967d378e093db17b14d6546589263c4c17a4e0 Mon Sep 17 00:00:00 2001 From: Murali Basani <muralidhar.basani@aiven.io> Date: Tue, 14 Jan 2025 13:17:44 +0100 Subject: [PATCH 87/90] Tasks assignment strategy - commons integration - [KCON-63] (#384) [KCON-63] - Integrate Task assignment strategies of common module into s3 release feature branch - Delete hard coding of file pattern from s3 iterator class - Update existing tests - Added new integration tests to verify other strategy use cases --- .../common/config/FileNameFragment.java | 20 +- .../common/config/SourceCommonConfig.java | 5 + .../common/config/SourceConfigFragment.java | 32 +- .../common/source/input/JsonTransformer.java | 2 +- .../source/input/utils/FilePatternUtils.java | 88 ++++++ .../source/task/DistributionStrategy.java | 51 +++ ...egy.java => HashDistributionStrategy.java} | 20 +- .../task/ObjectDistributionStrategy.java | 91 ------ .../task/PartitionDistributionStrategy.java | 84 +++++ ...rtitionInFilenameDistributionStrategy.java | 112 ------- .../PartitionInPathDistributionStrategy.java | 105 ------ .../enums/ObjectDistributionStrategy.java | 48 +++ ...java => HashDistributionStrategyTest.java} | 19 +- .../PartitionDistributionStrategyTest.java | 299 ++++++++++++++++++ ...ionInFilenameDistributionStrategyTest.java | 161 ---------- ...rtitionInPathDistributionStrategyTest.java | 168 ---------- .../connect/s3/source/AwsIntegrationTest.java | 62 ++-- .../connect/s3/source/IntegrationBase.java | 5 +- .../connect/s3/source/IntegrationTest.java | 133 +++++--- .../kafka/connect/s3/source/S3SourceTask.java | 38 ++- .../s3/source/config/S3SourceConfig.java | 6 + .../s3/source/utils/AWSV2SourceClient.java | 77 +---- .../s3/source/utils/RecordProcessor.java | 1 - .../s3/source/utils/SourceRecordIterator.java | 57 ++-- .../connect/s3/source/S3SourceTaskTest.java | 1 + .../source/utils/AWSV2SourceClientTest.java | 78 ++--- .../utils/SourceRecordIteratorTest.java | 89 +++++- 27 files changed, 951 insertions(+), 901 deletions(-) create mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtils.java create mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionStrategy.java rename commons/src/main/java/io/aiven/kafka/connect/common/source/task/{HashObjectDistributionStrategy.java => HashDistributionStrategy.java} (75%) delete mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/task/ObjectDistributionStrategy.java create mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategy.java delete mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategy.java delete mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategy.java create mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/task/enums/ObjectDistributionStrategy.java rename commons/src/test/java/io/aiven/kafka/connect/common/source/task/{HashObjectDistributionStrategyTest.java => HashDistributionStrategyTest.java} (82%) create mode 100644 commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategyTest.java delete mode 100644 commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategyTest.java delete mode 100644 
commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategyTest.java diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/FileNameFragment.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/FileNameFragment.java index 8d3156e22..467ea2cb2 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/FileNameFragment.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/FileNameFragment.java @@ -43,9 +43,12 @@ public final class FileNameFragment extends ConfigFragment { static final String FILE_MAX_RECORDS = "file.max.records"; static final String FILE_NAME_TIMESTAMP_TIMEZONE = "file.name.timestamp.timezone"; static final String FILE_NAME_TIMESTAMP_SOURCE = "file.name.timestamp.source"; - static final String FILE_NAME_TEMPLATE_CONFIG = "file.name.template"; + public static final String FILE_NAME_TEMPLATE_CONFIG = "file.name.template"; static final String DEFAULT_FILENAME_TEMPLATE = "{{topic}}-{{partition}}-{{start_offset}}"; + public static final String FILE_PATH_PREFIX_TEMPLATE_CONFIG = "file.prefix.template"; + static final String DEFAULT_FILE_PATH_PREFIX_TEMPLATE = "topics/{{topic}}/partition={{partition}}/"; + public FileNameFragment(final AbstractConfig cfg) { super(cfg); } @@ -109,9 +112,18 @@ public void ensureValid(final String name, final Object value) { configDef.define(FILE_NAME_TIMESTAMP_SOURCE, ConfigDef.Type.STRING, TimestampSource.Type.WALLCLOCK.name(), new TimestampSourceValidator(), ConfigDef.Importance.LOW, "Specifies the the timestamp variable source. Default is wall-clock.", GROUP_FILE, fileGroupCounter++, // NOPMD - // UnusedAssignment ConfigDef.Width.SHORT, FILE_NAME_TIMESTAMP_SOURCE); + configDef.define(FILE_PATH_PREFIX_TEMPLATE_CONFIG, ConfigDef.Type.STRING, DEFAULT_FILE_PATH_PREFIX_TEMPLATE, + new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, + "The template for file prefix on S3. " + + "Supports `{{ variable }}` placeholders for substituting variables. " + + "Currently supported variables are `topic` and `partition` " + + "and are mandatory to have these in the directory structure." 
+ + "Example prefix : topics/{{topic}}/partition/{{partition}}/", + GROUP_FILE, fileGroupCounter++, // NOPMD UnusedAssignment + ConfigDef.Width.LONG, FILE_PATH_PREFIX_TEMPLATE_CONFIG); + return configDef; } @@ -185,4 +197,8 @@ public int getMaxRecordsPerFile() { return cfg.getInt(FILE_MAX_RECORDS); } + public String getFilePathPrefixTemplateConfig() { + return cfg.getString(FILE_PATH_PREFIX_TEMPLATE_CONFIG); + } + } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java index 954c9151d..2c9cafe61 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java @@ -24,6 +24,7 @@ import io.aiven.kafka.connect.common.source.input.InputFormat; import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.common.source.input.TransformerFactory; +import io.aiven.kafka.connect.common.source.task.enums.ObjectDistributionStrategy; public class SourceCommonConfig extends CommonConfig { @@ -69,6 +70,10 @@ public ErrorsTolerance getErrorsTolerance() { return sourceConfigFragment.getErrorsTolerance(); } + public ObjectDistributionStrategy getObjectDistributionStrategy() { + return sourceConfigFragment.getObjectDistributionStrategy(); + } + public int getMaxPollRecords() { return sourceConfigFragment.getMaxPollRecords(); } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java index 58befa60e..f3955a7e3 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java @@ -16,12 +16,16 @@ package io.aiven.kafka.connect.common.config; +import static io.aiven.kafka.connect.common.source.task.enums.ObjectDistributionStrategy.OBJECT_HASH; +import static io.aiven.kafka.connect.common.source.task.enums.ObjectDistributionStrategy.PARTITION_IN_FILENAME; + import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; +import io.aiven.kafka.connect.common.source.task.enums.ObjectDistributionStrategy; -import org.codehaus.plexus.util.StringUtils; +import org.apache.commons.lang3.StringUtils; public final class SourceConfigFragment extends ConfigFragment { private static final String GROUP_OTHER = "OTHER_CFG"; @@ -32,6 +36,8 @@ public final class SourceConfigFragment extends ConfigFragment { public static final String TARGET_TOPICS = "topics"; public static final String ERRORS_TOLERANCE = "errors.tolerance"; + public static final String OBJECT_DISTRIBUTION_STRATEGY = "object.distribution.strategy"; + /** * Construct the ConfigFragment.. 
* @@ -67,7 +73,14 @@ public static ConfigDef update(final ConfigDef configDef) { ConfigDef.Width.NONE, TARGET_TOPIC_PARTITIONS); configDef.define(TARGET_TOPICS, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "eg : connect-storage-offsets", GROUP_OFFSET_TOPIC, - offsetStorageGroupCounter++, ConfigDef.Width.NONE, TARGET_TOPICS); // NOPMD + offsetStorageGroupCounter++, ConfigDef.Width.NONE, TARGET_TOPICS); + configDef.define(OBJECT_DISTRIBUTION_STRATEGY, ConfigDef.Type.STRING, OBJECT_HASH.name(), + new ObjectDistributionStrategyValidator(), ConfigDef.Importance.MEDIUM, + "Based on tasks.max config and this strategy, objects are processed in distributed" + + " way by Kafka connect workers, supported values : " + OBJECT_HASH + ", " + + PARTITION_IN_FILENAME, + GROUP_OTHER, offsetStorageGroupCounter++, ConfigDef.Width.NONE, OBJECT_DISTRIBUTION_STRATEGY); // NOPMD + // UnusedAssignment return configDef; } @@ -92,6 +105,10 @@ public ErrorsTolerance getErrorsTolerance() { return ErrorsTolerance.forName(cfg.getString(ERRORS_TOLERANCE)); } + public ObjectDistributionStrategy getObjectDistributionStrategy() { + return ObjectDistributionStrategy.forName(cfg.getString(OBJECT_DISTRIBUTION_STRATEGY)); + } + private static class ErrorsToleranceValidator implements ConfigDef.Validator { @Override public void ensureValid(final String name, final Object value) { @@ -103,4 +120,15 @@ public void ensureValid(final String name, final Object value) { } } + private static class ObjectDistributionStrategyValidator implements ConfigDef.Validator { + @Override + public void ensureValid(final String name, final Object value) { + final String objectDistributionStrategy = (String) value; + if (StringUtils.isNotBlank(objectDistributionStrategy)) { + // This will throw an Exception if not a valid value. + ObjectDistributionStrategy.forName(objectDistributionStrategy); + } + } + } + } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java index c6aea0e82..8069d08c1 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java @@ -30,7 +30,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.io.function.IOSupplier; -import org.codehaus.plexus.util.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtils.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtils.java new file mode 100644 index 000000000..546c0c4c4 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtils.java @@ -0,0 +1,88 @@ +/* + * Copyright 2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
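Taken together, the FileNameFragment and SourceConfigFragment changes above introduce two new user-facing settings. The hypothetical sketch below collects them as connector properties using the key names and default values defined above; the exact accepted spelling of the strategy value is resolved by ObjectDistributionStrategy.forName, which is not part of this hunk, and the class name SourceConnectorPropsExample is illustrative only.

import java.util.HashMap;
import java.util.Map;

final class SourceConnectorPropsExample {
    static Map<String, String> exampleProperties() {
        final Map<String, String> props = new HashMap<>();
        // New option from SourceConfigFragment: OBJECT_HASH is the documented default,
        // PARTITION_IN_FILENAME the documented alternative.
        props.put("object.distribution.strategy", "OBJECT_HASH");
        // New option from FileNameFragment, shown with its documented default value.
        props.put("file.prefix.template", "topics/{{topic}}/partition={{partition}}/");
        // Existing filename template, made public above, presumably so the source side can
        // feed it to FilePatternUtils for topic/partition extraction.
        props.put("file.name.template", "{{topic}}-{{partition}}-{{start_offset}}");
        props.put("tasks.max", "4");
        return props;
    }
}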
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.input.utils; + +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.kafka.common.config.ConfigException; + +import org.apache.commons.lang3.StringUtils; + +public final class FilePatternUtils { + + public static final String PATTERN_PARTITION_KEY = "partition"; + public static final String PATTERN_TOPIC_KEY = "topic"; + public static final String START_OFFSET_PATTERN = "{{start_offset}}"; + public static final String TIMESTAMP_PATTERN = "{{timestamp}}"; + public static final String PARTITION_PATTERN = "{{" + PATTERN_PARTITION_KEY + "}}"; + public static final String TOPIC_PATTERN = "{{" + PATTERN_TOPIC_KEY + "}}"; + + // Use a named group to return the partition in a complex string to always get the correct information for the + // partition number. + public static final String PARTITION_NAMED_GROUP_REGEX_PATTERN = "(?<" + PATTERN_PARTITION_KEY + ">\\d+)"; + public static final String NUMBER_REGEX_PATTERN = "(?:\\d+)"; + public static final String TOPIC_NAMED_GROUP_REGEX_PATTERN = "(?<" + PATTERN_TOPIC_KEY + ">[a-zA-Z0-9\\-_.]+)"; + + private FilePatternUtils() { + // hidden + } + public static Pattern configurePattern(final String expectedSourceNameFormat) { + if (expectedSourceNameFormat == null || !expectedSourceNameFormat.contains(PARTITION_PATTERN)) { + throw new ConfigException(String.format( + "Source name format %s missing partition pattern {{partition}} please configure the expected source to include the partition pattern.", + expectedSourceNameFormat)); + } + // Build REGEX Matcher + String regexString = StringUtils.replace(expectedSourceNameFormat, START_OFFSET_PATTERN, NUMBER_REGEX_PATTERN); + regexString = StringUtils.replace(regexString, TIMESTAMP_PATTERN, NUMBER_REGEX_PATTERN); + regexString = StringUtils.replace(regexString, TOPIC_PATTERN, TOPIC_NAMED_GROUP_REGEX_PATTERN); + regexString = StringUtils.replace(regexString, PARTITION_PATTERN, PARTITION_NAMED_GROUP_REGEX_PATTERN); + try { + return Pattern.compile(regexString); + } catch (IllegalArgumentException iae) { + throw new ConfigException( + String.format("Unable to compile the regex pattern %s to retrieve the partition id.", regexString), + iae); + } + } + + public static Optional<String> getTopic(final Pattern filePattern, final String sourceName) { + return matchPattern(filePattern, sourceName).map(matcher -> matcher.group(PATTERN_TOPIC_KEY)); + } + + public static Optional<Integer> getPartitionId(final Pattern filePattern, final String sourceName) { + return matchPattern(filePattern, sourceName).flatMap(matcher -> { + try { + return Optional.of(Integer.parseInt(matcher.group(PATTERN_PARTITION_KEY))); + } catch (NumberFormatException e) { + return Optional.empty(); + } + }); + } + + private static Optional<Matcher> matchPattern(final Pattern filePattern, final String sourceName) { + if (filePattern == null || sourceName == null) { + throw new IllegalArgumentException("filePattern and sourceName must not be null"); + } + + final Matcher matcher = filePattern.matcher(sourceName); + return matcher.find() ? 
Optional.of(matcher) : Optional.empty(); + } + +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionStrategy.java new file mode 100644 index 000000000..8d370c689 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionStrategy.java @@ -0,0 +1,51 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.task; + +import java.util.regex.Pattern; + +/** + * An {@link DistributionStrategy} provides a mechanism to share the work of processing records from objects (or files) + * into tasks, which are subsequently processed (potentially in parallel) by Kafka Connect workers. + * <p> + * The number of objects in cloud storage can be very high, and they are distributed amongst tasks to minimize the + * overhead of assigning work to Kafka worker threads. All objects assigned to the same task will be processed together + * sequentially by the same worker, which can be useful for maintaining order between objects. There are usually fewer + * workers than tasks, and they will be assigned the remaining tasks as work completes. + */ +public interface DistributionStrategy { + /** + * Check if the object should be processed by the task with the given {@code taskId}. Any single object should be + * assigned deterministically to a single taskId. + * + * @param taskId + * a task ID, usually for the currently running task + * @param valueToBeEvaluated + * The value to be evaluated to determine if it should be processed by the task. + * @return true if the task should process the object, false if it should not. + */ + boolean isPartOfTask(int taskId, String valueToBeEvaluated, Pattern filePattern); + + /** + * When a connector receives a reconfigure event this method should be called to ensure that the distribution + * strategy is updated correctly. 
+ * + * @param maxTasks + * The maximum number of tasks created for the Connector + */ + void configureDistributionStrategy(int maxTasks); +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/HashObjectDistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategy.java similarity index 75% rename from commons/src/main/java/io/aiven/kafka/connect/common/source/task/HashObjectDistributionStrategy.java rename to commons/src/main/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategy.java index c39676ad0..4928f30d9 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/HashObjectDistributionStrategy.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategy.java @@ -16,25 +16,27 @@ package io.aiven.kafka.connect.common.source.task; +import java.util.regex.Pattern; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * {@link HashObjectDistributionStrategy} evenly distributes cloud storage objects between tasks using the hashcode of - * the object's filename, which is uniformly distributed and deterministic across workers. + * {@link HashDistributionStrategy} evenly distributes cloud storage objects between tasks using the hashcode of the + * object's filename, which is uniformly distributed and deterministic across workers. * <p> * This is well-suited to use cases where the order of events between records from objects is not important, especially * when ingesting files into Kafka that were not previously created by a supported cloud storage Sink. */ -public final class HashObjectDistributionStrategy implements ObjectDistributionStrategy { - private final static Logger LOG = LoggerFactory.getLogger(HashObjectDistributionStrategy.class); +public final class HashDistributionStrategy implements DistributionStrategy { + private final static Logger LOG = LoggerFactory.getLogger(HashDistributionStrategy.class); private int maxTasks; - HashObjectDistributionStrategy(final int maxTasks) { - this.maxTasks = maxTasks; + public HashDistributionStrategy(final int maxTasks) { + configureDistributionStrategy(maxTasks); } @Override - public boolean isPartOfTask(final int taskId, final String filenameToBeEvaluated) { + public boolean isPartOfTask(final int taskId, final String filenameToBeEvaluated, final Pattern filePattern) { if (filenameToBeEvaluated == null) { LOG.warn("Ignoring as it is not passing a correct filename to be evaluated."); return false; @@ -46,8 +48,8 @@ public boolean isPartOfTask(final int taskId, final String filenameToBeEvaluated } @Override - public void reconfigureDistributionStrategy(final int maxTasks, final String expectedFormat) { - setMaxTasks(maxTasks); + public void configureDistributionStrategy(final int maxTasks) { + this.maxTasks = maxTasks; } public void setMaxTasks(final int maxTasks) { diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/ObjectDistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/ObjectDistributionStrategy.java deleted file mode 100644 index 5925d880d..000000000 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/ObjectDistributionStrategy.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.common.source.task; - -/** - * An {@link ObjectDistributionStrategy} provides a mechanism to share the work of processing records from objects (or - * files) into tasks, which are subsequently processed (potentially in parallel) by Kafka Connect workers. - * <p> - * The number of objects in cloud storage can be very high, and they are distributed amongst tasks to minimize the - * overhead of assigning work to Kafka worker threads. All objects assigned to the same task will be processed together - * sequentially by the same worker, which can be useful for maintaining order between objects. There are usually fewer - * workers than tasks, and they will be assigned the remaining tasks as work completes. - */ -public interface ObjectDistributionStrategy { - - /** - * Check if the object should be processed by the task with the given {@code taskId}. Any single object should be - * assigned deterministically to a single taskId. - * - * @param taskId - * a task ID, usually for the currently running task - * @param valueToBeEvaluated - * The value to be evaluated to determine if it should be processed by the task. - * @return true if the task should process the object, false if it should not. - */ - boolean isPartOfTask(int taskId, String valueToBeEvaluated); - - /** - * When a connector receives a reconfigure event this method should be called to ensure that the distribution - * strategy is updated correctly. - * - * @param maxTasks - * The maximum number of tasks created for the Connector - * @param expectedFormat - * The expected format, of files, path, table names or other ways to partition the tasks. - */ - void reconfigureDistributionStrategy(int maxTasks, String expectedFormat); - - /** - * Check if the task is responsible for this set of files by checking if the given task matches the partition id. - * - * @param taskId - * the current running task - * @param partitionId - * The partitionId recovered from the file path. - * @return true if this task is responsible for this partition. false if it is not responsible for this task. - */ - default boolean taskMatchesPartition(final int taskId, final int partitionId) { - // The partition id and task id are both expected to start at 0 but if the task id is changed to start at 1 this - // will break. - return taskId == partitionId; - } - - /** - * In the event of more partitions existing then tasks configured, the task will be required to take up additional - * tasks that match. - * - * @param taskId - * the current running task. - * @param maxTasks - * The maximum number of configured tasks allowed to run for this connector. - * @param partitionId - * The partitionId recovered from the file path. 
- * @return true if the task supplied should handle the supplied partition - */ - default boolean taskMatchesModOfPartitionAndMaxTask(final int taskId, final int maxTasks, final int partitionId) { - - return taskMatchesPartition(taskId, partitionId % maxTasks); - } - - default boolean toBeProcessedByThisTask(final int taskId, final int maxTasks, final int partitionId) { - return partitionId < maxTasks - ? taskMatchesPartition(taskId, partitionId) - : taskMatchesModOfPartitionAndMaxTask(taskId, maxTasks, partitionId); - - } -} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategy.java new file mode 100644 index 000000000..25f22dfc0 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategy.java @@ -0,0 +1,84 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.task; + +import java.util.Optional; +import java.util.regex.Pattern; + +import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * The {@link PartitionDistributionStrategy} finds a partition in the object's filename by matching it to an expected + * format, and assigns all partitions to the same task. + * <p> + * This useful when a sink connector has created the object name in a format like + * {@code topicname-{{partition}}-{{start_offset}}}, and we want all objects with the same partition to be processed + * within a single task. + */ +public final class PartitionDistributionStrategy implements DistributionStrategy { + private final static Logger LOG = LoggerFactory.getLogger(PartitionDistributionStrategy.class); + private int maxTasks; + + public PartitionDistributionStrategy(final int maxTasks) { + this.maxTasks = maxTasks; + } + + /** + * + * @param sourceNameToBeEvaluated + * is the filename/table name of the source for the connector. + * @return Predicate to confirm if the given source name matches + */ + @Override + public boolean isPartOfTask(final int taskId, final String sourceNameToBeEvaluated, final Pattern filePattern) { + if (sourceNameToBeEvaluated == null) { + LOG.warn("Ignoring as it is not passing a correct filename to be evaluated."); + return false; + } + final Optional<Integer> optionalPartitionId = FilePatternUtils.getPartitionId(filePattern, + sourceNameToBeEvaluated); + + if (optionalPartitionId.isPresent()) { + return optionalPartitionId.get() < maxTasks + ? 
taskMatchesPartition(taskId, optionalPartitionId.get()) + : taskMatchesPartition(taskId, optionalPartitionId.get() % maxTasks); + } + LOG.warn("Unable to find the partition from this file name {}", sourceNameToBeEvaluated); + return false; + } + + boolean taskMatchesPartition(final int taskId, final int partitionId) { + // The partition id and task id are both expected to start at 0 but if the task id is changed to start at 1 this + // will break. + return taskId == partitionId; + } + + /** + * When a connector reconfiguration event is received this method should be called to ensure the correct strategy is + * being implemented by the connector. + * + * @param maxTasks + * maximum number of configured tasks for this connector + */ + @Override + public void configureDistributionStrategy(final int maxTasks) { + this.maxTasks = maxTasks; + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategy.java deleted file mode 100644 index f74e56826..000000000 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategy.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.common.source.task; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.kafka.common.config.ConfigException; - -import org.codehaus.plexus.util.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * The {@link PartitionInFilenameDistributionStrategy} finds a partition in the object's filename by matching it to an - * expected format, and assigns all partitions to the same task. - * <p> - * This useful when a sink connector has created the object name in a format like - * {@code topicname-{{partition}}-{{start_offset}}}, and we want all objects with the same partition to be processed - * within a single task. - */ -public final class PartitionInFilenameDistributionStrategy implements ObjectDistributionStrategy { - private final static Logger LOG = LoggerFactory.getLogger(PartitionInFilenameDistributionStrategy.class); - private final static String NUMBER_REGEX_PATTERN = "(\\d)+"; - // Use a named group to return the partition in a complex string to always get the correct information for the - // partition number. 
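With the template now passed per call as a compiled Pattern, a minimal usage sketch of the new PartitionDistributionStrategy follows; it assumes DistributionStrategy and FilePatternUtils.configurePattern keep the signatures exercised by the tests later in this patch.

```java
import java.util.regex.Pattern;

import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils;
import io.aiven.kafka.connect.common.source.task.DistributionStrategy;
import io.aiven.kafka.connect.common.source.task.PartitionDistributionStrategy;

public class PartitionDistributionSketch {
    public static void main(String[] args) {
        final int maxTasks = 3;
        final DistributionStrategy strategy = new PartitionDistributionStrategy(maxTasks);
        // Same template placeholders the Aiven S3 sink uses for object names.
        final Pattern filePattern = FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}");

        // Partition 4 wraps around to task 4 % 3 == 1; only that task should report true.
        for (int taskId = 0; taskId < maxTasks; taskId++) {
            System.out.printf("task %d owns logs-4-00112.gz: %b%n", taskId,
                    strategy.isPartOfTask(taskId, "logs-4-00112.gz", filePattern));
        }
    }
}
```

On reconfiguration, the connector only needs to call configureDistributionStrategy(maxTasks) with the new task count; the pattern itself no longer lives inside the strategy.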
- private final static String PARTITION_NAMED_GROUP_REGEX_PATTERN = "(?<partition>\\d)+"; - private final static String PARTITION_PATTERN = "\\{\\{partition}}"; - private final static String START_OFFSET_PATTERN = "\\{\\{start_offset}}"; - private final static String TIMESTAMP_PATTERN = "\\{\\{timestamp}}"; - public static final String PARTITION = "partition"; - private Pattern partitionPattern; - - private int maxTasks; - - PartitionInFilenameDistributionStrategy(final int maxTasks, final String expectedSourceNameFormat) { - configureDistributionStrategy(maxTasks, expectedSourceNameFormat); - } - - /** - * - * @param sourceNameToBeEvaluated - * is the filename/table name of the source for the connector. - * @return Predicate to confirm if the given source name matches - */ - @Override - public boolean isPartOfTask(final int taskId, final String sourceNameToBeEvaluated) { - if (sourceNameToBeEvaluated == null) { - LOG.warn("Ignoring as it is not passing a correct filename to be evaluated."); - return false; - } - final Matcher match = partitionPattern.matcher(sourceNameToBeEvaluated); - if (match.find()) { - return toBeProcessedByThisTask(taskId, maxTasks, Integer.parseInt(match.group(PARTITION))); - } - LOG.warn("Unable to find the partition from this file name {}", sourceNameToBeEvaluated); - return false; - } - - /** - * When a connector reconfiguration event is received this method should be called to ensure the correct strategy is - * being implemented by the connector. - * - * @param maxTasks - * maximum number of configured tasks for this connector - * @param expectedSourceNameFormat - * what the format of the source should appear like so to configure the task distribution. - */ - @Override - public void reconfigureDistributionStrategy(final int maxTasks, final String expectedSourceNameFormat) { - configureDistributionStrategy(maxTasks, expectedSourceNameFormat); - } - - private void configureDistributionStrategy(final int maxTasks, final String expectedSourceNameFormat) { - if (expectedSourceNameFormat == null || !expectedSourceNameFormat.contains(PARTITION_PATTERN)) { - throw new ConfigException(String.format( - "Source name format %s missing partition pattern {{partition}}, please configure the expected source to include the partition pattern.", - expectedSourceNameFormat)); - } - setMaxTasks(maxTasks); - // Build REGEX Matcher - String regexString = StringUtils.replace(expectedSourceNameFormat, START_OFFSET_PATTERN, NUMBER_REGEX_PATTERN); - regexString = StringUtils.replace(regexString, TIMESTAMP_PATTERN, NUMBER_REGEX_PATTERN); - regexString = StringUtils.replace(regexString, PARTITION_PATTERN, PARTITION_NAMED_GROUP_REGEX_PATTERN); - try { - partitionPattern = Pattern.compile(regexString); - } catch (IllegalArgumentException iae) { - throw new ConfigException( - String.format("Unable to compile the regex pattern %s to retrieve the partition id.", regexString), - iae); - } - } - - private void setMaxTasks(final int maxTasks) { - this.maxTasks = maxTasks; - } - -} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategy.java deleted file mode 100644 index 85e1c3e75..000000000 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategy.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - 
* you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.common.source.task; - -import org.apache.kafka.common.config.ConfigException; -import org.apache.kafka.connect.errors.ConnectException; - -import org.apache.commons.lang3.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * The {@link PartitionInPathDistributionStrategy} finds a partition number in the path by matching a - * {@code {{partition}} } marker in the path. - * <p> - * This useful when a sink connector has created the object name in a path like - * {@code /PREFIX/partition={{partition}}/YYYY/MM/DD/mm/}}, and we want all objects with the same partition to be - * processed within a single task. - * <p> - * Partitions are evenly distributed between tasks. For example, in Connect with 10 Partitions and 3 tasks: - * - * <pre> - * | Task | Partitions | - * |------|------------| - * | 0 | 0, 3, 6, 9 | - * | 1 | 1, 4, 7 | - * | 2 | 2, 5, 8 | - * </pre> - */ -public final class PartitionInPathDistributionStrategy implements ObjectDistributionStrategy { - public static final String PARTITION_ID_PATTERN = "\\{\\{partition}}"; - private final static Logger LOG = LoggerFactory.getLogger(PartitionInPathDistributionStrategy.class); - - private String prefix; - private int maxTasks; - - PartitionInPathDistributionStrategy(final int maxTasks, final String expectedPathFormat) { - configureDistributionStrategy(maxTasks, expectedPathFormat); - } - - @Override - public boolean isPartOfTask(final int taskId, final String pathToBeEvaluated) { - if (pathToBeEvaluated == null || !pathToBeEvaluated.startsWith(prefix)) { - LOG.warn("Ignoring path {}, does not contain the preconfigured prefix {} set up at startup", - pathToBeEvaluated, prefix); - return false; - } - final String modifiedPath = StringUtils.substringAfter(pathToBeEvaluated, prefix); - if (!modifiedPath.contains("/")) { - LOG.warn("Ignoring path {}, does not contain any sub folders after partitionId prefix {}", - pathToBeEvaluated, prefix); - return false; - } - final String partitionId = StringUtils.substringBefore(modifiedPath, "/"); - - try { - return toBeProcessedByThisTask(taskId, maxTasks, Integer.parseInt(partitionId)); - } catch (NumberFormatException ex) { - throw new ConnectException(String - .format("Unexpected non integer value found parsing path for partitionId: %s", pathToBeEvaluated)); - } - } - - /** - * - * @param maxTasks - * The maximum number of configured tasks for this - * @param expectedPathFormat - * The format of the path and where to identify - */ - @Override - public void reconfigureDistributionStrategy(final int maxTasks, final String expectedPathFormat) { - configureDistributionStrategy(maxTasks, expectedPathFormat); - } - - private void configureDistributionStrategy(final int maxTasks, final String expectedPathFormat) { - setMaxTasks(maxTasks); - - if (StringUtils.isEmpty(expectedPathFormat) || !expectedPathFormat.contains(PARTITION_ID_PATTERN)) { - throw new ConfigException(String.format( - "Expected path format %s is missing the identifier '%s' to correctly 
select the partition", - expectedPathFormat, PARTITION_ID_PATTERN)); - } - prefix = StringUtils.substringBefore(expectedPathFormat, PARTITION_ID_PATTERN); - } - - private void setMaxTasks(final int maxTasks) { - this.maxTasks = maxTasks; - } - -} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/enums/ObjectDistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/enums/ObjectDistributionStrategy.java new file mode 100644 index 000000000..26c1efa94 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/enums/ObjectDistributionStrategy.java @@ -0,0 +1,48 @@ +/* + * Copyright 2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.task.enums; + +import java.util.Arrays; +import java.util.Objects; + +import org.apache.kafka.common.config.ConfigException; + +public enum ObjectDistributionStrategy { + + OBJECT_HASH("object_hash"), PARTITION_IN_FILENAME("partition_in_filename"); + + private final String name; + + public String value() { + return name; + } + + ObjectDistributionStrategy(final String name) { + this.name = name; + } + + public static ObjectDistributionStrategy forName(final String name) { + Objects.requireNonNull(name, "name cannot be null"); + for (final ObjectDistributionStrategy objectDistributionStrategy : ObjectDistributionStrategy.values()) { + if (objectDistributionStrategy.name.equalsIgnoreCase(name)) { + return objectDistributionStrategy; + } + } + throw new ConfigException(String.format("Unknown object.distribution.strategy type: %s, allowed values %s ", + name, Arrays.toString(ObjectDistributionStrategy.values()))); + } +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashObjectDistributionStrategyTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategyTest.java similarity index 82% rename from commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashObjectDistributionStrategyTest.java rename to commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategyTest.java index 63a6a76f5..50ef73964 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashObjectDistributionStrategyTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategyTest.java @@ -21,10 +21,12 @@ import java.util.ArrayList; import java.util.List; +import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; + import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.CsvSource; -final class HashObjectDistributionStrategyTest { +final class HashDistributionStrategyTest { @ParameterizedTest @CsvSource({ "logs-0-0002.txt", "logs-1-0002.txt", "logs-2-0002.txt", "logs-3-0002.txt", "logs-4-0002.txt", @@ -34,10 +36,11 @@ final class HashObjectDistributionStrategyTest { "reallylongfilenamecreatedonS3tohisdesomedata and alsohassome 
spaces.txt" }) void hashDistributionExactlyOnce(final String path) { final int maxTaskId = 10; - final ObjectDistributionStrategy taskDistribution = new HashObjectDistributionStrategy(maxTaskId); + final DistributionStrategy taskDistribution = new HashDistributionStrategy(maxTaskId); final List<Boolean> results = new ArrayList<>(); for (int taskId = 0; taskId < maxTaskId; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); + results.add(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"))); } assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); @@ -51,18 +54,20 @@ void hashDistributionExactlyOnce(final String path) { "reallylongfilenamecreatedonS3tohisdesomedata and alsohassome spaces.txt" }) void hashDistributionExactlyOnceWithReconfigureEvent(final String path) { int maxTasks = 10; - final ObjectDistributionStrategy taskDistribution = new HashObjectDistributionStrategy(maxTasks); + final DistributionStrategy taskDistribution = new HashDistributionStrategy(maxTasks); final List<Boolean> results = new ArrayList<>(); for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); + results.add(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"))); } assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); results.clear(); maxTasks = 5; - taskDistribution.reconfigureDistributionStrategy(maxTasks, null); + taskDistribution.configureDistributionStrategy(maxTasks); for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); + results.add(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"))); } assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategyTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategyTest.java new file mode 100644 index 000000000..c62fbb9bc --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategyTest.java @@ -0,0 +1,299 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.source.task; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.kafka.common.config.ConfigException; + +import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; + +final class PartitionDistributionStrategyTest { + + @Test + void partitionInFileNameDefaultAivenS3Sink() { + final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(2); + assertThat(taskDistribution.isPartOfTask(1, "logs-1-00112.gz", + FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"))).isTrue(); + } + + @Test + void partitionLocationNotSetExpectException() { + assertThatThrownBy(() -> new PartitionDistributionStrategy(2).isPartOfTask(1, "", + FilePatternUtils.configurePattern("logs-23-<partition>-<start_offset>"))) + .isInstanceOf(ConfigException.class) + .hasMessage( + "Source name format logs-23-<partition>-<start_offset> missing partition pattern {{partition}} please configure the expected source to include the partition pattern."); + + } + + @ParameterizedTest(name = "[{index}] Pattern: {0}, Filename: {1}") + @CsvSource({ "{{topic}}-{{partition}}-{{start_offset}},logs-0-00112.gz", + "{{topic}}-2024-{{timestamp}}-{{partition}}-{{start_offset}},logs-2024-20220201-0-00112.gz", + "{{topic}}-2023-{{partition}}-{{start_offset}},logs-2023-0-00112.gz", + "logs-2023-{{partition}}-{{start_offset}},logs-2023-0-00112.gz", + "{{topic}}-{{timestamp}}-{{timestamp}}-{{timestamp}}-{{partition}}-{{start_offset}},logs1-2022-10-02-10-00112.gz", + "{{topic}}{{partition}}-{{start_offset}},89521-00112.gz", + "{{topic}}-{{partition}},Emergency-TEST1-00112.gz", + "Emergency-TEST1-{{partition}},Emergency-TEST1-00112.gz", + "{{topic}}-{{partition}}-{{start_offset}},PROD-logs-1-00112.gz", + "{{topic}}-{{partition}},DEV_team_1-00112.gz", + "{{topic}}-{{partition}}-{{start_offset}},timeseries-1-00112.gz" }) + void testPartitionFileNamesAndExpectedOutcomes(final String configuredFilenamePattern, final String filename) { + final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(1); + // This test is testing the filename matching not the task allocation. + assertThat(taskDistribution.isPartOfTask(0, filename, + FilePatternUtils.configurePattern(configuredFilenamePattern))).isTrue(); + } + + @ParameterizedTest(name = "[{index}] Pattern: {0}, Filename: {1}") + @CsvSource({ "different-topic-{{partition}}-{{start_offset}},logs-1-00112.gz", + "no-seperator-in-date-partition-offset-{{timestamp}}-{{partition}}-{{start_offset}},no-seperator-in-date-partition-offset-202420220201100112.gz", + "logs-2024-{{timestamp}}-{{partition}}-{{start_offset}},logs-20201-1-00112.gz", + "logs-2024-{{timestamp}}{{partition}}-{{start_offset}},logs-202011-00112.gz", + "logs-2024-{{timestamp}}{{partition}}-{{start_offset}}, ", + "logs-2023-{{partition}}-{{start_offset}},logs-2023-one-00112.gz" }) + void expectFalseOnMalformedFilenames(final String configuredFilenamePattern, final String filename) { + final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(1); + // This test is testing the filename matching not the task allocation. 
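The parameterized cases above exercise filename-style templates; because the template is now a compiled Pattern supplied per call, the same strategy also covers the path-style layouts previously handled by PartitionInPathDistributionStrategy. A small sketch, assuming the same FilePatternUtils helper and the templates used in the tests below:

```java
import java.util.regex.Pattern;

import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils;
import io.aiven.kafka.connect.common.source.task.PartitionDistributionStrategy;

public class FilenameVsPathTemplateSketch {
    public static void main(String[] args) {
        final PartitionDistributionStrategy strategy = new PartitionDistributionStrategy(4);

        // Filename-style template, as produced by the S3 sink connector.
        final Pattern byFilename = FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}");
        // Path-style template, as produced by sinks that write partition folders.
        final Pattern byPath = FilePatternUtils.configurePattern("topics/{{topic}}/partition={{partition}}/.*$");

        // Partition 5 with 4 tasks wraps to task 5 % 4 == 1 in both layouts.
        System.out.println(strategy.isPartOfTask(1, "logs-5-00112.gz", byFilename));                     // true
        System.out.println(strategy.isPartOfTask(1, "topics/logs/partition=5/logs+5+0002.txt", byPath)); // true
    }
}
```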
+ assertThat(taskDistribution.isPartOfTask(0, filename, + FilePatternUtils.configurePattern(configuredFilenamePattern))).isFalse(); + } + + @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1}, Filename: {1}") + @CsvSource({ "0,10,topics/logs/0/logs-0-0002.txt", "1,10,topics/logs/1/logs-1-0002.txt", + "2,10,topics/logs/2/logs-2-0002.txt", "3,10,topics/logs/3/logs-3-0002.txt", + "4,10,topics/logs/4/logs-4-0002.txt", "5,10,topics/logs/5/logs-5-0002.txt", + "6,10,topics/logs/6/logs-6-0002.txt", "7,10,topics/logs/7/logs-7-0002.txt", + "8,10,topics/logs/8/logs-8-0002.txt", "9,10,topics/logs/9/logs-9-0002.txt" }) + void checkCorrectDistributionAcrossTasksOnFileName(final int taskId, final int maxTasks, final String path) { + + final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTasks); + + assertThat(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern("logs-{{partition}}-{{start_offset}}"))).isTrue(); + } + + @ParameterizedTest(name = "[{index}] MaxTasks: {0}, Filename: {1}") + @CsvSource({ "10,topics/logs/0/logs-0002.txt", "10,topics/logs/1/logs-001.txt", "10,topics/logs/2/logs-0002.txt", + "10,topics/logs/3/logs-0002.txt", "10,topics/logs/4/logs-0002.txt", "10,topics/logs/5/logs-0002.txt", + "10,topics/logs/6/logs-0002.txt", "10,topics/logs/7/logs-0002.txt", "10,topics/logs/8/logs-0002.txt", + "10,topics/logs/9/logs-0002.txt" }) + void filenameDistributionExactlyOnceDistribution(final int maxTasks, final String path) { + + final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTasks); + final List<Boolean> results = new ArrayList<>(); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern("logs-{{partition}}.txt"))); + } + assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, + Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); + } + + @ParameterizedTest(name = "[{index}] MaxTasks: {0}, TaskId: {1}, Filename: {2}") + @CsvSource({ "10,5,topics/logs/0/logs-0002.txt", "10,5,topics/logs/1/logs-001.txt", + "10,5,topics/logs/2/logs-0002.txt", "10,5,topics/logs/3/logs-0002.txt", "10,5,topics/logs/4/logs-0002.txt", + "10,5,topics/logs/5/logs-0002.txt", "10,5,topics/logs/6/logs-0002.txt", "10,5,topics/logs/7/logs-0002.txt", + "10,5,topics/logs/8/logs-0002.txt", "10,5,topics/logs/9/logs-0002.txt" }) + void filenameDistributionExactlyOnceDistributionWithTaskReconfiguration(final int maxTasks, + final int maxTaskAfterReConfig, final String path) { + + final String expectedSourceNameFormat = "logs-{{partition}}.txt"; + final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTasks); + final List<Boolean> results = new ArrayList<>(); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern(expectedSourceNameFormat))); + } + assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, + Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); + taskDistribution.configureDistributionStrategy(maxTaskAfterReConfig); + + results.clear(); + for (int taskId = 0; taskId < maxTaskAfterReConfig; taskId++) { + results.add(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern(expectedSourceNameFormat))); + } + 
assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, + Boolean.FALSE); + } + + @ParameterizedTest + @CsvSource({ + "{topic}}-1.txt,'Source name format {topic}}-1.txt missing partition pattern {{partition}} please configure the expected source to include the partition pattern.'", + " ,'Source name format null missing partition pattern {{partition}} please configure the expected source to include the partition pattern.'", + "empty-pattern,'Source name format empty-pattern missing partition pattern {{partition}} please configure the expected source to include the partition pattern.'" }) + void malformedFilenameSetup(final String expectedSourceFormat, final String expectedErrorMessage) { + final int maxTaskId = 1; + assertThatThrownBy(() -> new PartitionDistributionStrategy(maxTaskId).isPartOfTask(1, "", + FilePatternUtils.configurePattern(expectedSourceFormat))).isInstanceOf(ConfigException.class) + .hasMessage(expectedErrorMessage); + } + + @Test + void errorExpectedNullGivenForSourceNameFormat() { + final int maxTaskId = 1; + assertThatThrownBy(() -> new PartitionDistributionStrategy(maxTaskId).isPartOfTask(1, "", + FilePatternUtils.configurePattern(null))).isInstanceOf(ConfigException.class) + .hasMessage("Source name format null missing partition pattern {{partition}} please configure" + + " the expected source to include the partition pattern."); + } + + @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") + @CsvSource({ "0,1,topics/logs/partition=5/logs+5+0002.txt,true", + "0,4,topics/logs/partition=5/logs+5+0002.txt,false", "1,4,topics/logs/partition=5/logs+5+0002.txt,true", + "0,3,topics/logs/partition=5/logs+5+0002.txt,false", "0,5,topics/logs/partition=5/logs+5+0002.txt,true", + "2,3,topics/logs/partition=5/logs+5+0002.txt,true" }) + void withLeadingStringPartitionNamingConvention(final int taskId, final int maxTasks, final String path, + final boolean expectedResult) { + + final PartitionDistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTasks); + + assertThat(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern("topics/{{topic}}/partition={{partition}}/.*$"))) + .isEqualTo(expectedResult); + } + + @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") + @CsvSource({ "0,1,bucket/topics/topic-1/5/logs+5+0002.txt,true", + "0,4,bucket/topics/topic-1/5/logs+5+0002.txt,false", "1,4,bucket/topics/topic-1/5/logs+5+0002.txt,true", + "0,3,bucket/topics/topic-1/5/logs+5+0002.txt,false", "0,5,bucket/topics/topic-1/5/logs+5+0002.txt,true", + "2,3,bucket/topics/topic-1/5/logs+5+0002.txt,true" }) + void partitionInPathConvention(final int taskId, final int maxTaskId, final String path, + final boolean expectedResult) { + + final PartitionDistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTaskId); + + assertThat(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern("bucket/topics/{{topic}}/{{partition}}/.*$"))) + .isEqualTo(expectedResult); + } + + @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") + @CsvSource({ "0,10,topics/logs/0/logs-0002.txt", "1,10,topics/logs/1/logs-0002.txt", + "2,10,topics/logs/2/logs-0002.txt", "3,10,topics/logs/3/logs-0002.txt", "4,10,topics/logs/4/logs-0002.txt", + "5,10,topics/logs/5/logs-0002.txt", "6,10,topics/logs/6/logs-0002.txt", "7,10,topics/logs/7/logs-0002.txt", + "8,10,topics/logs/8/logs-0002.txt", 
"9,10,topics/logs/9/logs-0002.txt" }) + void checkCorrectDistributionAcrossTasks(final int taskId, final int maxTaskId, final String path) { + + final PartitionDistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTaskId); + + assertThat(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern("topics/{{topic}}/{{partition}}/.*$"))).isTrue(); + } + + @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") + @CsvSource({ "1,10,topcs/logs/0/logs-0002.txt", "2,10,topics/logs/1", "3,10,S3/logs/2/logs-0002.txt", + "4,10,topics/log/3/logs-0002.txt", "5,10,prod/logs/4/logs-0002.txt", "6,10,misspelt/logs/5/logs-0002.txt", + "7,10,test/logs/6/logs-0002.txt", "8,10,random/logs/7/logs-0002.txt", "9,10,DEV/logs/8/logs-0002.txt", + "10,10,poll/logs/9/logs-0002.txt" }) + void expectNoMatchOnUnconfiguredPaths(final int taskId, final int maxTaskId, final String path) { + + final PartitionDistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTaskId); + + assertThat(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern("topics/{{topic}}/{{partition}}/.*$"))).isFalse(); + } + + @Test + void expectExceptionOnNonIntPartitionSupplied() { + final int taskId = 1; + final int maxTaskId = 1; + final String path = "topics/logs/one/test-001.txt"; + + final PartitionDistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTaskId); + assertThat(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern("topics/{{topic}}/{{partition}}/.*$"))).isFalse(); + } + + @Test + void malformedRegexSetup() { + final int maxTaskId = 1; + + assertThatThrownBy(() -> new PartitionDistributionStrategy(maxTaskId).isPartOfTask(1, "", + FilePatternUtils.configurePattern("topics/{{topic}}/"))).isInstanceOf(ConfigException.class) + .hasMessage( + "Source name format topics/{{topic}}/ missing partition pattern {{partition}} please configure the expected source to include the partition pattern."); + } + + @ParameterizedTest + @CsvSource({ + ",Source name format null missing partition pattern {{partition}} please configure the expected source to include the partition pattern.", + "@adsfs,Source name format @adsfs missing partition pattern {{partition}} please configure the expected source to include the partition pattern.", + "empty-path,Source name format empty-path missing partition pattern {{partition}} please configure the expected source to include the partition pattern." 
}) + void malformedPathSetup(final String expectedPathFormat, final String expectedErrorMessage) { + final int maxTaskId = 1; + + assertThatThrownBy(() -> new PartitionDistributionStrategy(maxTaskId).isPartOfTask(1, expectedPathFormat, + FilePatternUtils.configurePattern(expectedPathFormat))).isInstanceOf(ConfigException.class) + .hasMessage(expectedErrorMessage); + } + + @ParameterizedTest + @CsvSource({ "10,topics/logs/0/logs-0002.txt", "10,topics/logs/1/logs-001.log", "10,topics/logs/2/logs-0002.txt", + "10,topics/logs/3/logs-0002.txt", "10,topics/logs/4/logs-0002.txt", "10,topics/logs/5/logs-0002.txt", + "10,topics/logs/6/logs-0002.txt", "10,topics/logs/7/logs-0002.txt", "10,topics/logs/8/logs-0002.txt", + "10,topics/logs/9/logs-0002.txt" }) + void partitionPathDistributionExactlyOnceDistribution(final int maxTasks, final String path) { + final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTasks); + final List<Boolean> results = new ArrayList<>(); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern("topics/{{topic}}/{{partition}}/.*$"))); + } + assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, + Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); + } + + @ParameterizedTest + @CsvSource({ "10,5,topics/logs/0/logs-0002.txt", "10,5,topics/logs/1/logs-001.txt", + "10,5,topics/logs/2/logs-0002.txt", "10,5,topics/logs/3/logs-0002.txt", "10,5,topics/logs/4/logs-0002.txt", + "10,5,topics/logs/5/logs-0002.txt", "10,5,topics/logs/6/logs-0002.txt", "10,5,topics/logs/7/logs-0002.txt", + "10,5,topics/logs/8/logs-0002.txt", "10,5,topics/logs/9/logs-0002.txt" }) + void partitionPathDistributionExactlyOnceDistributionWithTaskReconfiguration(final int maxTasks, + final int maxTaskAfterReConfig, final String path) { + + final String expectedSourceNameFormat = "topics/{{topic}}/{{partition}}/.*$"; + final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTasks); + final List<Boolean> results = new ArrayList<>(); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern(expectedSourceNameFormat))); + } + assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, + Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); + taskDistribution.configureDistributionStrategy(maxTaskAfterReConfig); + + results.clear(); + for (int taskId = 0; taskId < maxTaskAfterReConfig; taskId++) { + results.add(taskDistribution.isPartOfTask(taskId, path, + FilePatternUtils.configurePattern(expectedSourceNameFormat))); + } + assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, + Boolean.FALSE); + } + +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategyTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategyTest.java deleted file mode 100644 index f1993ecba..000000000 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategyTest.java +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this 
file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.common.source.task; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.kafka.common.config.ConfigException; - -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.CsvSource; - -final class PartitionInFilenameDistributionStrategyTest { - - @Test - void partitionInFileNameDefaultAivenS3Sink() { - final ObjectDistributionStrategy taskDistribution = new PartitionInFilenameDistributionStrategy(2, - "logs-\\{\\{partition}}-\\{\\{start_offset}}"); - assertThat(taskDistribution.isPartOfTask(1, "logs-1-00112.gz")).isTrue(); - } - - @Test - void partitionLocationNotSetExpectException() { - assertThatThrownBy(() -> new PartitionInFilenameDistributionStrategy(2, "logs-23-<partition>-<start_offset>")) - .isInstanceOf(ConfigException.class) - .hasMessage( - "Source name format logs-23-<partition>-<start_offset> missing partition pattern {{partition}}, please configure the expected source to include the partition pattern."); - - } - - @ParameterizedTest(name = "[{index}] Pattern: {0}, Filename: {1}") - @CsvSource({ "logs-\\{\\{partition}}-\\{\\{start_offset}},logs-0-00112.gz", - "logs-2024-\\{\\{timestamp}}-\\{\\{partition}}-\\{\\{start_offset}},logs-2024-20220201-0-00112.gz", - "logs-2023-\\{\\{partition}}-\\{\\{start_offset}},logs-2023-0-00112.gz", - "logs1-\\{\\{timestamp}}-\\{\\{timestamp}}-\\{\\{timestamp}}-\\{\\{partition}}-\\{\\{start_offset}},logs1-2022-10-02-10-00112.gz", - "8952\\{\\{partition}}-\\{\\{start_offset}},89521-00112.gz", - "Emergency-TEST\\{\\{partition}}-\\{\\{start_offset}},Emergency-TEST1-00112.gz", - "PROD-logs-\\{\\{partition}}-\\{\\{start_offset}},PROD-logs-1-00112.gz", - "DEV_team_\\{\\{partition}}-\\{\\{start_offset}},DEV_team_1-00112.gz", - "timeseries-\\{\\{partition}}-\\{\\{start_offset}},timeseries-1-00112.gz" }) - void testPartitionFileNamesAndExpectedOutcomes(final String configuredFilenamePattern, final String filename) { - final ObjectDistributionStrategy taskDistribution = new PartitionInFilenameDistributionStrategy(1, - configuredFilenamePattern); - // This test is testing the filename matching not the task allocation. 
- assertThat(taskDistribution.isPartOfTask(0, filename)).isTrue(); - } - - @ParameterizedTest(name = "[{index}] Pattern: {0}, Filename: {1}") - @CsvSource({ "different-topic-\\{\\{partition}}-\\{\\{start_offset}},logs-1-00112.gz", - "no-seperator-in-date-partition-offset-\\{\\{timestamp}}-\\{\\{partition}}-\\{\\{start_offset}},no-seperator-in-date-partition-offset-202420220201100112.gz", - "logs-2024-\\{\\{timestamp}}-\\{\\{partition}}-\\{\\{start_offset}},logs-20201-1-00112.gz", - "logs-2024-\\{\\{timestamp}}\\{\\{partition}}-\\{\\{start_offset}},logs-202011-00112.gz", - "logs-2024-\\{\\{timestamp}}\\{\\{partition}}-\\{\\{start_offset}}, ", - "logs-2023-\\{\\{partition}}-\\{\\{start_offset}},logs-2023-one-00112.gz" }) - void expectFalseOnMalformedFilenames(final String configuredFilenamePattern, final String filename) { - final ObjectDistributionStrategy taskDistribution = new PartitionInFilenameDistributionStrategy(1, - configuredFilenamePattern); - // This test is testing the filename matching not the task allocation. - assertThat(taskDistribution.isPartOfTask(0, filename)).isFalse(); - } - - @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1}, Filename: {1}") - @CsvSource({ "0,10,topics/logs/0/logs-0-0002.txt", "1,10,topics/logs/1/logs-1-0002.txt", - "2,10,topics/logs/2/logs-2-0002.txt", "3,10,topics/logs/3/logs-3-0002.txt", - "4,10,topics/logs/4/logs-4-0002.txt", "5,10,topics/logs/5/logs-5-0002.txt", - "6,10,topics/logs/6/logs-6-0002.txt", "7,10,topics/logs/7/logs-7-0002.txt", - "8,10,topics/logs/8/logs-8-0002.txt", "9,10,topics/logs/9/logs-9-0002.txt" }) - void checkCorrectDistributionAcrossTasks(final int taskId, final int maxTasks, final String path) { - - final ObjectDistributionStrategy taskDistribution = new PartitionInFilenameDistributionStrategy(maxTasks, - "logs-\\{\\{partition}}-\\{\\{start_offset}}"); - - assertThat(taskDistribution.isPartOfTask(taskId, path)).isTrue(); - } - - @ParameterizedTest(name = "[{index}] MaxTasks: {0}, Filename: {1}") - @CsvSource({ "10,topics/logs/0/logs-0002.txt", "10,topics/logs/1/logs-001.txt", "10,topics/logs/2/logs-0002.txt", - "10,topics/logs/3/logs-0002.txt", "10,topics/logs/4/logs-0002.txt", "10,topics/logs/5/logs-0002.txt", - "10,topics/logs/6/logs-0002.txt", "10,topics/logs/7/logs-0002.txt", "10,topics/logs/8/logs-0002.txt", - "10,topics/logs/9/logs-0002.txt" }) - void filenameDistributionExactlyOnceDistribution(final int maxTasks, final String path) { - - final ObjectDistributionStrategy taskDistribution = new PartitionInFilenameDistributionStrategy(maxTasks, - "logs-\\{\\{partition}}.txt"); - final List<Boolean> results = new ArrayList<>(); - for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); - } - - @ParameterizedTest(name = "[{index}] MaxTasks: {0}, TaskId: {1}, Filename: {2}") - @CsvSource({ "10,5,topics/logs/0/logs-0002.txt", "10,5,topics/logs/1/logs-001.txt", - "10,5,topics/logs/2/logs-0002.txt", "10,5,topics/logs/3/logs-0002.txt", "10,5,topics/logs/4/logs-0002.txt", - "10,5,topics/logs/5/logs-0002.txt", "10,5,topics/logs/6/logs-0002.txt", "10,5,topics/logs/7/logs-0002.txt", - "10,5,topics/logs/8/logs-0002.txt", "10,5,topics/logs/9/logs-0002.txt" }) - void filenameDistributionExactlyOnceDistributionWithTaskReconfiguration(final int maxTasks, - final int 
maxTaskAfterReConfig, final String path) { - - final String expectedSourceNameFormat = "logs-\\{\\{partition}}.txt"; - final ObjectDistributionStrategy taskDistribution = new PartitionInFilenameDistributionStrategy(maxTasks, - expectedSourceNameFormat); - final List<Boolean> results = new ArrayList<>(); - for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); - taskDistribution.reconfigureDistributionStrategy(maxTaskAfterReConfig, expectedSourceNameFormat); - - results.clear(); - for (int taskId = 0; taskId < maxTaskAfterReConfig; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE); - } - - @ParameterizedTest - @CsvSource({ - "logs-{{partition}}.txt,'Source name format logs-{{partition}}.txt missing partition pattern {{partition}}, please configure the expected source to include the partition pattern.'", - " ,'Source name format null missing partition pattern {{partition}}, please configure the expected source to include the partition pattern.'", - "empty-pattern,'Source name format empty-pattern missing partition pattern {{partition}}, please configure the expected source to include the partition pattern.'" }) - void malformedFilenameSetup(final String expectedSourceFormat, final String expectedErrorMessage) { - final int maxTaskId = 1; - - assertThatThrownBy(() -> new PartitionInFilenameDistributionStrategy(maxTaskId, expectedSourceFormat)) - .isInstanceOf(ConfigException.class) - .hasMessage(expectedErrorMessage); - } - - @Test - void errorExpectedNullGivenForSourceNameFormat() { - final int maxTaskId = 1; - - assertThatThrownBy(() -> new PartitionInFilenameDistributionStrategy(maxTaskId, null)) - .isInstanceOf(ConfigException.class) - .hasMessage( - "Source name format null missing partition pattern {{partition}}, please configure the expected source to include the partition pattern."); - } - -} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategyTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategyTest.java deleted file mode 100644 index 4c2a6fede..000000000 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategyTest.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.aiven.kafka.connect.common.source.task; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.kafka.common.config.ConfigException; -import org.apache.kafka.connect.errors.ConnectException; - -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.CsvSource; - -final class PartitionInPathDistributionStrategyTest { - - @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") - @CsvSource({ "0,1,topics/logs/partition=5/logs+5+0002.txt,true", - "0,4,topics/logs/partition=5/logs+5+0002.txt,false", "1,4,topics/logs/partition=5/logs+5+0002.txt,true", - "0,3,topics/logs/partition=5/logs+5+0002.txt,false", "0,5,topics/logs/partition=5/logs+5+0002.txt,true", - "2,3,topics/logs/partition=5/logs+5+0002.txt,true" }) - void withLeadingStringPartitionNamingConvention(final int taskId, final int maxTasks, final String path, - final boolean expectedResult) { - - final PartitionInPathDistributionStrategy taskDistribution = new PartitionInPathDistributionStrategy(maxTasks, - "topics/logs/partition=\\{\\{partition}}/"); - - assertThat(taskDistribution.isPartOfTask(taskId, path)).isEqualTo(expectedResult); - } - - @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") - @CsvSource({ "0,1,bucket/topics/topic-1/5/logs+5+0002.txt,true", - "0,4,bucket/topics/topic-1/5/logs+5+0002.txt,false", "1,4,bucket/topics/topic-1/5/logs+5+0002.txt,true", - "0,3,bucket/topics/topic-1/5/logs+5+0002.txt,false", "0,5,bucket/topics/topic-1/5/logs+5+0002.txt,true", - "2,3,bucket/topics/topic-1/5/logs+5+0002.txt,true" }) - void partitionInPathConvention(final int taskId, final int maxTaskId, final String path, - final boolean expectedResult) { - - final PartitionInPathDistributionStrategy taskDistribution = new PartitionInPathDistributionStrategy(maxTaskId, - "bucket/topics/topic-1/\\{\\{partition}}/"); - - assertThat(taskDistribution.isPartOfTask(taskId, path)).isEqualTo(expectedResult); - } - - @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") - @CsvSource({ "0,10,topics/logs/0/logs-0002.txt", "1,10,topics/logs/1/logs-0002.txt", - "2,10,topics/logs/2/logs-0002.txt", "3,10,topics/logs/3/logs-0002.txt", "4,10,topics/logs/4/logs-0002.txt", - "5,10,topics/logs/5/logs-0002.txt", "6,10,topics/logs/6/logs-0002.txt", "7,10,topics/logs/7/logs-0002.txt", - "8,10,topics/logs/8/logs-0002.txt", "9,10,topics/logs/9/logs-0002.txt" }) - void checkCorrectDistributionAcrossTasks(final int taskId, final int maxTaskId, final String path) { - - final PartitionInPathDistributionStrategy taskDistribution = new PartitionInPathDistributionStrategy(maxTaskId, - "topics/logs/\\{\\{partition}}/"); - - assertThat(taskDistribution.isPartOfTask(taskId, path)).isTrue(); - } - - @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") - @CsvSource({ "1,10,topcs/logs/0/logs-0002.txt", "2,10,topics/logs/1", "3,10,S3/logs/2/logs-0002.txt", - "4,10,topics/log/3/logs-0002.txt", "5,10,prod/logs/4/logs-0002.txt", "6,10,misspelt/logs/5/logs-0002.txt", - "7,10,test/logs/6/logs-0002.txt", "8,10,random/logs/7/logs-0002.txt", "9,10,DEV/logs/8/logs-0002.txt", - "10,10,poll/logs/9/logs-0002.txt" }) - void expectNoMatchOnUnconfiguredPaths(final int taskId, final int maxTaskId, final String path) { - - final 
PartitionInPathDistributionStrategy taskDistribution = new PartitionInPathDistributionStrategy(maxTaskId, - "topics/logs/\\{\\{partition}}/"); - - assertThat(taskDistribution.isPartOfTask(taskId, path)).isFalse(); - } - - @Test - void expectExceptionOnNonIntPartitionSupplied() { - final int taskId = 1; - final int maxTaskId = 1; - final String path = "topics/logs/one/test-001.txt"; - - final PartitionInPathDistributionStrategy taskDistribution = new PartitionInPathDistributionStrategy(maxTaskId, - "topics/logs/\\{\\{partition}}/"); - assertThatThrownBy(() -> taskDistribution.isPartOfTask(taskId, path)).isInstanceOf(ConnectException.class) - .hasMessage( - "Unexpected non integer value found parsing path for partitionId: topics/logs/one/test-001.txt"); - } - - @Test - void malformedRegexSetup() { - final int maxTaskId = 1; - - assertThatThrownBy(() -> new PartitionInPathDistributionStrategy(maxTaskId, "topics/logs/{{partition}}/")) - .isInstanceOf(ConfigException.class) - .hasMessage( - "Expected path format topics/logs/{{partition}}/ is missing the identifier '\\{\\{partition}}' to correctly select the partition"); - } - - @ParameterizedTest - @CsvSource({ - ",Expected path format null is missing the identifier '\\{\\{partition}}' to correctly select the partition", - "@adsfs,Expected path format @adsfs is missing the identifier '\\{\\{partition}}' to correctly select the partition", - "empty-path,Expected path format empty-path is missing the identifier '\\{\\{partition}}' to correctly select the partition" }) - void malformedPathSetup(final String expectedPathFormat, final String expectedErrorMessage) { - final int maxTaskId = 1; - - assertThatThrownBy(() -> new PartitionInPathDistributionStrategy(maxTaskId, expectedPathFormat)) - .isInstanceOf(ConfigException.class) - .hasMessage(expectedErrorMessage); - } - - @ParameterizedTest - @CsvSource({ "10,topics/logs/0/logs-0002.txt", "10,topics/logs/1/logs-001.log", "10,topics/logs/2/logs-0002.txt", - "10,topics/logs/3/logs-0002.txt", "10,topics/logs/4/logs-0002.txt", "10,topics/logs/5/logs-0002.txt", - "10,topics/logs/6/logs-0002.txt", "10,topics/logs/7/logs-0002.txt", "10,topics/logs/8/logs-0002.txt", - "10,topics/logs/9/logs-0002.txt" }) - void partitionPathDistributionExactlyOnceDistribution(final int maxTasks, final String path) { - - final ObjectDistributionStrategy taskDistribution = new PartitionInPathDistributionStrategy(maxTasks, - "topics/logs/\\{\\{partition}}"); - final List<Boolean> results = new ArrayList<>(); - for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); - } - - @ParameterizedTest - @CsvSource({ "10,5,topics/logs/0/logs-0002.txt", "10,5,topics/logs/1/logs-001.txt", - "10,5,topics/logs/2/logs-0002.txt", "10,5,topics/logs/3/logs-0002.txt", "10,5,topics/logs/4/logs-0002.txt", - "10,5,topics/logs/5/logs-0002.txt", "10,5,topics/logs/6/logs-0002.txt", "10,5,topics/logs/7/logs-0002.txt", - "10,5,topics/logs/8/logs-0002.txt", "10,5,topics/logs/9/logs-0002.txt" }) - void partitionPathDistributionExactlyOnceDistributionWithTaskReconfiguration(final int maxTasks, - final int maxTaskAfterReConfig, final String path) { - - final String expectedSourceNameFormat = "topics/logs/\\{\\{partition}}"; - final ObjectDistributionStrategy taskDistribution = new 
PartitionInPathDistributionStrategy(maxTasks, - expectedSourceNameFormat); - final List<Boolean> results = new ArrayList<>(); - for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); - taskDistribution.reconfigureDistributionStrategy(maxTaskAfterReConfig, expectedSourceNameFormat); - - results.clear(); - for (int taskId = 0; taskId < maxTaskAfterReConfig; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE); - } - -} diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java index 42d10aad7..5d95d6ebd 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java @@ -23,7 +23,6 @@ import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG; import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG; import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG; -import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG; import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; @@ -34,8 +33,6 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; -import java.time.ZonedDateTime; -import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -49,6 +46,8 @@ import io.aiven.kafka.connect.common.source.input.InputFormat; import io.aiven.kafka.connect.common.source.input.TransformerFactory; +import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; +import io.aiven.kafka.connect.common.source.task.HashDistributionStrategy; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient; @@ -58,7 +57,6 @@ import org.apache.avro.Schema; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; @@ -76,28 +74,16 @@ class AwsIntegrationTest implements IntegrationBase { @Container public static final LocalStackContainer LOCALSTACK = IntegrationBase.createS3Container(); - private static String s3Prefix; - private S3Client s3Client; private String s3Endpoint; private BucketAccessor testBucketAccessor; - @Override - public String getS3Prefix() { - return s3Prefix; - } - @Override public S3Client getS3Client() { return s3Client; } - @BeforeAll - static void setUpAll() { - s3Prefix = COMMON_PREFIX + ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + "/"; - } - @BeforeEach void setupAWS() { s3Client = 
IntegrationBase.createS3Client(LOCALSTACK); @@ -118,7 +104,6 @@ private Map<String, String> getConfig(final String topics, final int maxTasks) { config.put(AWS_SECRET_ACCESS_KEY_CONFIG, S3_SECRET_ACCESS_KEY); config.put(AWS_S3_ENDPOINT_CONFIG, s3Endpoint); config.put(AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET_NAME); - config.put(AWS_S3_PREFIX_CONFIG, getS3Prefix()); config.put(TARGET_TOPIC_PARTITIONS, "0,1"); config.put(TARGET_TOPICS, topics); config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); @@ -146,14 +131,14 @@ void sourceRecordIteratorBytesTest(final TestInfo testInfo) { final List<String> offsetKeys = new ArrayList<>(); final List<String> expectedKeys = new ArrayList<>(); // write 2 objects to s3 - expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000")); - expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000")); - expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00001")); - expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00001")); + expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "0")); + expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "0")); + expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "1")); + expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "1")); // we don't expext the empty one. offsetKeys.addAll(expectedKeys); - offsetKeys.add(writeToS3(topicName, new byte[0], "00003")); + offsetKeys.add(writeToS3(topicName, new byte[0], "3")); assertThat(testBucketAccessor.listObjects()).hasSize(5); @@ -165,10 +150,11 @@ void sourceRecordIteratorBytesTest(final TestInfo testInfo) { final OffsetManager offsetManager = new OffsetManager(context, s3SourceConfig); - final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig, new HashSet<>()); + final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig); final Iterator<S3SourceRecord> sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, - TransformerFactory.getTransformer(InputFormat.BYTES), sourceClient); + TransformerFactory.getTransformer(InputFormat.BYTES), sourceClient, new HashDistributionStrategy(1), + FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"), 0); final HashSet<String> seenKeys = new HashSet<>(); while (sourceRecordIterator.hasNext()) { @@ -183,8 +169,10 @@ void sourceRecordIteratorBytesTest(final TestInfo testInfo) { @Test void sourceRecordIteratorAvroTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); + final int maxTasks = 1; + final int taskId = 0; - final Map<String, String> configData = getConfig(topicName, 1); + final Map<String, String> configData = getConfig(topicName, maxTasks); configData.put(INPUT_FORMAT_KEY, InputFormat.AVRO.getValue()); configData.put(VALUE_CONVERTER_KEY, "io.confluent.connect.avro.AvroConverter"); @@ -211,12 +199,12 @@ void sourceRecordIteratorAvroTest(final TestInfo testInfo) throws IOException { final Set<String> offsetKeys = new HashSet<>(); - offsetKeys.add(writeToS3(topicName, outputStream1, "00001")); - offsetKeys.add(writeToS3(topicName, outputStream2, "00001")); + offsetKeys.add(writeToS3(topicName, outputStream1, "1")); + offsetKeys.add(writeToS3(topicName, outputStream2, "1")); - offsetKeys.add(writeToS3(topicName, outputStream3, "00002")); 
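The bytes-format test above shows the new SourceRecordIterator constructor taking a distribution strategy, a compiled file pattern and the task id, with AWSV2SourceClient losing its key-set argument. A wiring sketch under those assumptions follows; the config, offset manager and client construction are elided, and class or package names not visible in this diff should be treated as assumptions.

```java
import java.util.Iterator;
import java.util.regex.Pattern;

import io.aiven.kafka.connect.common.source.input.InputFormat;
import io.aiven.kafka.connect.common.source.input.TransformerFactory;
import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils;
import io.aiven.kafka.connect.common.source.task.HashDistributionStrategy;
import io.aiven.kafka.connect.s3.source.config.S3SourceConfig;
import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient;
import io.aiven.kafka.connect.s3.source.utils.OffsetManager;
import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord;
import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator;

public final class SourceIteratorWiringSketch {
    // Callers supply the already-built config, offset manager and source client, exactly as the
    // integration test does; maxTasks and taskId come from the Connect task context.
    static Iterator<S3SourceRecord> newIterator(final S3SourceConfig config, final OffsetManager offsetManager,
            final AWSV2SourceClient sourceClient, final int maxTasks, final int taskId) {
        final Pattern filePattern = FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}");
        return new SourceRecordIterator(config, offsetManager, TransformerFactory.getTransformer(InputFormat.BYTES),
                sourceClient, new HashDistributionStrategy(maxTasks), filePattern, taskId);
    }
}
```

For the Avro variant the test swaps in InputFormat.AVRO and its matching transformer; everything else stays the same.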
- offsetKeys.add(writeToS3(topicName, outputStream4, "00002")); - offsetKeys.add(writeToS3(topicName, outputStream5, "00002")); + offsetKeys.add(writeToS3(topicName, outputStream3, "2")); + offsetKeys.add(writeToS3(topicName, outputStream4, "2")); + offsetKeys.add(writeToS3(topicName, outputStream5, "2")); assertThat(testBucketAccessor.listObjects()).hasSize(5); @@ -228,10 +216,12 @@ void sourceRecordIteratorAvroTest(final TestInfo testInfo) throws IOException { final OffsetManager offsetManager = new OffsetManager(context, s3SourceConfig); - final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig, new HashSet<>()); + final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig); final Iterator<S3SourceRecord> sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, - TransformerFactory.getTransformer(InputFormat.AVRO), sourceClient); + TransformerFactory.getTransformer(InputFormat.AVRO), sourceClient, + new HashDistributionStrategy(maxTasks), + FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"), taskId); final HashSet<String> seenKeys = new HashSet<>(); final Map<String, List<Long>> seenRecords = new HashMap<>(); @@ -275,15 +265,15 @@ void verifyIteratorRehydration(final TestInfo testInfo) { final List<String> actualKeys = new ArrayList<>(); // write 2 objects to s3 - expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000") + expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "0") .substring((OBJECT_KEY + SEPARATOR).length())); - expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000") + expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "0") .substring((OBJECT_KEY + SEPARATOR).length())); assertThat(testBucketAccessor.listObjects()).hasSize(2); final S3SourceConfig s3SourceConfig = new S3SourceConfig(configData); - final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig, new HashSet<>()); + final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig); final Iterator<S3Object> iter = sourceClient.getS3ObjectIterator(null); assertThat(iter).hasNext(); @@ -296,7 +286,7 @@ void verifyIteratorRehydration(final TestInfo testInfo) { assertThat(actualKeys).containsAll(expectedKeys); // write 3rd object to s3 - expectedKeys.add(writeToS3(topicName, testData3.getBytes(StandardCharsets.UTF_8), "00000") + expectedKeys.add(writeToS3(topicName, testData3.getBytes(StandardCharsets.UTF_8), "0") .substring((OBJECT_KEY + SEPARATOR).length())); assertThat(testBucketAccessor.listObjects()).hasSize(3); diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java index a8b91a197..fa4f60b76 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -102,8 +102,6 @@ static byte[] generateNextAvroMessagesStartingFromId(final int messageId, final S3Client getS3Client(); - String getS3Prefix(); - /** * Write file to s3 with the specified key and data. 
* @@ -134,8 +132,7 @@ default void writeToS3WithKey(final String objectKey, final byte[] testDataBytes * {@link io.aiven.kafka.connect.s3.source.utils.OffsetManager#SEPARATOR} */ default String writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) { - final String objectKey = org.apache.commons.lang3.StringUtils.defaultIfBlank(getS3Prefix(), "") + topicName - + "-" + partitionId + "-" + System.currentTimeMillis() + ".txt"; + final String objectKey = topicName + "-" + partitionId + "-" + System.currentTimeMillis() + ".txt"; writeToS3WithKey(objectKey, testDataBytes); return OBJECT_KEY + SEPARATOR + objectKey; } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index 083d8627e..ad31acc88 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -16,10 +16,13 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.common.config.FileNameFragment.FILE_NAME_TEMPLATE_CONFIG; +import static io.aiven.kafka.connect.common.config.FileNameFragment.FILE_PATH_PREFIX_TEMPLATE_CONFIG; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.AVRO_VALUE_SERIALIZER; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.VALUE_CONVERTER_SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.OBJECT_DISTRIBUTION_STRATEGY; import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG; @@ -27,6 +30,8 @@ import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG; import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_PREFIX_CONFIG; import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; +import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; import static java.util.Map.entry; import static org.assertj.core.api.Assertions.assertThat; import static org.awaitility.Awaitility.await; @@ -36,8 +41,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.time.Duration; -import java.time.ZonedDateTime; -import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -55,17 +58,21 @@ import org.apache.kafka.common.serialization.ByteArrayDeserializer; import io.aiven.kafka.connect.common.source.input.InputFormat; +import io.aiven.kafka.connect.common.source.task.enums.ObjectDistributionStrategy; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; import com.fasterxml.jackson.databind.JsonNode; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; +import org.apache.commons.lang3.StringUtils; import 
org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testcontainers.containers.localstack.LocalStackContainer; @@ -80,7 +87,6 @@ final class IntegrationTest implements IntegrationBase { private static final Logger LOGGER = LoggerFactory.getLogger(IntegrationTest.class); private static final String CONNECTOR_NAME = "aiven-s3-source-connector"; - private static final String COMMON_PREFIX = "s3-source-connector-for-apache-kafka-test-"; private static final int OFFSET_FLUSH_INTERVAL_MS = 500; private static String s3Endpoint; @@ -95,22 +101,16 @@ final class IntegrationTest implements IntegrationBase { private ConnectRunner connectRunner; private static S3Client s3Client; + private TestInfo testInfo; @Override public S3Client getS3Client() { return s3Client; } - @Override - public String getS3Prefix() { - return s3Prefix; - } - public @BeforeAll static void setUpAll() throws IOException, InterruptedException { - s3Prefix = COMMON_PREFIX + ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) + "/"; - s3Client = IntegrationBase.createS3Client(LOCALSTACK); s3Endpoint = LOCALSTACK.getEndpoint().toString(); testBucketAccessor = new BucketAccessor(s3Client, TEST_BUCKET_NAME); @@ -122,6 +122,7 @@ public String getS3Prefix() { @BeforeEach void setUp(final TestInfo testInfo) throws Exception { testBucketAccessor.createBucket(); + this.testInfo = testInfo; connectRunner = new ConnectRunner(OFFSET_FLUSH_INTERVAL_MS); final List<Integer> ports = IntegrationBase.getKafkaListenerPorts(); @@ -151,10 +152,25 @@ void tearDown() { testBucketAccessor.removeBucket(); } - @Test - void bytesTest(final TestInfo testInfo) { + @ParameterizedTest + @ValueSource(booleans = { true, false }) + void bytesTest(final boolean addPrefix) { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1); + final ObjectDistributionStrategy objectDistributionStrategy; + final int partitionId = 0; + final String prefixPattern = "topics/{{topic}}/partition={{partition}}/"; + String s3Prefix = ""; + if (addPrefix) { + objectDistributionStrategy = ObjectDistributionStrategy.PARTITION_IN_FILENAME; + s3Prefix = "topics/" + topicName + "/partition=" + partitionId + "/"; + } else { + objectDistributionStrategy = ObjectDistributionStrategy.PARTITION_IN_FILENAME; + } + + final String fileNamePatternSeparator = "_"; + + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1, objectDistributionStrategy, + addPrefix, s3Prefix, prefixPattern, fileNamePatternSeparator); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); @@ -165,11 +181,15 @@ void bytesTest(final TestInfo testInfo) { final List<String> offsetKeys = new ArrayList<>(); // write 2 objects to s3 - offsetKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00000")); - offsetKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00000")); - offsetKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "00001")); - offsetKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "00001")); - 
offsetKeys.add(writeToS3(topicName, new byte[0], "00003")); + offsetKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "0", s3Prefix, + fileNamePatternSeparator)); + offsetKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "0", s3Prefix, + fileNamePatternSeparator)); + offsetKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "1", s3Prefix, + fileNamePatternSeparator)); + offsetKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "1", s3Prefix, + fileNamePatternSeparator)); + offsetKeys.add(writeToS3(topicName, new byte[0], "3", s3Prefix, "-")); assertThat(testBucketAccessor.listObjects()).hasSize(5); @@ -190,7 +210,9 @@ void bytesTest(final TestInfo testInfo) { @Test void avroTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getAvroConfig(topicName, InputFormat.AVRO); + final boolean addPrefix = false; + final Map<String, String> connectorConfig = getAvroConfig(topicName, InputFormat.AVRO, addPrefix, "", "", + ObjectDistributionStrategy.OBJECT_HASH); connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); @@ -215,12 +237,14 @@ void avroTest(final TestInfo testInfo) throws IOException { final Set<String> offsetKeys = new HashSet<>(); - offsetKeys.add(writeToS3(topicName, outputStream1, "00001")); - offsetKeys.add(writeToS3(topicName, outputStream2, "00001")); + final String s3Prefix = ""; + + offsetKeys.add(writeToS3(topicName, outputStream1, "1", s3Prefix, "-")); + offsetKeys.add(writeToS3(topicName, outputStream2, "1", s3Prefix, "-")); - offsetKeys.add(writeToS3(topicName, outputStream3, "00002")); - offsetKeys.add(writeToS3(topicName, outputStream4, "00002")); - offsetKeys.add(writeToS3(topicName, outputStream5, "00002")); + offsetKeys.add(writeToS3(topicName, outputStream3, "2", s3Prefix, "-")); + offsetKeys.add(writeToS3(topicName, outputStream4, "2", s3Prefix, "-")); + offsetKeys.add(writeToS3(topicName, outputStream5, "2", s3Prefix, "-")); assertThat(testBucketAccessor.listObjects()).hasSize(5); @@ -244,16 +268,26 @@ void avroTest(final TestInfo testInfo) throws IOException { connectRunner.getBootstrapServers()); } - @Test - void parquetTest(final TestInfo testInfo) throws IOException { + @ParameterizedTest + @ValueSource(booleans = { true, false }) + void parquetTest(final boolean addPrefix) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); - final String partition = "00000"; - final String fileName = org.apache.commons.lang3.StringUtils.defaultIfBlank(getS3Prefix(), "") + topicName + "-" - + partition + "-" + System.currentTimeMillis() + ".txt"; + final String partition = "0"; + final ObjectDistributionStrategy objectDistributionStrategy; + final String prefixPattern = "bucket/topics/{{topic}}/partition/{{partition}}/"; + String s3Prefix = ""; + objectDistributionStrategy = ObjectDistributionStrategy.PARTITION_IN_FILENAME; + if (addPrefix) { + s3Prefix = "bucket/topics/" + topicName + "/partition/" + partition + "/"; + } + + final String fileName = (StringUtils.isNotBlank(s3Prefix) ? 
s3Prefix : "") + topicName + "-" + partition + "-" + + System.currentTimeMillis() + ".txt"; final String name = "testuser"; - final Map<String, String> connectorConfig = getAvroConfig(topicName, InputFormat.PARQUET); + final Map<String, String> connectorConfig = getAvroConfig(topicName, InputFormat.PARQUET, addPrefix, s3Prefix, + prefixPattern, objectDistributionStrategy); connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); final Path path = ContentUtils.getTmpFilePath(name); @@ -275,8 +309,11 @@ void parquetTest(final TestInfo testInfo) throws IOException { .containsExactlyInAnyOrderElementsOf(expectedRecordNames); } - private Map<String, String> getAvroConfig(final String topicName, final InputFormat inputFormat) { - final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 4); + private Map<String, String> getAvroConfig(final String topicName, final InputFormat inputFormat, + final boolean addPrefix, final String s3Prefix, final String prefixPattern, + final ObjectDistributionStrategy objectDistributionStrategy) { + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 4, objectDistributionStrategy, + addPrefix, s3Prefix, prefixPattern, "-"); connectorConfig.put(INPUT_FORMAT_KEY, inputFormat.getValue()); connectorConfig.put(SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); connectorConfig.put(VALUE_CONVERTER_KEY, "io.confluent.connect.avro.AvroConverter"); @@ -288,7 +325,8 @@ private Map<String, String> getAvroConfig(final String topicName, final InputFor @Test void jsonTest(final TestInfo testInfo) { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1); + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1, + ObjectDistributionStrategy.PARTITION_IN_FILENAME, false, "", "", "-"); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.JSONL.getValue()); connectorConfig.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.json.JsonConverter"); @@ -301,7 +339,7 @@ void jsonTest(final TestInfo testInfo) { } final byte[] jsonBytes = jsonBuilder.toString().getBytes(StandardCharsets.UTF_8); - final String offsetKey = writeToS3(topicName, jsonBytes, "00001"); + final String offsetKey = writeToS3(topicName, jsonBytes, "1", "", "-"); // Poll Json messages from the Kafka topic and deserialize them final List<JsonNode> records = IntegrationBase.consumeJsonMessages(topicName, 500, @@ -316,25 +354,36 @@ void jsonTest(final TestInfo testInfo) { verifyOffsetPositions(Map.of(offsetKey, 500), connectRunner.getBootstrapServers()); } - private Map<String, String> getConfig(final String connectorName, final String topics, final int maxTasks) { - final Map<String, String> config = new HashMap<>(basicS3ConnectorConfig()); + private Map<String, String> getConfig(final String connectorName, final String topics, final int maxTasks, + final ObjectDistributionStrategy taskDistributionConfig, final boolean addPrefix, final String s3Prefix, + final String prefixPattern, final String fileNameSeparator) { + final Map<String, String> config = new HashMap<>(basicS3ConnectorConfig(addPrefix, s3Prefix)); config.put("name", connectorName); config.put(TARGET_TOPICS, topics); config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); config.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.converters.ByteArrayConverter"); config.put("tasks.max", String.valueOf(maxTasks)); + 
config.put(OBJECT_DISTRIBUTION_STRATEGY, taskDistributionConfig.value()); + config.put(FILE_NAME_TEMPLATE_CONFIG, + "{{topic}}" + fileNameSeparator + "{{partition}}" + fileNameSeparator + "{{start_offset}}"); + if (addPrefix) { + config.put(FILE_PATH_PREFIX_TEMPLATE_CONFIG, prefixPattern); + } return config; } - private static Map<String, String> basicS3ConnectorConfig() { + private static Map<String, String> basicS3ConnectorConfig(final boolean addPrefix, final String s3Prefix) { final Map<String, String> config = new HashMap<>(); config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); config.put(AWS_ACCESS_KEY_ID_CONFIG, S3_ACCESS_KEY_ID); config.put(AWS_SECRET_ACCESS_KEY_CONFIG, S3_SECRET_ACCESS_KEY); config.put(AWS_S3_ENDPOINT_CONFIG, s3Endpoint); config.put(AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET_NAME); - config.put(AWS_S3_PREFIX_CONFIG, s3Prefix); + if (addPrefix) { + config.put(AWS_S3_PREFIX_CONFIG, s3Prefix); + } config.put(TARGET_TOPIC_PARTITIONS, "0,1"); + return config; } @@ -351,4 +400,12 @@ static void verifyOffsetPositions(final Map<String, Object> expectedRecords, fin }); } } + + String writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId, + final String s3Prefix, final String separator) { + final String objectKey = (StringUtils.isNotBlank(s3Prefix) ? s3Prefix : "") + topicName + separator + + partitionId + separator + System.currentTimeMillis() + ".txt"; + writeToS3WithKey(objectKey, testDataBytes); + return OBJECT_KEY + SEPARATOR + objectKey; + } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 1bfc55580..3ed3fdafd 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -16,17 +16,21 @@ package io.aiven.kafka.connect.s3.source; -import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Objects; -import java.util.Set; +import java.util.regex.Pattern; import org.apache.kafka.connect.source.SourceRecord; import io.aiven.kafka.connect.common.config.SourceCommonConfig; import io.aiven.kafka.connect.common.source.AbstractSourceTask; import io.aiven.kafka.connect.common.source.input.Transformer; +import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; +import io.aiven.kafka.connect.common.source.task.DistributionStrategy; +import io.aiven.kafka.connect.common.source.task.HashDistributionStrategy; +import io.aiven.kafka.connect.common.source.task.PartitionDistributionStrategy; +import io.aiven.kafka.connect.common.source.task.enums.ObjectDistributionStrategy; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; @@ -63,12 +67,13 @@ public class S3SourceTask extends AbstractSourceTask { /** The AWS Source client */ private AWSV2SourceClient awsv2SourceClient; - /** The list of failed object keys */ - private final Set<String> failedObjectKeys = new HashSet<>(); /** The offset manager this task uses */ private OffsetManager offsetManager; private S3SourceConfig s3SourceConfig; + private int taskId; + private Pattern filePattern; + public S3SourceTask() { super(LOGGER); } @@ -130,9 +135,9 @@ protected SourceCommonConfig configure(final Map<String, String> props) { 
this.s3SourceConfig = new S3SourceConfig(props); this.transformer = s3SourceConfig.getTransformer(); offsetManager = new OffsetManager(context, s3SourceConfig); - awsv2SourceClient = new AWSV2SourceClient(s3SourceConfig, failedObjectKeys); - setS3SourceRecordIterator( - new SourceRecordIterator(s3SourceConfig, offsetManager, this.transformer, awsv2SourceClient)); + awsv2SourceClient = new AWSV2SourceClient(s3SourceConfig); + setS3SourceRecordIterator(new SourceRecordIterator(s3SourceConfig, offsetManager, this.transformer, + awsv2SourceClient, initializeObjectDistributionStrategy(), filePattern, taskId)); return s3SourceConfig; } @@ -173,4 +178,23 @@ protected void closeResources() { public Transformer getTransformer() { return transformer; } + + private DistributionStrategy initializeObjectDistributionStrategy() { + final ObjectDistributionStrategy objectDistributionStrategy = s3SourceConfig.getObjectDistributionStrategy(); + final int maxTasks = Integer.parseInt(s3SourceConfig.originals().get("tasks.max").toString()); + this.taskId = Integer.parseInt(s3SourceConfig.originals().get("task.id").toString()) % maxTasks; + DistributionStrategy distributionStrategy; + + if (objectDistributionStrategy == ObjectDistributionStrategy.PARTITION_IN_FILENAME) { + this.filePattern = FilePatternUtils + .configurePattern(s3SourceConfig.getS3FileNameFragment().getFilenameTemplate().toString()); + distributionStrategy = new PartitionDistributionStrategy(maxTasks); + } else { + this.filePattern = FilePatternUtils + .configurePattern(s3SourceConfig.getS3FileNameFragment().getFilenameTemplate().toString()); + distributionStrategy = new HashDistributionStrategy(maxTasks); + } + + return distributionStrategy; + } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index 23dc69e9e..ebcffdba5 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -42,9 +42,11 @@ final public class S3SourceConfig extends SourceCommonConfig { public static final Logger LOGGER = LoggerFactory.getLogger(S3SourceConfig.class); private final S3ConfigFragment s3ConfigFragment; + private final FileNameFragment s3FileNameFragment; public S3SourceConfig(final Map<String, String> properties) { super(configDef(), handleDeprecatedYyyyUppercase(properties)); s3ConfigFragment = new S3ConfigFragment(this); + s3FileNameFragment = new FileNameFragment(this); validate(); // NOPMD ConstructorCallsOverridableMethod getStsRole is called } @@ -129,4 +131,8 @@ public S3ConfigFragment getS3ConfigFragment() { return s3ConfigFragment; } + public FileNameFragment getS3FileNameFragment() { + return s3FileNameFragment; + } + } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java index ed460a500..d9dbc0d45 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java @@ -17,10 +17,8 @@ package io.aiven.kafka.connect.s3.source.utils; import java.io.InputStream; -import java.util.HashSet; import java.util.Iterator; import java.util.Objects; -import java.util.Set; 
import java.util.function.Predicate; import java.util.stream.Stream; @@ -29,8 +27,6 @@ import org.apache.commons.io.function.IOSupplier; import org.apache.commons.lang3.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import software.amazon.awssdk.core.ResponseBytes; import software.amazon.awssdk.services.s3.S3Client; import software.amazon.awssdk.services.s3.model.GetObjectRequest; @@ -44,26 +40,19 @@ */ public class AWSV2SourceClient { - private static final Logger LOGGER = LoggerFactory.getLogger(AWSV2SourceClient.class); public static final int PAGE_SIZE_FACTOR = 2; private final S3SourceConfig s3SourceConfig; private final S3Client s3Client; private final String bucketName; private Predicate<S3Object> filterPredicate = s3Object -> s3Object.size() > 0; - private final Set<String> failedObjectKeys; - - private final int taskId; - private final int maxTasks; /** * @param s3SourceConfig * configuration for Source connector - * @param failedObjectKeys - * all objectKeys which have already been tried but have been unable to process. */ - public AWSV2SourceClient(final S3SourceConfig s3SourceConfig, final Set<String> failedObjectKeys) { - this(new S3ClientFactory().createAmazonS3Client(s3SourceConfig), s3SourceConfig, failedObjectKeys); + public AWSV2SourceClient(final S3SourceConfig s3SourceConfig) { + this(new S3ClientFactory().createAmazonS3Client(s3SourceConfig), s3SourceConfig); } /** @@ -73,47 +62,11 @@ public AWSV2SourceClient(final S3SourceConfig s3SourceConfig, final Set<String> * amazonS3Client * @param s3SourceConfig * configuration for Source connector - * @param failedObjectKeys - * all objectKeys which have already been tried but have been unable to process. */ - AWSV2SourceClient(final S3Client s3Client, final S3SourceConfig s3SourceConfig, - final Set<String> failedObjectKeys) { + AWSV2SourceClient(final S3Client s3Client, final S3SourceConfig s3SourceConfig) { this.s3SourceConfig = s3SourceConfig; this.s3Client = s3Client; this.bucketName = s3SourceConfig.getAwsS3BucketName(); - this.failedObjectKeys = new HashSet<>(failedObjectKeys); - - // TODO the code below should be configured in some sort of taks assignement method/process/call. 
- int maxTasks; - try { - final Object value = s3SourceConfig.originals().get("tasks.max"); - if (value == null) { - LOGGER.info("Setting tasks.max to 1"); - maxTasks = 1; - } else { - maxTasks = Integer.parseInt(value.toString()); - } - } catch (NumberFormatException e) { // NOPMD catch null pointer - LOGGER.warn("Invalid tasks.max: {}", e.getMessage()); - LOGGER.info("Setting tasks.max to 1"); - maxTasks = 1; - } - this.maxTasks = maxTasks; - int taskId; - try { - final Object value = s3SourceConfig.originals().get("task.id"); - if (value == null) { - LOGGER.info("Setting task.id to 0"); - taskId = 0; - } else { - taskId = Integer.parseInt(value.toString()) % maxTasks; - } - } catch (NumberFormatException e) { // NOPMD catch null pointer - LOGGER.warn("Invalid task.id: {}", e.getMessage()); - LOGGER.info("Setting task.id to 0"); - taskId = 0; - } - this.taskId = taskId; } /** @@ -142,12 +95,7 @@ private Stream<S3Object> getS3ObjectStream(final String startToken) { return null; } - }) - .flatMap(response -> response.contents() - .stream() - .filter(filterPredicate) - .filter(objectSummary -> assignObjectToTask(objectSummary.key())) - .filter(objectSummary -> !failedObjectKeys.contains(objectSummary.key()))); + }).flatMap(response -> response.contents().stream().filter(filterPredicate)); } /** @@ -180,23 +128,14 @@ public IOSupplier<InputStream> getObject(final String objectKey) { return s3ObjectResponse::asInputStream; } - public void addFailedObjectKeys(final String objectKey) { - this.failedObjectKeys.add(objectKey); - } - - public void setFilterPredicate(final Predicate<S3Object> predicate) { - filterPredicate = predicate; - } - - private boolean assignObjectToTask(final String objectKey) { - final int taskAssignment = Math.floorMod(objectKey.hashCode(), maxTasks); - return taskAssignment == taskId; - } - public void shutdown() { s3Client.close(); } + public void addPredicate(final Predicate<S3Object> objectPredicate) { + this.filterPredicate = this.filterPredicate.and(objectPredicate); + } + /** * An iterator that reads from */ diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java index e945c2565..cab511693 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -42,7 +42,6 @@ public static SourceRecord createSourceRecord(final S3SourceRecord s3SourceRecor if (ErrorsTolerance.NONE.equals(s3SourceConfig.getErrorsTolerance())) { throw new ConnectException("Data Exception caught during S3 record to source record transformation", e); } else { - sourceClient.addFailedObjectKeys(s3SourceRecord.getObjectKey()); LOGGER.warn( "Data Exception caught during S3 record to source record transformation {} . 
errors.tolerance set to 'all', logging warning and continuing to process.", e.getMessage(), e); diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index bded51d1b..820be20aa 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -19,9 +19,8 @@ import java.util.Collections; import java.util.Iterator; import java.util.Map; +import java.util.Optional; import java.util.function.Function; -import java.util.function.Predicate; -import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Stream; @@ -29,9 +28,10 @@ import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; import io.aiven.kafka.connect.common.source.input.Transformer; +import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; +import io.aiven.kafka.connect.common.source.task.DistributionStrategy; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; -import org.apache.commons.collections4.IteratorUtils; import software.amazon.awssdk.services.s3.model.S3Object; /** @@ -39,11 +39,6 @@ * Parquet). */ public final class SourceRecordIterator implements Iterator<S3SourceRecord> { - public static final String PATTERN_TOPIC_KEY = "topicName"; - public static final String PATTERN_PARTITION_KEY = "partitionId"; - - public static final Pattern FILE_DEFAULT_PATTERN = Pattern.compile("(?<topicName>[^/]+?)-" - + "(?<partitionId>\\d{5})-" + "(?<uniqueId>[a-zA-Z0-9]+)" + "\\.(?<fileExtension>[^.]+)$"); // topic-00001.txt public static final long BYTES_TRANSFORMATION_NUM_OF_RECS = 1L; private final OffsetManager offsetManager; @@ -59,25 +54,17 @@ public final class SourceRecordIterator implements Iterator<S3SourceRecord> { private String topic; private int partitionId; + private final DistributionStrategy distributionStrategy; + private final int taskId; + private final Iterator<S3Object> inner; private Iterator<S3SourceRecord> outer; - - private final Predicate<S3Object> fileNamePredicate = s3Object -> { - - final Matcher fileMatcher = FILE_DEFAULT_PATTERN.matcher(s3Object.key()); - - if (fileMatcher.find()) { - // TODO move this from the SourceRecordIterator so that we can decouple it from S3 and make it API agnostic - topic = fileMatcher.group(PATTERN_TOPIC_KEY); - partitionId = Integer.parseInt(fileMatcher.group(PATTERN_PARTITION_KEY)); - return true; - } - return false; - }; + private final Pattern filePattern; public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final OffsetManager offsetManager, - final Transformer transformer, final AWSV2SourceClient sourceClient) { + final Transformer transformer, final AWSV2SourceClient sourceClient, + final DistributionStrategy distributionStrategy, final Pattern filePattern, final int taskId) { super(); this.s3SourceConfig = s3SourceConfig; this.offsetManager = offsetManager; @@ -85,13 +72,35 @@ public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final OffsetMan this.bucketName = s3SourceConfig.getAwsS3BucketName(); this.transformer = transformer; this.sourceClient = sourceClient; + this.filePattern = filePattern; + this.distributionStrategy = distributionStrategy; + this.taskId = taskId; + + // Initialize predicates + sourceClient.addPredicate(this::isFileMatchingPattern); + 
sourceClient.addPredicate(this::isFileAssignedToTask); // call filters out bad file names and extracts topic/partition - inner = IteratorUtils.filteredIterator(sourceClient.getS3ObjectIterator(null), - s3Object -> this.fileNamePredicate.test(s3Object)); + inner = sourceClient.getS3ObjectIterator(null); outer = Collections.emptyIterator(); } + public boolean isFileMatchingPattern(final S3Object s3Object) { + final Optional<String> optionalTopic = FilePatternUtils.getTopic(filePattern, s3Object.key()); + final Optional<Integer> optionalPartitionId = FilePatternUtils.getPartitionId(filePattern, s3Object.key()); + + if (optionalTopic.isPresent() && optionalPartitionId.isPresent()) { + topic = optionalTopic.get(); + partitionId = optionalPartitionId.get(); + return true; + } + return false; + } + + public boolean isFileAssignedToTask(final S3Object s3Object) { + return distributionStrategy.isPartOfTask(taskId, s3Object.key(), filePattern); + } + @Override public boolean hasNext() { while (!outer.hasNext() && inner.hasNext()) { diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index 944ccbfdf..c915376c9 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -174,6 +174,7 @@ private void setBasicProperties() { properties.putIfAbsent("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); properties.putIfAbsent("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); properties.putIfAbsent("tasks.max", "1"); + properties.put("task.id", "1"); properties.putIfAbsent("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); properties.putIfAbsent(TARGET_TOPIC_PARTITIONS, "0,1"); properties.putIfAbsent(TARGET_TOPICS, "testtopic"); diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java index beed0681c..1a160d780 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java @@ -35,8 +35,6 @@ import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.CsvSource; import org.mockito.ArgumentCaptor; import org.mockito.Captor; import software.amazon.awssdk.services.s3.S3Client; @@ -53,19 +51,16 @@ class AWSV2SourceClientTest { @Captor ArgumentCaptor<ListObjectsV2Request> requestCaptor; - private static Map<String, String> getConfigMap(final int maxTasks, final int taskId) { + private static Map<String, String> getConfigMap() { final Map<String, String> configMap = new HashMap<>(); - configMap.put("tasks.max", String.valueOf(maxTasks)); - configMap.put("task.id", String.valueOf(taskId)); configMap.put(AWS_S3_BUCKET_NAME_CONFIG, "test-bucket"); return configMap; } - @ParameterizedTest - @CsvSource({ "3, 1" }) - void testFetchObjectSummariesWithNoObjects(final int maxTasks, final int taskId) { - initializeWithTaskConfigs(maxTasks, taskId); + @Test + void testFetchObjectSummariesWithNoObjects() { + initializeWithTaskConfigs(); final 
ListObjectsV2Response listObjectsV2Response = createListObjectsV2Response(Collections.emptyList(), null); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Response); @@ -73,54 +68,31 @@ void testFetchObjectSummariesWithNoObjects(final int maxTasks, final int taskId) assertThat(summaries).isExhausted(); } - @ParameterizedTest - @CsvSource({ "1, 0" }) - void testFetchObjectSummariesWithOneObjectWithBasicConfig(final int maxTasks, final int taskId) { + @Test + void testFetchObjectSummariesWithOneObjectWithBasicConfig() { final String objectKey = "any-key"; - initializeWithTaskConfigs(maxTasks, taskId); - final Iterator<String> summaries = getS3ObjectKeysIterator(objectKey); - assertThat(summaries).hasNext(); - } - - @ParameterizedTest - @CsvSource({ "4, 2, key1", "4, 3, key2", "4, 0, key3", "4, 1, key4" }) - void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdAssigned(final int maxTasks, final int taskId, - final String objectKey) { - initializeWithTaskConfigs(maxTasks, taskId); + initializeWithTaskConfigs(); final Iterator<String> summaries = getS3ObjectKeysIterator(objectKey); assertThat(summaries).hasNext(); } - @ParameterizedTest - @CsvSource({ "4, 1, key1", "4, 3, key1", "4, 0, key1", "4, 1, key2", "4, 2, key2", "4, 0, key2", "4, 1, key3", - "4, 2, key3", "4, 3, key3", "4, 0, key4", "4, 2, key4", "4, 3, key4" }) - void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdUnassigned(final int maxTasks, final int taskId, - final String objectKey) { - initializeWithTaskConfigs(maxTasks, taskId); - final Iterator<String> summaries = getS3ObjectKeysIterator(objectKey); - - assertThat(summaries).isExhausted(); - } - - @ParameterizedTest - @CsvSource({ "4, 3", "4, 0" }) - void testFetchObjectSummariesWithZeroByteObject(final int maxTasks, final int taskId) { - initializeWithTaskConfigs(maxTasks, taskId); + @Test + void testFetchObjectSummariesWithZeroByteObject() { + initializeWithTaskConfigs(); final ListObjectsV2Response listObjectsV2Response = getListObjectsV2Response(); when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Response); final Iterator<String> summaries = awsv2SourceClient.getListOfObjectKeys(null); - // assigned 1 object to taskid - assertThat(summaries).hasNext(); + assertThat(summaries.next()).isNotBlank(); assertThat(summaries.next()).isNotBlank(); assertThat(summaries).isExhausted(); } @Test void testFetchObjectSummariesWithPagination() throws IOException { - initializeWithTaskConfigs(4, 3); + initializeWithTaskConfigs(); final S3Object object1 = createObjectSummary(1, "key1"); final S3Object object2 = createObjectSummary(2, "key2"); final List<S3Object> firstBatch = List.of(object1); @@ -134,19 +106,19 @@ void testFetchObjectSummariesWithPagination() throws IOException { final Iterator<String> summaries = awsv2SourceClient.getListOfObjectKeys(null); verify(s3Client, times(1)).listObjectsV2(any(ListObjectsV2Request.class)); assertThat(summaries.next()).isNotNull(); - assertThat(summaries).isExhausted(); + assertThat(summaries.next()).isNotNull(); } @Test void testFetchObjectWithPrefix() { - final Map<String, String> configMap = getConfigMap(1, 0); + final Map<String, String> configMap = getConfigMap(); configMap.put(AWS_S3_PREFIX_CONFIG, "test/"); final S3SourceConfig s3SourceConfig = new S3SourceConfig(configMap); s3Client = mock(S3Client.class); - awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig, Collections.emptySet()); + awsv2SourceClient = new 
AWSV2SourceClient(s3Client, s3SourceConfig); requestCaptor = ArgumentCaptor.forClass(ListObjectsV2Request.class); - final S3Object object1 = createObjectSummary(1, "key1"); - final S3Object object2 = createObjectSummary(1, "key2"); + final S3Object object1 = createObjectSummary(1, "topics/key1/1/key1.txt"); + final S3Object object2 = createObjectSummary(1, "topics/key2/2/key2.txt"); final ListObjectsV2Response firstResult = createListObjectsV2Response(List.of(object1), "nextToken"); final ListObjectsV2Response secondResult = createListObjectsV2Response(List.of(object2), null); @@ -167,19 +139,18 @@ void testFetchObjectWithPrefix() { // Not required with continuation token assertThat(allRequests.get(1).prefix()).isNull(); assertThat(allRequests.get(1).continuationToken()).isEqualTo("nextToken"); - } @Test void testFetchObjectWithInitialStartAfter() { - final Map<String, String> configMap = getConfigMap(1, 0); + final Map<String, String> configMap = getConfigMap(); final String startAfter = "file-option-1-12000.txt"; final S3SourceConfig s3SourceConfig = new S3SourceConfig(configMap); s3Client = mock(S3Client.class); - awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig, Collections.emptySet()); + awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig); requestCaptor = ArgumentCaptor.forClass(ListObjectsV2Request.class); - final S3Object object1 = createObjectSummary(1, "key1"); - final S3Object object2 = createObjectSummary(1, "key2"); + final S3Object object1 = createObjectSummary(1, "key1-1-10000"); + final S3Object object2 = createObjectSummary(1, "key2-2-20000"); final ListObjectsV2Response firstResult = createListObjectsV2Response(List.of(object1), "nextToken"); final ListObjectsV2Response secondResult = createListObjectsV2Response(List.of(object2), null); @@ -227,12 +198,11 @@ private Iterator<String> getS3ObjectKeysIterator(final String objectKey) { return awsv2SourceClient.getListOfObjectKeys(null); } - public void initializeWithTaskConfigs(final int maxTasks, final int taskId) { - final Map<String, String> configMap = getConfigMap(maxTasks, taskId); + private void initializeWithTaskConfigs() { + final Map<String, String> configMap = getConfigMap(); final S3SourceConfig s3SourceConfig = new S3SourceConfig(configMap); s3Client = mock(S3Client.class); - awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig, Collections.emptySet()); - + awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig); } private ListObjectsV2Response getListObjectsV2Response() { diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index af9b679fa..f7559ddfd 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -16,6 +16,8 @@ package io.aiven.kafka.connect.s3.source.utils; +import static io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils.PATTERN_PARTITION_KEY; +import static io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils.PATTERN_TOPIC_KEY; import static io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator.BYTES_TRANSFORMATION_NUM_OF_RECS; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.anyMap; @@ -35,6 +37,9 @@ import 
java.util.Arrays; import java.util.Collections; import java.util.Iterator; +import java.util.function.Predicate; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.stream.Stream; import org.apache.kafka.connect.data.SchemaAndValue; @@ -44,10 +49,14 @@ import io.aiven.kafka.connect.common.source.input.InputFormat; import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.common.source.input.TransformerFactory; +import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; +import io.aiven.kafka.connect.common.source.task.HashDistributionStrategy; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; import software.amazon.awssdk.services.s3.model.S3Object; final class SourceRecordIteratorTest { @@ -78,35 +87,38 @@ void testIteratorProcessesS3Objects() throws Exception { mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + final Pattern filePattern = mock(Pattern.class); when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Collections.emptyIterator()); Iterator<S3SourceRecord> iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, - mockSourceApiClient); + mockSourceApiClient, new HashDistributionStrategy(1), + FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"), 0); assertThat(iterator.hasNext()).isFalse(); + mockPatternMatcher(filePattern); final S3Object obj = S3Object.builder().key(key).build(); final ByteArrayInputStream bais = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8)); when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(obj).iterator()); when(mockSourceApiClient.getObject(any())).thenReturn(() -> bais); - iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, mockSourceApiClient); + iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, mockSourceApiClient, + new HashDistributionStrategy(1), filePattern, 0); - assertThat(iterator).hasNext(); + assertThat(iterator.hasNext()).isTrue(); assertThat(iterator.next()).isNotNull(); - assertThat(iterator).isExhausted(); } } @Test void testIteratorProcessesS3ObjectsForByteArrayTransformer() throws Exception { - final String key = "topic-00001-abc123.txt"; final S3Object s3Object = S3Object.builder().key(key).build(); // With ByteArrayTransformer try (InputStream inputStream = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8))) { when(mockSourceApiClient.getObject(key)).thenReturn(() -> inputStream); + final Pattern filePattern = mock(Pattern.class); when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(s3Object).iterator()); @@ -120,10 +132,11 @@ void testIteratorProcessesS3ObjectsForByteArrayTransformer() throws Exception { .thenReturn(Collections.singletonList(key).listIterator()); when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); + mockPatternMatcher(filePattern); // should skip if any records were produced by source record iterator. 
final Iterator<S3SourceRecord> iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, - mockTransformer, mockSourceApiClient); + mockTransformer, mockSourceApiClient, new HashDistributionStrategy(1), filePattern, 0); assertThat(iterator.hasNext()).isFalse(); verify(mockSourceApiClient, never()).getObject(any()); verify(mockTransformer, never()).getRecords(any(), anyString(), anyInt(), any(), anyLong()); @@ -132,6 +145,7 @@ void testIteratorProcessesS3ObjectsForByteArrayTransformer() throws Exception { // With AvroTransformer try (InputStream inputStream = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8))) { when(mockSourceApiClient.getObject(key)).thenReturn(() -> inputStream); + final Pattern filePattern = mock(Pattern.class); when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(s3Object).iterator()); mockTransformer = mock(AvroTransformer.class); when(mockSourceApiClient.getListOfObjectKeys(any())) @@ -139,18 +153,73 @@ void testIteratorProcessesS3ObjectsForByteArrayTransformer() throws Exception { when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); + mockPatternMatcher(filePattern); when(mockTransformer.getKeyData(anyString(), anyString(), any())).thenReturn(SchemaAndValue.NULL); when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) .thenReturn(Arrays.asList(SchemaAndValue.NULL).stream()); final Iterator<S3SourceRecord> iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, - mockTransformer, mockSourceApiClient); - assertThat(iterator.hasNext()).isTrue(); - iterator.next(); + mockTransformer, mockSourceApiClient, new HashDistributionStrategy(1), filePattern, 0); + assertThat(iterator.hasNext()).isFalse(); - verify(mockTransformer, times(1)).getRecords(any(), anyString(), anyInt(), any(), anyLong()); + verify(mockTransformer, times(0)).getRecords(any(), anyString(), anyInt(), any(), anyLong()); } } + @ParameterizedTest + @CsvSource({ "4, 2, key1", "4, 3, key2", "4, 0, key3", "4, 1, key4" }) + void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdAssigned(final int maxTasks, final int taskId, + final String objectKey) { + + mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + final Pattern filePattern = mock(Pattern.class); + + mockPatternMatcher(filePattern); + + final S3Object obj = S3Object.builder().key(objectKey).build(); + + final ByteArrayInputStream bais = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8)); + when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(obj).iterator()); + when(mockSourceApiClient.getObject(any())).thenReturn(() -> bais); + final SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, + mockSourceApiClient, new HashDistributionStrategy(maxTasks), filePattern, taskId); + final Predicate<S3Object> s3ObjectPredicate = s3Object -> iterator.isFileMatchingPattern(s3Object) + && iterator.isFileAssignedToTask(s3Object); + // Assert + assertThat(s3ObjectPredicate).accepts(obj); + } + + @ParameterizedTest + @CsvSource({ "4, 1, topic1-2-0", "4, 3, key1", "4, 0, key1", "4, 1, key2", "4, 2, key2", "4, 0, key2", "4, 1, key3", + "4, 2, key3", "4, 3, key3", "4, 0, key4", "4, 2, key4", "4, 3, key4" }) + void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdUnassigned(final int maxTasks, final int 
taskId, + final String objectKey) { + mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + final Pattern filePattern = mock(Pattern.class); + + mockPatternMatcher(filePattern); + + final S3Object obj = S3Object.builder().key(objectKey).build(); + + final ByteArrayInputStream bais = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8)); + when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(obj).iterator()); + when(mockSourceApiClient.getObject(any())).thenReturn(() -> bais); + final SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, + mockSourceApiClient, new HashDistributionStrategy(maxTasks), filePattern, taskId); + final Predicate<S3Object> stringPredicate = s3Object -> iterator.isFileMatchingPattern(s3Object) + && iterator.isFileAssignedToTask(s3Object); + // Assert + assertThat(stringPredicate.test(obj)).as("Predicate should not accept the objectKey: " + objectKey).isFalse(); + } + + private static void mockPatternMatcher(final Pattern filePattern) { + final Matcher fileMatcher = mock(Matcher.class); + when(filePattern.matcher(any())).thenReturn(fileMatcher); + when(fileMatcher.find()).thenReturn(true); + when(fileMatcher.group(PATTERN_TOPIC_KEY)).thenReturn("testtopic"); + when(fileMatcher.group(PATTERN_PARTITION_KEY)).thenReturn("0"); + } } From 343f23c9602816db1f6785e0cdef353a751406fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aindri=C3=BA=20Lavelle?= <121855584+aindriu-aiven@users.noreply.github.com> Date: Mon, 20 Jan 2025 09:53:15 +0000 Subject: [PATCH 88/90] Update the S3 source connector to use a later version of Kafka (#383) Previously this was on version 1.1.0; this change upgrades it to version 3.3.0. This will allow us in the future to use the exactly-once KIP that was introduced in 3.3.0 without causing any upgrade path issues for users.
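For reference only: once we choose to adopt it, the exactly-once source support that 3.3.0 introduces (KIP-618) is driven by worker and connector configuration along these lines. This change does not enable it, and the values below are purely illustrative, not part of this patch.

```properties
# Connect worker configuration (Kafka 3.3+), illustrative only
exactly.once.source.support=enabled

# Connector configuration, illustrative only
exactly.once.support=required
transaction.boundary=poll
```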
Signed-off-by: Aindriu Lavelle <aindriu.lavelle@aiven.io> --- s3-source-connector/build.gradle.kts | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts index db1b4a7d0..5d8e44ac7 100644 --- a/s3-source-connector/build.gradle.kts +++ b/s3-source-connector/build.gradle.kts @@ -21,7 +21,7 @@ plugins { id("aiven-apache-kafka-connectors-all.java-conventions") } val amazonS3Version by extra("2.29.34") val amazonSTSVersion by extra("2.29.34") val s3mockVersion by extra("0.2.6") -val testKafkaVersion by extra("3.7.1") +val kafkaVersion by extra("3.3.0") val integrationTest: SourceSet = sourceSets.create("integrationTest") { @@ -62,8 +62,8 @@ idea { } dependencies { - compileOnly(apache.kafka.connect.api) - compileOnly(apache.kafka.connect.runtime) + compileOnly("org.apache.kafka:connect-api:$kafkaVersion") + compileOnly("org.apache.kafka:connect-runtime:$kafkaVersion") implementation(apache.commons.collection4) implementation(project(":commons")) @@ -81,9 +81,9 @@ dependencies { testImplementation(compressionlibs.snappy) testImplementation(compressionlibs.zstd.jni) - testImplementation(apache.kafka.connect.api) - testImplementation(apache.kafka.connect.runtime) - testImplementation(apache.kafka.connect.json) + testImplementation("org.apache.kafka:connect-api:$kafkaVersion") + testImplementation("org.apache.kafka:connect-runtime:$kafkaVersion") + testImplementation("org.apache.kafka:connect-json:$kafkaVersion") testImplementation(testinglibs.junit.jupiter) testImplementation(testinglibs.assertj.core) @@ -184,11 +184,11 @@ dependencies { exclude(group = "io.netty", module = "netty") } - integrationTestImplementation("org.apache.kafka:connect-runtime:${testKafkaVersion}:test") - integrationTestImplementation("org.apache.kafka:connect-runtime:${testKafkaVersion}") - integrationTestImplementation("org.apache.kafka:kafka-clients:${testKafkaVersion}:test") - integrationTestImplementation("org.apache.kafka:kafka_2.13:${testKafkaVersion}:test") - integrationTestImplementation("org.apache.kafka:kafka_2.13:${testKafkaVersion}") + integrationTestImplementation("org.apache.kafka:connect-runtime:${kafkaVersion}:test") + integrationTestImplementation("org.apache.kafka:connect-runtime:${kafkaVersion}") + integrationTestImplementation("org.apache.kafka:kafka-clients:${kafkaVersion}:test") + integrationTestImplementation("org.apache.kafka:kafka_2.13:${kafkaVersion}:test") + integrationTestImplementation("org.apache.kafka:kafka_2.13:${kafkaVersion}") // Make test utils from 'test' available in 'integration-test' integrationTestImplementation(sourceSets["test"].output) From f68b0a442fb2a390dc9365ec3d10162709dc4c9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aindri=C3=BA=20Lavelle?= <121855584+aindriu-aiven@users.noreply.github.com> Date: Mon, 20 Jan 2025 12:48:53 +0000 Subject: [PATCH 89/90] Adding the context to the source connectors (#388) Updating and refactoring the distribution strategies based on feedback. 
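As a rough sketch of the idea behind the refactor (not the connector's actual API; the class and method names below are hypothetical), each task builds a regex from the configured filename template, extracts the context (topic, partition, start offset) of an object key, and asks a distribution strategy whether that key belongs to it. The hash-based assignment mirrors the Math.floorMod(objectKey.hashCode(), maxTasks) check used by the hash strategy:

```java
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hypothetical, self-contained sketch of object key distribution; not the project's API.
public final class DistributionSketch {

    // Roughly what a "{{topic}}-{{partition}}-{{start_offset}}" template expands to.
    private static final Pattern FILE_PATTERN = Pattern
            .compile("(?<topic>[^/]+?)-(?<partition>\\d+)-(?<startOffset>\\d+)\\.[^.]+$");

    // Hash-based assignment: each object key is owned by exactly one of the maxTasks tasks.
    static boolean isAssignedToTask(final String objectKey, final int taskId, final int maxTasks) {
        return Math.floorMod(objectKey.hashCode(), maxTasks) == taskId;
    }

    // The "context" of an object key: here just the topic, extracted via the named group.
    static Optional<String> topicOf(final String objectKey) {
        final Matcher matcher = FILE_PATTERN.matcher(objectKey);
        return matcher.find() ? Optional.of(matcher.group("topic")) : Optional.empty();
    }

    public static void main(final String[] args) {
        final String key = "testtopic-0-1737370000000.txt";
        System.out.println("topic = " + topicOf(key).orElse("<no match>"));
        System.out.println("assigned to task 0 of 4? " + isAssignedToTask(key, 0, 4));
    }
}
```

A partition-aware strategy follows the same shape, but keys off the extracted partition number instead of the key hash.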
--------- Signed-off-by: Aindriu Lavelle <aindriu.lavelle@aiven.io> --- .../connect/common/config/CommonConfig.java | 23 ++ .../common/config/SourceCommonConfig.java | 6 +- .../common/config/SourceConfigFragment.java | 30 +- .../source/input/utils/FilePatternUtils.java | 126 +++++-- .../connect/common/source/task/Context.java | 71 ++++ .../source/task/DistributionStrategy.java | 49 ++- .../common/source/task/DistributionType.java | 88 +++++ .../source/task/HashDistributionStrategy.java | 58 ---- .../task/PartitionDistributionStrategy.java | 84 ----- .../enums/ObjectDistributionStrategy.java | 48 --- .../input/utils/FilePatternUtilsTest.java | 63 ++++ .../task/HashDistributionStrategyTest.java | 124 +++++-- .../PartitionDistributionStrategyTest.java | 277 ++++++--------- .../connect/s3/source/AwsIntegrationTest.java | 25 +- .../connect/s3/source/IntegrationTest.java | 39 ++- .../AivenKafkaConnectS3SourceConnector.java | 4 +- .../kafka/connect/s3/source/S3SourceTask.java | 31 +- .../s3/source/utils/SourceRecordIterator.java | 72 ++-- .../connect/s3/source/S3SourceTaskTest.java | 6 +- .../utils/SourceRecordIteratorTest.java | 326 ++++++++++++------ 20 files changed, 921 insertions(+), 629 deletions(-) create mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/task/Context.java create mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionType.java delete mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategy.java delete mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategy.java delete mode 100644 commons/src/main/java/io/aiven/kafka/connect/common/source/task/enums/ObjectDistributionStrategy.java create mode 100644 commons/src/test/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtilsTest.java diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/CommonConfig.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/CommonConfig.java index 8c4683a34..0242d40b7 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/CommonConfig.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/CommonConfig.java @@ -27,6 +27,8 @@ public class CommonConfig extends AbstractConfig { protected static final String GROUP_COMPRESSION = "File Compression"; protected static final String GROUP_FORMAT = "Format"; + public static final String TASK_ID = "task.id"; + public static final String MAX_TASKS = "tasks.max"; /** * @deprecated No longer needed. 
@@ -58,4 +60,25 @@ public Long getKafkaRetryBackoffMs() { return new BackoffPolicyConfig(this).getKafkaRetryBackoffMs(); } + /** + * + * Get the maximum number of tasks that should be run by this connector configuration Max Tasks is set within the + * Kafka Connect framework and so is retrieved slightly differently in ConnectorConfig.java + * + * @return The maximum number of tasks that should be run by this connector configuration + */ + public int getMaxTasks() { + // TODO when Connect framework is upgraded it will be possible to retrieve this information from the configDef + // as tasksMax + return Integer.parseInt(this.originalsStrings().get(MAX_TASKS)); + } + /** + * Get the task id for this configuration + * + * @return The task id for this configuration + */ + public int getTaskId() { + return Integer.parseInt(this.originalsStrings().get(TASK_ID)); + } + } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java index 2c9cafe61..68036bd68 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java @@ -24,7 +24,7 @@ import io.aiven.kafka.connect.common.source.input.InputFormat; import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.common.source.input.TransformerFactory; -import io.aiven.kafka.connect.common.source.task.enums.ObjectDistributionStrategy; +import io.aiven.kafka.connect.common.source.task.DistributionType; public class SourceCommonConfig extends CommonConfig { @@ -70,8 +70,8 @@ public ErrorsTolerance getErrorsTolerance() { return sourceConfigFragment.getErrorsTolerance(); } - public ObjectDistributionStrategy getObjectDistributionStrategy() { - return sourceConfigFragment.getObjectDistributionStrategy(); + public DistributionType getDistributionType() { + return sourceConfigFragment.getDistributionType(); } public int getMaxPollRecords() { diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java index f3955a7e3..7f5d6276f 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java @@ -16,14 +16,16 @@ package io.aiven.kafka.connect.common.config; -import static io.aiven.kafka.connect.common.source.task.enums.ObjectDistributionStrategy.OBJECT_HASH; -import static io.aiven.kafka.connect.common.source.task.enums.ObjectDistributionStrategy.PARTITION_IN_FILENAME; +import static io.aiven.kafka.connect.common.source.task.DistributionType.OBJECT_HASH; + +import java.util.Arrays; +import java.util.stream.Collectors; import org.apache.kafka.common.config.AbstractConfig; import org.apache.kafka.common.config.ConfigDef; import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; -import io.aiven.kafka.connect.common.source.task.enums.ObjectDistributionStrategy; +import io.aiven.kafka.connect.common.source.task.DistributionType; import org.apache.commons.lang3.StringUtils; @@ -36,7 +38,7 @@ public final class SourceConfigFragment extends ConfigFragment { public static final String TARGET_TOPICS = "topics"; public static final String ERRORS_TOLERANCE = "errors.tolerance"; - public static final String OBJECT_DISTRIBUTION_STRATEGY = 
"object.distribution.strategy"; + public static final String DISTRIBUTION_TYPE = "distribution.type"; /** * Construct the ConfigFragment.. @@ -74,13 +76,15 @@ public static ConfigDef update(final ConfigDef configDef) { configDef.define(TARGET_TOPICS, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "eg : connect-storage-offsets", GROUP_OFFSET_TOPIC, offsetStorageGroupCounter++, ConfigDef.Width.NONE, TARGET_TOPICS); - configDef.define(OBJECT_DISTRIBUTION_STRATEGY, ConfigDef.Type.STRING, OBJECT_HASH.name(), + configDef.define(DISTRIBUTION_TYPE, ConfigDef.Type.STRING, OBJECT_HASH.name(), new ObjectDistributionStrategyValidator(), ConfigDef.Importance.MEDIUM, - "Based on tasks.max config and this strategy, objects are processed in distributed" - + " way by Kafka connect workers, supported values : " + OBJECT_HASH + ", " - + PARTITION_IN_FILENAME, - GROUP_OTHER, offsetStorageGroupCounter++, ConfigDef.Width.NONE, OBJECT_DISTRIBUTION_STRATEGY); // NOPMD - // UnusedAssignment + "Based on tasks.max config and the type of strategy selected, objects are processed in distributed" + + " way by Kafka connect workers, supported values : " + + Arrays.stream(DistributionType.values()) + .map(DistributionType::value) + .collect(Collectors.joining(", ")), + GROUP_OTHER, offsetStorageGroupCounter++, ConfigDef.Width.NONE, DISTRIBUTION_TYPE); // NOPMD + // UnusedAssignment return configDef; } @@ -105,8 +109,8 @@ public ErrorsTolerance getErrorsTolerance() { return ErrorsTolerance.forName(cfg.getString(ERRORS_TOLERANCE)); } - public ObjectDistributionStrategy getObjectDistributionStrategy() { - return ObjectDistributionStrategy.forName(cfg.getString(OBJECT_DISTRIBUTION_STRATEGY)); + public DistributionType getDistributionType() { + return DistributionType.forName(cfg.getString(DISTRIBUTION_TYPE)); } private static class ErrorsToleranceValidator implements ConfigDef.Validator { @@ -126,7 +130,7 @@ public void ensureValid(final String name, final Object value) { final String objectDistributionStrategy = (String) value; if (StringUtils.isNotBlank(objectDistributionStrategy)) { // This will throw an Exception if not a valid value. - ObjectDistributionStrategy.forName(objectDistributionStrategy); + DistributionType.forName(objectDistributionStrategy); } } } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtils.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtils.java index 546c0c4c4..3f78431ea 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtils.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtils.java @@ -22,12 +22,22 @@ import org.apache.kafka.common.config.ConfigException; +import io.aiven.kafka.connect.common.source.task.Context; + import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +/** + * FilePatternUtils allows the construction of a regex pattern to extract the + * {@link io.aiven.kafka.connect.common.source.task.Context Context} from an Object Key. + * + */ public final class FilePatternUtils { - + private static final Logger LOGGER = LoggerFactory.getLogger(FilePatternUtils.class); public static final String PATTERN_PARTITION_KEY = "partition"; public static final String PATTERN_TOPIC_KEY = "topic"; + public static final String PATTERN_START_OFFSET_KEY = "startOffset"; // no undercore allowed as it breaks the regex. 
public static final String START_OFFSET_PATTERN = "{{start_offset}}"; public static final String TIMESTAMP_PATTERN = "{{timestamp}}"; public static final String PARTITION_PATTERN = "{{" + PATTERN_PARTITION_KEY + "}}"; @@ -36,20 +46,47 @@ public final class FilePatternUtils { // Use a named group to return the partition in a complex string to always get the correct information for the // partition number. public static final String PARTITION_NAMED_GROUP_REGEX_PATTERN = "(?<" + PATTERN_PARTITION_KEY + ">\\d+)"; + public static final String START_OFFSET_NAMED_GROUP_REGEX_PATTERN = "(?<" + PATTERN_START_OFFSET_KEY + ">\\d+)"; public static final String NUMBER_REGEX_PATTERN = "(?:\\d+)"; public static final String TOPIC_NAMED_GROUP_REGEX_PATTERN = "(?<" + PATTERN_TOPIC_KEY + ">[a-zA-Z0-9\\-_.]+)"; + public static final String START_OFFSET = "Start offset"; - private FilePatternUtils() { - // hidden + final Pattern pattern; + private final boolean startOffsetConfigured; + private final boolean partitionConfigured; + private final boolean topicConfigured; + + /** + * Creates an instance of FilePatternUtils, this constructor is used to configure the Pattern that is used to + * extract Context from Object 'K'. + * + * @param pattern + */ + public FilePatternUtils(final String pattern) { + this.pattern = configurePattern(pattern); + startOffsetConfigured = pattern.contains(START_OFFSET_PATTERN); + partitionConfigured = pattern.contains(PARTITION_PATTERN); + topicConfigured = pattern.contains(TOPIC_PATTERN); } - public static Pattern configurePattern(final String expectedSourceNameFormat) { - if (expectedSourceNameFormat == null || !expectedSourceNameFormat.contains(PARTITION_PATTERN)) { - throw new ConfigException(String.format( - "Source name format %s missing partition pattern {{partition}} please configure the expected source to include the partition pattern.", - expectedSourceNameFormat)); + + /** + * Sets a Regex Pattern based on initial configuration that allows group regex to be used to extract information + * from the toString() of Object K which is passed in for Context extraction. + * + * @param expectedSourceNameFormat + * This is a string in the expected compatible format which will allow object name or keys to have unique + * information such as partition number, topic name, offset and timestamp information. + * @return A pattern which is configured to allow extraction of the key information from object names and keys. 
+ */ + private Pattern configurePattern(final String expectedSourceNameFormat) { + if (expectedSourceNameFormat == null) { + throw new ConfigException( + "Source name format is missing please configure the expected source to include the partition pattern."); } + // Build REGEX Matcher - String regexString = StringUtils.replace(expectedSourceNameFormat, START_OFFSET_PATTERN, NUMBER_REGEX_PATTERN); + String regexString = StringUtils.replace(expectedSourceNameFormat, START_OFFSET_PATTERN, + START_OFFSET_NAMED_GROUP_REGEX_PATTERN); regexString = StringUtils.replace(regexString, TIMESTAMP_PATTERN, NUMBER_REGEX_PATTERN); regexString = StringUtils.replace(regexString, TOPIC_PATTERN, TOPIC_NAMED_GROUP_REGEX_PATTERN); regexString = StringUtils.replace(regexString, PARTITION_PATTERN, PARTITION_NAMED_GROUP_REGEX_PATTERN); @@ -62,26 +99,71 @@ public static Pattern configurePattern(final String expectedSourceNameFormat) { } } - public static Optional<String> getTopic(final Pattern filePattern, final String sourceName) { - return matchPattern(filePattern, sourceName).map(matcher -> matcher.group(PATTERN_TOPIC_KEY)); + public <K extends Comparable<K>> Optional<Context<K>> process(final K sourceName) { + final Optional<Matcher> matcher = fileMatches(sourceName.toString()); + if (matcher.isPresent()) { + final Context<K> ctx = new Context<>(sourceName); + getTopic(matcher.get(), sourceName.toString()).ifPresent(ctx::setTopic); + getPartitionId(matcher.get(), sourceName.toString()).ifPresent(ctx::setPartition); + getOffset(matcher.get(), sourceName.toString()).ifPresent(ctx::setOffset); + return Optional.of(ctx); + } + return Optional.empty(); + + } + + private Optional<Matcher> fileMatches(final String sourceName) { + return matchPattern(sourceName); } - public static Optional<Integer> getPartitionId(final Pattern filePattern, final String sourceName) { - return matchPattern(filePattern, sourceName).flatMap(matcher -> { - try { - return Optional.of(Integer.parseInt(matcher.group(PATTERN_PARTITION_KEY))); - } catch (NumberFormatException e) { - return Optional.empty(); + private Optional<String> getTopic(final Matcher matcher, final String sourceName) { + + try { + return Optional.of(matcher.group(PATTERN_TOPIC_KEY)); + } catch (IllegalArgumentException ex) { + // It is possible that when checking for the group it does not match and returns an + // illegalArgumentException + if (topicConfigured) { + LOGGER.warn("Unable to extract Topic from {} and 'topics' not configured.", sourceName); } - }); + return Optional.empty(); + } + } - private static Optional<Matcher> matchPattern(final Pattern filePattern, final String sourceName) { - if (filePattern == null || sourceName == null) { - throw new IllegalArgumentException("filePattern and sourceName must not be null"); + private Optional<Integer> getPartitionId(final Matcher matcher, final String sourceName) { + try { + return Optional.of(Integer.parseInt(matcher.group(PATTERN_PARTITION_KEY))); + } catch (IllegalArgumentException e) { + // It is possible that when checking for the group it does not match and returns an + // illegalStateException, Number format exception is also covered by this in this case. 
+ if (partitionConfigured) { + LOGGER.warn("Unable to extract Partition id from {}.", sourceName); + } + return Optional.empty(); + } + + } + + private Optional<Integer> getOffset(final Matcher matcher, final String sourceName) { + try { + return Optional.of(Integer.parseInt(matcher.group(PATTERN_START_OFFSET_KEY))); + } catch (IllegalArgumentException e) { + // It is possible that when checking for the group it does not match and returns an + // illegalStateException, Number format exception is also covered by this in this case. + if (startOffsetConfigured) { + LOGGER.warn("Unable to extract start offset from {}.", sourceName); + } + return Optional.empty(); } - final Matcher matcher = filePattern.matcher(sourceName); + } + + private Optional<Matcher> matchPattern(final String sourceName) { + if (sourceName == null) { + throw new IllegalArgumentException("filePattern and sourceName must not be null"); + } + final Matcher matcher = pattern.matcher(sourceName); return matcher.find() ? Optional.of(matcher) : Optional.empty(); } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/Context.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/Context.java new file mode 100644 index 000000000..265ade6db --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/Context.java @@ -0,0 +1,71 @@ +/* + * Copyright 2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.source.task; + +import java.util.Optional; + +/** + * A Context which captures all the details about the source which are required to successfully send a source record + * onto Kafka + * + * @param <K> + * is is the type/class of the key unique to the object the context is being created about + */ +public class Context<K extends Comparable<K>> { + + private String topic; + private Integer partition; + private Integer offset; + private K storageKey; + + public Context(final K storageKey) { + + this.storageKey = storageKey; + } + + public Optional<String> getTopic() { + return Optional.ofNullable(topic); + } + + public void setTopic(final String topic) { + this.topic = topic; + } + + public Optional<Integer> getPartition() { + return Optional.ofNullable(partition); + } + + public void setPartition(final Integer partition) { + this.partition = partition; + } + + public Optional<K> getStorageKey() { + return Optional.ofNullable(storageKey); + } + + public void setStorageKey(final K storageKey) { + this.storageKey = storageKey; + } + + public Optional<Integer> getOffset() { + return Optional.ofNullable(offset); + } + + public void setOffset(final Integer offset) { + this.offset = offset; + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionStrategy.java index 8d370c689..8644889c0 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionStrategy.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionStrategy.java @@ -16,29 +16,45 @@ package io.aiven.kafka.connect.common.source.task; -import java.util.regex.Pattern; +import java.util.Optional; +import java.util.function.Function; /** * An {@link DistributionStrategy} provides a mechanism to share the work of processing records from objects (or files) * into tasks, which are subsequently processed (potentially in parallel) by Kafka Connect workers. * <p> - * The number of objects in cloud storage can be very high, and they are distributed amongst tasks to minimize the - * overhead of assigning work to Kafka worker threads. All objects assigned to the same task will be processed together - * sequentially by the same worker, which can be useful for maintaining order between objects. There are usually fewer - * workers than tasks, and they will be assigned the remaining tasks as work completes. + * The number of objects in cloud storage can be very high, selecting a distribution strategy allows the connector to + * know how to distribute the load across Connector tasks and in some cases using an appropriate strategy can also + * decide on maintaining a level of ordering between messages as well. */ -public interface DistributionStrategy { +public final class DistributionStrategy { + private int maxTasks; + private final Function<Context<?>, Optional<Long>> mutation; + public final static int UNDEFINED = -1; + + public DistributionStrategy(final Function<Context<?>, Optional<Long>> creator, final int maxTasks) { + assertPositiveInteger(maxTasks); + this.mutation = creator; + this.maxTasks = maxTasks; + } + + private static void assertPositiveInteger(final int sourceInt) { + if (sourceInt <= 0) { + throw new IllegalArgumentException("tasks.max must be set to a positive number and at least 1."); + } + } + /** - * Check if the object should be processed by the task with the given {@code taskId}. 
Any single object should be - * assigned deterministically to a single taskId. + * Retrieve the taskId that this object should be processed by. Any single object will be assigned deterministically + * to a single taskId, that will be always return the same taskId output given the same context is used. * - * @param taskId - * a task ID, usually for the currently running task - * @param valueToBeEvaluated - * The value to be evaluated to determine if it should be processed by the task. - * @return true if the task should process the object, false if it should not. + * @param ctx + * This is the context which contains optional values for the partition, topic and storage key name + * @return the taskId which this particular task should be assigned to. */ - boolean isPartOfTask(int taskId, String valueToBeEvaluated, Pattern filePattern); + public int getTaskFor(final Context<?> ctx) { + return mutation.apply(ctx).map(aLong -> Math.floorMod(aLong, maxTasks)).orElse(UNDEFINED); + } /** * When a connector receives a reconfigure event this method should be called to ensure that the distribution @@ -47,5 +63,8 @@ public interface DistributionStrategy { * @param maxTasks * The maximum number of tasks created for the Connector */ - void configureDistributionStrategy(int maxTasks); + public void configureDistributionStrategy(final int maxTasks) { + assertPositiveInteger(maxTasks); + this.maxTasks = maxTasks; + } } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionType.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionType.java new file mode 100644 index 000000000..9010e8b8d --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionType.java @@ -0,0 +1,88 @@ +/* + * Copyright 2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.task; + +import java.util.Arrays; +import java.util.Objects; +import java.util.Optional; +import java.util.function.Function; + +import org.apache.kafka.common.config.ConfigException; + +public enum DistributionType { + + /** + * Object_Hash takes the context and uses the storage key implementation to get a hash value of the storage key and + * return a modulus of that relative to the number of maxTasks to decide which task should process a given object + */ + OBJECT_HASH("object_hash", + context -> context.getStorageKey().isPresent() + ? Optional.of((long) context.getStorageKey().get().hashCode()) + : Optional.empty()), + /** + * Partition takes the context and requires the context contain the partition id for it to be able to decide the + * distribution across the max tasks, using a modulus to ensure even distribution against the configured max tasks + */ + PARTITION("partition", + context -> context.getPartition().isPresent() + ? 
Optional.of((long) context.getPartition().get()) + : Optional.empty()); + + private final String name; + private final Function<Context<?>, Optional<Long>> mutation; + + public String value() { + return name; + } + + /** + * Get the Object distribution strategy for the configured ObjectDistributionStrategy + * + * @param name + * the name of the ObjectDistributionStrategy + * @param mutation + * the mutation required to get the correct details from the context for distribution + */ + DistributionType(final String name, final Function<Context<?>, Optional<Long>> mutation) { + this.name = name; + this.mutation = mutation; + } + + public static DistributionType forName(final String name) { + Objects.requireNonNull(name, "name cannot be null"); + for (final DistributionType distributionType : DistributionType.values()) { + if (distributionType.name.equalsIgnoreCase(name)) { + return distributionType; + } + } + throw new ConfigException(String.format("Unknown distribution.type : %s, allowed values %s ", name, + Arrays.toString(DistributionType.values()))); + } + + /** + * Returns a configured Distribution Strategy + * + * @param maxTasks + * the maximum number of configured tasks for this connector + * + * @return a configured Distribution Strategy with the correct mutation configured for proper distribution across + * tasks of objects being processed. + */ + public DistributionStrategy getDistributionStrategy(final int maxTasks) { + return new DistributionStrategy(mutation, maxTasks); + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategy.java deleted file mode 100644 index 4928f30d9..000000000 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategy.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.common.source.task; - -import java.util.regex.Pattern; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * {@link HashDistributionStrategy} evenly distributes cloud storage objects between tasks using the hashcode of the - * object's filename, which is uniformly distributed and deterministic across workers. - * <p> - * This is well-suited to use cases where the order of events between records from objects is not important, especially - * when ingesting files into Kafka that were not previously created by a supported cloud storage Sink. 
- */ -public final class HashDistributionStrategy implements DistributionStrategy { - private final static Logger LOG = LoggerFactory.getLogger(HashDistributionStrategy.class); - private int maxTasks; - public HashDistributionStrategy(final int maxTasks) { - configureDistributionStrategy(maxTasks); - } - - @Override - public boolean isPartOfTask(final int taskId, final String filenameToBeEvaluated, final Pattern filePattern) { - if (filenameToBeEvaluated == null) { - LOG.warn("Ignoring as it is not passing a correct filename to be evaluated."); - return false; - } - final int taskAssignment = Math.floorMod(filenameToBeEvaluated.hashCode(), maxTasks); - // floor mod returns the remainder of a division so will start at 0 and move up - // tasks start at 0 so there should be no issue. - return taskAssignment == taskId; - } - - @Override - public void configureDistributionStrategy(final int maxTasks) { - this.maxTasks = maxTasks; - } - - public void setMaxTasks(final int maxTasks) { - this.maxTasks = maxTasks; - } -} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategy.java deleted file mode 100644 index 25f22dfc0..000000000 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategy.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.common.source.task; - -import java.util.Optional; -import java.util.regex.Pattern; - -import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * The {@link PartitionDistributionStrategy} finds a partition in the object's filename by matching it to an expected - * format, and assigns all partitions to the same task. - * <p> - * This useful when a sink connector has created the object name in a format like - * {@code topicname-{{partition}}-{{start_offset}}}, and we want all objects with the same partition to be processed - * within a single task. - */ -public final class PartitionDistributionStrategy implements DistributionStrategy { - private final static Logger LOG = LoggerFactory.getLogger(PartitionDistributionStrategy.class); - private int maxTasks; - - public PartitionDistributionStrategy(final int maxTasks) { - this.maxTasks = maxTasks; - } - - /** - * - * @param sourceNameToBeEvaluated - * is the filename/table name of the source for the connector. 
- * @return Predicate to confirm if the given source name matches - */ - @Override - public boolean isPartOfTask(final int taskId, final String sourceNameToBeEvaluated, final Pattern filePattern) { - if (sourceNameToBeEvaluated == null) { - LOG.warn("Ignoring as it is not passing a correct filename to be evaluated."); - return false; - } - final Optional<Integer> optionalPartitionId = FilePatternUtils.getPartitionId(filePattern, - sourceNameToBeEvaluated); - - if (optionalPartitionId.isPresent()) { - return optionalPartitionId.get() < maxTasks - ? taskMatchesPartition(taskId, optionalPartitionId.get()) - : taskMatchesPartition(taskId, optionalPartitionId.get() % maxTasks); - } - LOG.warn("Unable to find the partition from this file name {}", sourceNameToBeEvaluated); - return false; - } - - boolean taskMatchesPartition(final int taskId, final int partitionId) { - // The partition id and task id are both expected to start at 0 but if the task id is changed to start at 1 this - // will break. - return taskId == partitionId; - } - - /** - * When a connector reconfiguration event is received this method should be called to ensure the correct strategy is - * being implemented by the connector. - * - * @param maxTasks - * maximum number of configured tasks for this connector - */ - @Override - public void configureDistributionStrategy(final int maxTasks) { - this.maxTasks = maxTasks; - } -} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/enums/ObjectDistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/enums/ObjectDistributionStrategy.java deleted file mode 100644 index 26c1efa94..000000000 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/enums/ObjectDistributionStrategy.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2025 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.aiven.kafka.connect.common.source.task.enums; - -import java.util.Arrays; -import java.util.Objects; - -import org.apache.kafka.common.config.ConfigException; - -public enum ObjectDistributionStrategy { - - OBJECT_HASH("object_hash"), PARTITION_IN_FILENAME("partition_in_filename"); - - private final String name; - - public String value() { - return name; - } - - ObjectDistributionStrategy(final String name) { - this.name = name; - } - - public static ObjectDistributionStrategy forName(final String name) { - Objects.requireNonNull(name, "name cannot be null"); - for (final ObjectDistributionStrategy objectDistributionStrategy : ObjectDistributionStrategy.values()) { - if (objectDistributionStrategy.name.equalsIgnoreCase(name)) { - return objectDistributionStrategy; - } - } - throw new ConfigException(String.format("Unknown object.distribution.strategy type: %s, allowed values %s ", - name, Arrays.toString(ObjectDistributionStrategy.values()))); - } -} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtilsTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtilsTest.java new file mode 100644 index 000000000..70ed07e7f --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtilsTest.java @@ -0,0 +1,63 @@ +/* + * Copyright 2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.source.input.utils; + +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + +import java.util.Optional; + +import io.aiven.kafka.connect.common.source.task.Context; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; + +class FilePatternUtilsTest { + + @ParameterizedTest + @CsvSource({ "{{topic}}-1.txt, logs-1.txt, logs", "{{topic}}-{{partition}}.txt,logs-1.txt, logs", + "{{topic}}-{{partition}}.txt,logs2-1.txt, logs2", "{{topic}}-{{partition}}.txt, logs2-1.txt, logs2" }) + void checkTopicDistribution(final String expectedSourceFormat, final String sourceName, + final String expectedTopic) { + + final FilePatternUtils utils = new FilePatternUtils(expectedSourceFormat); + final Optional<Context<String>> ctx = utils.process(sourceName); + assertThat(ctx.isPresent()).isTrue(); + assertThat(ctx.get().getTopic().isPresent()).isTrue(); + assertThat(ctx.get().getTopic().get()).isEqualTo(expectedTopic); + } + + @ParameterizedTest + @CsvSource({ "{{topic}}-{{partition}}-{{start_offset}}.txt, logs2-1-0001.txt, logs2,1,0001", + "{{topic}}-{{start_offset}}-{{partition}}.txt, logs2-0001-1.txt, logs2,0001,1", + "{{topic}}-{{start_offset}}-{{partition}}.txt, logs2-99999-1.txt, logs2,1,99999", + "{{partition}}-{{start_offset}}-{{topic}}.txt, logs2-1-logs2.txt, logs2,2,0001", + "{{partition}}-{{start_offset}}-{{topic}}.txt, logs2-1-logs2.txt, logs2,2,0001", }) + void checkTopicDistribution(final String expectedSourceFormat, final String sourceName, final String expectedTopic, + final int expectedPartition, final int expectedOffset) { + + final FilePatternUtils utils = new FilePatternUtils(expectedSourceFormat); + final Optional<Context<String>> ctx = utils.process(sourceName); + assertThat(ctx.isPresent()).isTrue(); + assertThat(ctx.get().getTopic().isPresent()).isTrue(); + assertThat(ctx.get().getTopic().get()).isEqualTo(expectedTopic); + assertThat(ctx.get().getPartition().isPresent()).isTrue(); + assertThat(ctx.get().getPartition().get()).isEqualTo(expectedPartition); + assertThat(ctx.get().getOffset().isPresent()).isTrue(); + assertThat(ctx.get().getOffset().get()).isEqualTo(expectedOffset); + } + +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategyTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategyTest.java index 50ef73964..c76eb1ce7 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategyTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategyTest.java @@ -20,6 +20,7 @@ import java.util.ArrayList; import java.util.List; +import java.util.Optional; import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; @@ -27,49 +28,122 @@ import org.junit.jupiter.params.provider.CsvSource; final class HashDistributionStrategyTest { - + final DistributionType strategy = DistributionType.OBJECT_HASH; @ParameterizedTest @CsvSource({ "logs-0-0002.txt", "logs-1-0002.txt", "logs-2-0002.txt", "logs-3-0002.txt", "logs-4-0002.txt", - "logs-5-0002.txt", "logs-6-0002.txt", "logs-7-0002.txt", "logs-8-0002.txt", "logs-9-0002.txt", "key-0.txt", - "logs-1-0002.txt", "key-0002.txt", "logs-3-0002.txt", "key-0002.txt", "logs-5-0002.txt", "value-6-0002.txt", - "logs-7-0002.txt", "anImage8-0002.png", - "reallylongfilenamecreatedonS3tohisdesomedata and alsohassome spaces.txt" }) + "logs-5-0002.txt", 
"logs-6-0002.txt", "logs-7-0002.txt", "logs-8-0002.txt", "logs-9-0002.txt", + "logs-1-0002.txt", "logs-3-0002.txt", "logs-5-0002.txt", "value-6-0002.txt", "logs-7-0002.txt" }) void hashDistributionExactlyOnce(final String path) { final int maxTaskId = 10; - final DistributionStrategy taskDistribution = new HashDistributionStrategy(maxTaskId); - final List<Boolean> results = new ArrayList<>(); + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTaskId); + final Context<String> ctx = getContext("{{topic}}-{{partition}}-{{start_offset}}", path); + + final List<Integer> results = new ArrayList<>(); for (int taskId = 0; taskId < maxTaskId; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"))); + results.add(taskDistribution.getTaskFor(ctx)); } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); } @ParameterizedTest @CsvSource({ "logs-0-0002.txt", "logs-1-0002.txt", "logs-2-0002.txt", "logs-3-0002.txt", "logs-4-0002.txt", - "logs-5-0002.txt", "logs-6-0002.txt", "logs-7-0002.txt", "logs-8-0002.txt", "logs-9-0002.txt", "key-0.txt", - "logs-1-0002.txt", "key-0002.txt", "logs-3-0002.txt", "key-0002.txt", "logs-5-0002.txt", "value-6-0002.txt", - "logs-7-0002.txt", "anImage8-0002.png", - "reallylongfilenamecreatedonS3tohisdesomedata and alsohassome spaces.txt" }) + "logs-5-0002.txt", "logs-6-0002.txt", "logs-7-0002.txt", "logs-8-0002.txt", "logs-9-0002.txt", + "logs-1-0002.txt", "logs-3-0002.txt", "logs-5-0002.txt", "value-6-0002.txt", "logs-7-0002.txt" }) void hashDistributionExactlyOnceWithReconfigureEvent(final String path) { int maxTasks = 10; - final DistributionStrategy taskDistribution = new HashDistributionStrategy(maxTasks); - final List<Boolean> results = new ArrayList<>(); + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final Context<String> ctx = getContext("{{topic}}-{{partition}}-{{start_offset}}", path); + + final List<Integer> results = new ArrayList<>(); for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"))); + results.add(taskDistribution.getTaskFor(ctx)); } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); results.clear(); maxTasks = 5; taskDistribution.configureDistributionStrategy(maxTasks); for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"))); + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + } + + @ParameterizedTest + @CsvSource({ "key-0.txt", "key-0002.txt", "key-0002.txt", "anImage8-0002.png", + "reallylongfilenamecreatedonS3tohisdesomedata and alsohassome spaces.txt" }) + void hashDistributionExactlyOnceWithReconfigureEventAndMatchAllExpectedSource(final String path) { + int maxTasks = 10; + final 
DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final Context<String> ctx = getContext(".*", path); + + final List<Integer> results = new ArrayList<>(); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + results.clear(); + maxTasks = 5; + taskDistribution.configureDistributionStrategy(maxTasks); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + } + + @ParameterizedTest + @CsvSource({ "-0", "-1", "-999", "-01", "-2002020" }) + void hashDistributionWithNegativeValues(final int hashCode) { + final int maxTasks = 10; + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final FilePatternUtils utils = new FilePatternUtils(".*"); + final Optional<Context<HashCodeKey>> ctx = utils.process(new HashCodeKey(hashCode)); + + assertThat(ctx).isPresent(); + final int result = taskDistribution.getTaskFor(ctx.get()); + + assertThat(result).isLessThan(maxTasks); + assertThat(result).isGreaterThanOrEqualTo(0); + + } + + private Context<String> getContext(final String expectedSourceName, final String filename) { + final FilePatternUtils utils = new FilePatternUtils(expectedSourceName); + final Optional<Context<String>> ctx = utils.process(filename); + assertThat(ctx.isPresent()).isTrue(); + // Hash distribution can have an empty context can have an empty context + return ctx.get(); + } + + static class HashCodeKey implements Comparable<HashCodeKey> { + private final int hashCodeValue; + public HashCodeKey(final int hashCodeValue) { + this.hashCodeValue = hashCodeValue; + } + + private int getHashCodeValue() { + return hashCodeValue; + } + + @Override + public boolean equals(final Object other) { + if (this == other) { + return true; + } + if (other == null || getClass() != other.getClass()) { + return false; + } + final HashCodeKey that = (HashCodeKey) other; + return hashCodeValue == that.hashCodeValue; + } + + @Override + public int hashCode() { + return hashCodeValue; + } + + @Override + public int compareTo(final HashCodeKey hashCodeKey) { + return Integer.compare(this.hashCodeValue, hashCodeKey.getHashCodeValue()); } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE); } } diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategyTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategyTest.java index c62fbb9bc..f5a46c0b5 100644 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategyTest.java +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategyTest.java @@ -17,12 +17,10 @@ package io.aiven.kafka.connect.common.source.task; import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy; import java.util.ArrayList; import java.util.List; - -import org.apache.kafka.common.config.ConfigException; +import java.util.Optional; import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; @@ -31,22 +29,12 @@ import org.junit.jupiter.params.provider.CsvSource; final class PartitionDistributionStrategyTest { - + final DistributionType 
strategy = DistributionType.PARTITION; @Test void partitionInFileNameDefaultAivenS3Sink() { - final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(2); - assertThat(taskDistribution.isPartOfTask(1, "logs-1-00112.gz", - FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"))).isTrue(); - } - - @Test - void partitionLocationNotSetExpectException() { - assertThatThrownBy(() -> new PartitionDistributionStrategy(2).isPartOfTask(1, "", - FilePatternUtils.configurePattern("logs-23-<partition>-<start_offset>"))) - .isInstanceOf(ConfigException.class) - .hasMessage( - "Source name format logs-23-<partition>-<start_offset> missing partition pattern {{partition}} please configure the expected source to include the partition pattern."); - + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(2); + final Context<String> ctx = getContext("{{topic}}-{{partition}}-{{start_offset}}", "logs-1-00112.gz"); + assertThat(taskDistribution.getTaskFor(ctx)).isEqualTo(1); } @ParameterizedTest(name = "[{index}] Pattern: {0}, Filename: {1}") @@ -62,10 +50,10 @@ void partitionLocationNotSetExpectException() { "{{topic}}-{{partition}},DEV_team_1-00112.gz", "{{topic}}-{{partition}}-{{start_offset}},timeseries-1-00112.gz" }) void testPartitionFileNamesAndExpectedOutcomes(final String configuredFilenamePattern, final String filename) { - final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(1); + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(1); // This test is testing the filename matching not the task allocation. - assertThat(taskDistribution.isPartOfTask(0, filename, - FilePatternUtils.configurePattern(configuredFilenamePattern))).isTrue(); + final Context<String> ctx = getContext(configuredFilenamePattern, filename); + assertThat(taskDistribution.getTaskFor(ctx)).isEqualTo(0); } @ParameterizedTest(name = "[{index}] Pattern: {0}, Filename: {1}") @@ -73,13 +61,11 @@ void testPartitionFileNamesAndExpectedOutcomes(final String configuredFilenamePa "no-seperator-in-date-partition-offset-{{timestamp}}-{{partition}}-{{start_offset}},no-seperator-in-date-partition-offset-202420220201100112.gz", "logs-2024-{{timestamp}}-{{partition}}-{{start_offset}},logs-20201-1-00112.gz", "logs-2024-{{timestamp}}{{partition}}-{{start_offset}},logs-202011-00112.gz", - "logs-2024-{{timestamp}}{{partition}}-{{start_offset}}, ", "logs-2023-{{partition}}-{{start_offset}},logs-2023-one-00112.gz" }) void expectFalseOnMalformedFilenames(final String configuredFilenamePattern, final String filename) { - final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(1); // This test is testing the filename matching not the task allocation. 
- assertThat(taskDistribution.isPartOfTask(0, filename, - FilePatternUtils.configurePattern(configuredFilenamePattern))).isFalse(); + final Optional<Context<String>> ctx = getOptionalContext(configuredFilenamePattern, filename); + assertThat(ctx).isEmpty(); } @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1}, Filename: {1}") @@ -90,10 +76,9 @@ void expectFalseOnMalformedFilenames(final String configuredFilenamePattern, fin "8,10,topics/logs/8/logs-8-0002.txt", "9,10,topics/logs/9/logs-9-0002.txt" }) void checkCorrectDistributionAcrossTasksOnFileName(final int taskId, final int maxTasks, final String path) { - final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTasks); - - assertThat(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern("logs-{{partition}}-{{start_offset}}"))).isTrue(); + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final Context<String> ctx = getContext("logs-{{partition}}-{{start_offset}}", path); + assertThat(taskDistribution.getTaskFor(ctx)).isEqualTo(taskId); } @ParameterizedTest(name = "[{index}] MaxTasks: {0}, Filename: {1}") @@ -103,14 +88,14 @@ void checkCorrectDistributionAcrossTasksOnFileName(final int taskId, final int m "10,topics/logs/9/logs-0002.txt" }) void filenameDistributionExactlyOnceDistribution(final int maxTasks, final String path) { - final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTasks); - final List<Boolean> results = new ArrayList<>(); + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final List<Integer> results = new ArrayList<>(); + final Context<String> ctx = getContext("logs-{{partition}}.txt", path); for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern("logs-{{partition}}.txt"))); + results.add(taskDistribution.getTaskFor(ctx)); } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); + // TODO Double check this, they should all match the first task. 
+ assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); } @ParameterizedTest(name = "[{index}] MaxTasks: {0}, TaskId: {1}, Filename: {2}") @@ -121,179 +106,123 @@ void filenameDistributionExactlyOnceDistribution(final int maxTasks, final Strin void filenameDistributionExactlyOnceDistributionWithTaskReconfiguration(final int maxTasks, final int maxTaskAfterReConfig, final String path) { - final String expectedSourceNameFormat = "logs-{{partition}}.txt"; - final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTasks); - final List<Boolean> results = new ArrayList<>(); + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final Context<String> ctx = getContext("logs-{{partition}}.txt", path); + + final List<Integer> results = new ArrayList<>(); for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern(expectedSourceNameFormat))); + results.add(taskDistribution.getTaskFor(ctx)); } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); taskDistribution.configureDistributionStrategy(maxTaskAfterReConfig); results.clear(); - for (int taskId = 0; taskId < maxTaskAfterReConfig; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern(expectedSourceNameFormat))); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE); + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); } @ParameterizedTest - @CsvSource({ - "{topic}}-1.txt,'Source name format {topic}}-1.txt missing partition pattern {{partition}} please configure the expected source to include the partition pattern.'", - " ,'Source name format null missing partition pattern {{partition}} please configure the expected source to include the partition pattern.'", - "empty-pattern,'Source name format empty-pattern missing partition pattern {{partition}} please configure the expected source to include the partition pattern.'" }) - void malformedFilenameSetup(final String expectedSourceFormat, final String expectedErrorMessage) { - final int maxTaskId = 1; - assertThatThrownBy(() -> new PartitionDistributionStrategy(maxTaskId).isPartOfTask(1, "", - FilePatternUtils.configurePattern(expectedSourceFormat))).isInstanceOf(ConfigException.class) - .hasMessage(expectedErrorMessage); - } - - @Test - void errorExpectedNullGivenForSourceNameFormat() { - final int maxTaskId = 1; - assertThatThrownBy(() -> new PartitionDistributionStrategy(maxTaskId).isPartOfTask(1, "", - FilePatternUtils.configurePattern(null))).isInstanceOf(ConfigException.class) - .hasMessage("Source name format null missing partition pattern {{partition}} please configure" - + " the expected source to include the partition pattern."); - } - - @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") - @CsvSource({ "0,1,topics/logs/partition=5/logs+5+0002.txt,true", - "0,4,topics/logs/partition=5/logs+5+0002.txt,false", "1,4,topics/logs/partition=5/logs+5+0002.txt,true", - "0,3,topics/logs/partition=5/logs+5+0002.txt,false", 
"0,5,topics/logs/partition=5/logs+5+0002.txt,true", - "2,3,topics/logs/partition=5/logs+5+0002.txt,true" }) - void withLeadingStringPartitionNamingConvention(final int taskId, final int maxTasks, final String path, - final boolean expectedResult) { + @CsvSource({ "10,5,topics/logs/0/logs-0002.txt", "10,5,topics/logs/1/logs-001.txt", + "10,5,topics/logs/2/logs-0002.txt", "10,5,topics/logs/3/logs-0002.txt", "10,5,topics/logs/4/logs-0002.txt", + "10,5,topics/logs/5/logs-0002.txt", "10,5,topics/logs/6/logs-0002.txt", "10,5,topics/logs/7/logs-0002.txt", + "10,5,topics/logs/8/logs-0002.txt", "10,5,topics/logs/9/logs-0002.txt" }) + void partitionPathDistributionExactlyOnceDistributionWithTaskReconfiguration(final int maxTasks, + final int maxTaskAfterReConfig, final String path) { - final PartitionDistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTasks); + final String expectedSourceNameFormat = "topics/{{topic}}/{{partition}}/.*$"; + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final Context<String> ctx = getContext(expectedSourceNameFormat, path); + final List<Integer> results = new ArrayList<>(); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + taskDistribution.configureDistributionStrategy(maxTaskAfterReConfig); - assertThat(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern("topics/{{topic}}/partition={{partition}}/.*$"))) - .isEqualTo(expectedResult); + results.clear(); + for (int taskId = 0; taskId < maxTaskAfterReConfig; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); } - @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") - @CsvSource({ "0,1,bucket/topics/topic-1/5/logs+5+0002.txt,true", - "0,4,bucket/topics/topic-1/5/logs+5+0002.txt,false", "1,4,bucket/topics/topic-1/5/logs+5+0002.txt,true", - "0,3,bucket/topics/topic-1/5/logs+5+0002.txt,false", "0,5,bucket/topics/topic-1/5/logs+5+0002.txt,true", - "2,3,bucket/topics/topic-1/5/logs+5+0002.txt,true" }) - void partitionInPathConvention(final int taskId, final int maxTaskId, final String path, - final boolean expectedResult) { - - final PartitionDistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTaskId); + @ParameterizedTest + @CsvSource({ "10,topics/logs/0/logs-0002.txt", "10,topics/logs/1/logs-001.log", "10,topics/logs/2/logs-0002.txt", + "10,topics/logs/3/logs-0002.txt", "10,topics/logs/4/logs-0002.txt", "10,topics/logs/5/logs-0002.txt", + "10,topics/logs/6/logs-0002.txt", "10,topics/logs/7/logs-0002.txt", "10,topics/logs/8/logs-0002.txt", + "10,topics/logs/9/logs-0002.txt" }) + void partitionPathDistributionExactlyOnceDistribution(final int maxTasks, final String path) { + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final List<Integer> results = new ArrayList<>(); + final Context<String> ctx = getContext("topics/{{topic}}/{{partition}}/.*$", path); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + } - assertThat(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern("bucket/topics/{{topic}}/{{partition}}/.*$"))) - .isEqualTo(expectedResult); + @Test + 
void expectEmptyContextOnNonIntPartitionSuppliedAsNoMatchOccurs() { + final String path = "topics/logs/one/test-001.txt"; + final Optional<Context<String>> ctx = getOptionalContext("topics/{{topic}}/{{partition}}/.*$", path); + assertThat(ctx).isEmpty(); } + @ParameterizedTest(name = "[{index}] Filename: {2}") + @CsvSource({ "topcs/logs/0/logs-0002.txt", "topics/logs/1", "S3/logs/2/logs-0002.txt", + "topicss/log/3/logs-0002.txt", "prod/logs/4/logs-0002.txt", "misspelt/logs/5/logs-0002.txt", + "test/logs/6/logs-0002.txt", "random/logs/7/logs-0002.txt", "DEV/logs/8/logs-0002.txt", + "poll/logs/9/logs-0002.txt" }) + void expectNoMatchOnUnconfiguredPaths(final String path) { + final Optional<Context<String>> ctx = getOptionalContext("topics/{{topic}}/{{partition}}/.*$", path); + assertThat(ctx).isEmpty(); + } @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") @CsvSource({ "0,10,topics/logs/0/logs-0002.txt", "1,10,topics/logs/1/logs-0002.txt", "2,10,topics/logs/2/logs-0002.txt", "3,10,topics/logs/3/logs-0002.txt", "4,10,topics/logs/4/logs-0002.txt", "5,10,topics/logs/5/logs-0002.txt", "6,10,topics/logs/6/logs-0002.txt", "7,10,topics/logs/7/logs-0002.txt", "8,10,topics/logs/8/logs-0002.txt", "9,10,topics/logs/9/logs-0002.txt" }) void checkCorrectDistributionAcrossTasks(final int taskId, final int maxTaskId, final String path) { - - final PartitionDistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTaskId); - - assertThat(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern("topics/{{topic}}/{{partition}}/.*$"))).isTrue(); - } - - @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") - @CsvSource({ "1,10,topcs/logs/0/logs-0002.txt", "2,10,topics/logs/1", "3,10,S3/logs/2/logs-0002.txt", - "4,10,topics/log/3/logs-0002.txt", "5,10,prod/logs/4/logs-0002.txt", "6,10,misspelt/logs/5/logs-0002.txt", - "7,10,test/logs/6/logs-0002.txt", "8,10,random/logs/7/logs-0002.txt", "9,10,DEV/logs/8/logs-0002.txt", - "10,10,poll/logs/9/logs-0002.txt" }) - void expectNoMatchOnUnconfiguredPaths(final int taskId, final int maxTaskId, final String path) { - - final PartitionDistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTaskId); - - assertThat(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern("topics/{{topic}}/{{partition}}/.*$"))).isFalse(); + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTaskId); + final Context<String> ctx = getContext("topics/{{topic}}/{{partition}}/.*$", path); + assertThat(taskDistribution.getTaskFor(ctx)).isEqualTo(taskId); } - @Test - void expectExceptionOnNonIntPartitionSupplied() { - final int taskId = 1; - final int maxTaskId = 1; - final String path = "topics/logs/one/test-001.txt"; + @ParameterizedTest(name = "[{index}] MaxTasks: {1} Filename: {2}") + @CsvSource({ "1,bucket/topics/topic-1/5/logs+5+0002.txt,0", "4,bucket/topics/topic-1/5/logs+5+0002.txt,1", + "4,bucket/topics/topic-1/5/logs+5+0002.txt,1", "3,bucket/topics/topic-1/5/logs+5+0002.txt,2", + "5,bucket/topics/topic-1/5/logs+5+0002.txt,0", "3,bucket/topics/topic-1/5/logs+5+0002.txt,2" }) + void partitionInPathConvention(final int maxTaskId, final String path, final int expectedResult) { - final PartitionDistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTaskId); - assertThat(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern("topics/{{topic}}/{{partition}}/.*$"))).isFalse(); + 
final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTaskId); + final Context<String> ctx = getContext("bucket/topics/{{topic}}/{{partition}}/.*$", path); + assertThat(taskDistribution.getTaskFor(ctx)).isEqualTo(expectedResult); } - @Test - void malformedRegexSetup() { - final int maxTaskId = 1; - - assertThatThrownBy(() -> new PartitionDistributionStrategy(maxTaskId).isPartOfTask(1, "", - FilePatternUtils.configurePattern("topics/{{topic}}/"))).isInstanceOf(ConfigException.class) - .hasMessage( - "Source name format topics/{{topic}}/ missing partition pattern {{partition}} please configure the expected source to include the partition pattern."); - } + @ParameterizedTest(name = "[{index}] MaxTasks: {1} Filename: {2}") + @CsvSource({ "1,topics/logs/partition=5/logs+5+0002.txt,0", "4,topics/logs/partition=5/logs+5+0002.txt,1", + "4,topics/logs/partition=5/logs+5+0002.txt,1", "3,topics/logs/partition=5/logs+5+0002.txt,2", + "5,topics/logs/partition=5/logs+5+0002.txt,0", "3,topics/logs/partition=5/logs+5+0002.txt,2" }) + void withLeadingStringPartitionNamingConvention(final int maxTasks, final String path, final int expectedResult) { - @ParameterizedTest - @CsvSource({ - ",Source name format null missing partition pattern {{partition}} please configure the expected source to include the partition pattern.", - "@adsfs,Source name format @adsfs missing partition pattern {{partition}} please configure the expected source to include the partition pattern.", - "empty-path,Source name format empty-path missing partition pattern {{partition}} please configure the expected source to include the partition pattern." }) - void malformedPathSetup(final String expectedPathFormat, final String expectedErrorMessage) { - final int maxTaskId = 1; + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final Context<String> ctx = getContext("topics/{{topic}}/partition={{partition}}/.*$", path); - assertThatThrownBy(() -> new PartitionDistributionStrategy(maxTaskId).isPartOfTask(1, expectedPathFormat, - FilePatternUtils.configurePattern(expectedPathFormat))).isInstanceOf(ConfigException.class) - .hasMessage(expectedErrorMessage); + assertThat(taskDistribution.getTaskFor(ctx)).isEqualTo(expectedResult); } - @ParameterizedTest - @CsvSource({ "10,topics/logs/0/logs-0002.txt", "10,topics/logs/1/logs-001.log", "10,topics/logs/2/logs-0002.txt", - "10,topics/logs/3/logs-0002.txt", "10,topics/logs/4/logs-0002.txt", "10,topics/logs/5/logs-0002.txt", - "10,topics/logs/6/logs-0002.txt", "10,topics/logs/7/logs-0002.txt", "10,topics/logs/8/logs-0002.txt", - "10,topics/logs/9/logs-0002.txt" }) - void partitionPathDistributionExactlyOnceDistribution(final int maxTasks, final String path) { - final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTasks); - final List<Boolean> results = new ArrayList<>(); - for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern("topics/{{topic}}/{{partition}}/.*$"))); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); + public static Context<String> getContext(final String configuredFilenamePattern, final String filename) { + final Optional<Context<String>> ctx = getOptionalContext(configuredFilenamePattern, filename); + assertThat(ctx.isPresent()).isTrue(); + 
return ctx.get(); } - @ParameterizedTest - @CsvSource({ "10,5,topics/logs/0/logs-0002.txt", "10,5,topics/logs/1/logs-001.txt", - "10,5,topics/logs/2/logs-0002.txt", "10,5,topics/logs/3/logs-0002.txt", "10,5,topics/logs/4/logs-0002.txt", - "10,5,topics/logs/5/logs-0002.txt", "10,5,topics/logs/6/logs-0002.txt", "10,5,topics/logs/7/logs-0002.txt", - "10,5,topics/logs/8/logs-0002.txt", "10,5,topics/logs/9/logs-0002.txt" }) - void partitionPathDistributionExactlyOnceDistributionWithTaskReconfiguration(final int maxTasks, - final int maxTaskAfterReConfig, final String path) { - - final String expectedSourceNameFormat = "topics/{{topic}}/{{partition}}/.*$"; - final DistributionStrategy taskDistribution = new PartitionDistributionStrategy(maxTasks); - final List<Boolean> results = new ArrayList<>(); - for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern(expectedSourceNameFormat))); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); - taskDistribution.configureDistributionStrategy(maxTaskAfterReConfig); - - results.clear(); - for (int taskId = 0; taskId < maxTaskAfterReConfig; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path, - FilePatternUtils.configurePattern(expectedSourceNameFormat))); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE); + public static Optional<Context<String>> getOptionalContext(final String configuredFilenamePattern, + final String filename) { + final FilePatternUtils utils = new FilePatternUtils(configuredFilenamePattern); + return utils.process(filename); } } diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java index 5d95d6ebd..39a6f7f2d 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java @@ -16,6 +16,9 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.common.config.CommonConfig.MAX_TASKS; +import static io.aiven.kafka.connect.common.config.CommonConfig.TASK_ID; +import static io.aiven.kafka.connect.common.config.FileNameFragment.FILE_NAME_TEMPLATE_CONFIG; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.AVRO_VALUE_SERIALIZER; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; @@ -46,8 +49,6 @@ import io.aiven.kafka.connect.common.source.input.InputFormat; import io.aiven.kafka.connect.common.source.input.TransformerFactory; -import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; -import io.aiven.kafka.connect.common.source.task.HashDistributionStrategy; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient; @@ -108,7 +109,7 @@ private Map<String, String> getConfig(final String topics, final int maxTasks) { config.put(TARGET_TOPICS, topics); 
config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); config.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.converters.ByteArrayConverter"); - config.put("tasks.max", String.valueOf(maxTasks)); + config.put(MAX_TASKS, String.valueOf(maxTasks)); return config; } @@ -121,10 +122,14 @@ private Map<String, String> getConfig(final String topics, final int maxTasks) { @Test void sourceRecordIteratorBytesTest(final TestInfo testInfo) { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> configData = getConfig(topicName, 1); + final int maxTasks = 1; + final int taskId = 0; + final Map<String, String> configData = getConfig(topicName, maxTasks); configData.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); - + configData.put(FILE_NAME_TEMPLATE_CONFIG, "{{topic}}-{{partition}}-{{start_offset}}"); + configData.put(TASK_ID, String.valueOf(taskId)); + configData.put(MAX_TASKS, String.valueOf(maxTasks)); final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; final String testData2 = "Hello, Kafka Connect S3 Source! object 2"; @@ -153,8 +158,7 @@ void sourceRecordIteratorBytesTest(final TestInfo testInfo) { final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig); final Iterator<S3SourceRecord> sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, - TransformerFactory.getTransformer(InputFormat.BYTES), sourceClient, new HashDistributionStrategy(1), - FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"), 0); + TransformerFactory.getTransformer(InputFormat.BYTES), sourceClient); final HashSet<String> seenKeys = new HashSet<>(); while (sourceRecordIterator.hasNext()) { @@ -177,6 +181,9 @@ void sourceRecordIteratorAvroTest(final TestInfo testInfo) throws IOException { configData.put(INPUT_FORMAT_KEY, InputFormat.AVRO.getValue()); configData.put(VALUE_CONVERTER_KEY, "io.confluent.connect.avro.AvroConverter"); configData.put(AVRO_VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); + configData.put(FILE_NAME_TEMPLATE_CONFIG, "{{topic}}-{{partition}}-{{start_offset}}"); + configData.put(TASK_ID, String.valueOf(taskId)); + configData.put(MAX_TASKS, String.valueOf(maxTasks)); // Define Avro schema final String schemaJson = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" @@ -219,9 +226,7 @@ void sourceRecordIteratorAvroTest(final TestInfo testInfo) throws IOException { final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig); final Iterator<S3SourceRecord> sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, - TransformerFactory.getTransformer(InputFormat.AVRO), sourceClient, - new HashDistributionStrategy(maxTasks), - FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"), taskId); + TransformerFactory.getTransformer(InputFormat.AVRO), sourceClient); final HashSet<String> seenKeys = new HashSet<>(); final Map<String, List<Long>> seenRecords = new HashMap<>(); diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java index ad31acc88..387a6105d 100644 --- a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -16,13 +16,14 @@ package 
io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.common.config.CommonConfig.MAX_TASKS; import static io.aiven.kafka.connect.common.config.FileNameFragment.FILE_NAME_TEMPLATE_CONFIG; import static io.aiven.kafka.connect.common.config.FileNameFragment.FILE_PATH_PREFIX_TEMPLATE_CONFIG; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.AVRO_VALUE_SERIALIZER; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.VALUE_CONVERTER_SCHEMA_REGISTRY_URL; -import static io.aiven.kafka.connect.common.config.SourceConfigFragment.OBJECT_DISTRIBUTION_STRATEGY; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.DISTRIBUTION_TYPE; import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG; @@ -58,7 +59,7 @@ import org.apache.kafka.common.serialization.ByteArrayDeserializer; import io.aiven.kafka.connect.common.source.input.InputFormat; -import io.aiven.kafka.connect.common.source.task.enums.ObjectDistributionStrategy; +import io.aiven.kafka.connect.common.source.task.DistributionType; import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; @@ -156,21 +157,21 @@ void tearDown() { @ValueSource(booleans = { true, false }) void bytesTest(final boolean addPrefix) { final var topicName = IntegrationBase.topicName(testInfo); - final ObjectDistributionStrategy objectDistributionStrategy; + final DistributionType distributionType; final int partitionId = 0; final String prefixPattern = "topics/{{topic}}/partition={{partition}}/"; String s3Prefix = ""; if (addPrefix) { - objectDistributionStrategy = ObjectDistributionStrategy.PARTITION_IN_FILENAME; + distributionType = DistributionType.PARTITION; s3Prefix = "topics/" + topicName + "/partition=" + partitionId + "/"; } else { - objectDistributionStrategy = ObjectDistributionStrategy.PARTITION_IN_FILENAME; + distributionType = DistributionType.PARTITION; } final String fileNamePatternSeparator = "_"; - final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1, objectDistributionStrategy, - addPrefix, s3Prefix, prefixPattern, fileNamePatternSeparator); + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1, distributionType, addPrefix, + s3Prefix, prefixPattern, fileNamePatternSeparator); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); @@ -212,7 +213,7 @@ void avroTest(final TestInfo testInfo) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); final boolean addPrefix = false; final Map<String, String> connectorConfig = getAvroConfig(topicName, InputFormat.AVRO, addPrefix, "", "", - ObjectDistributionStrategy.OBJECT_HASH); + DistributionType.OBJECT_HASH); connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); @@ -274,10 +275,10 @@ void parquetTest(final boolean addPrefix) throws IOException { final var topicName = IntegrationBase.topicName(testInfo); final String partition = "0"; - final ObjectDistributionStrategy 
objectDistributionStrategy; + final DistributionType distributionType; final String prefixPattern = "bucket/topics/{{topic}}/partition/{{partition}}/"; String s3Prefix = ""; - objectDistributionStrategy = ObjectDistributionStrategy.PARTITION_IN_FILENAME; + distributionType = DistributionType.PARTITION; if (addPrefix) { s3Prefix = "bucket/topics/" + topicName + "/partition/" + partition + "/"; } @@ -287,7 +288,7 @@ void parquetTest(final boolean addPrefix) throws IOException { final String name = "testuser"; final Map<String, String> connectorConfig = getAvroConfig(topicName, InputFormat.PARQUET, addPrefix, s3Prefix, - prefixPattern, objectDistributionStrategy); + prefixPattern, distributionType); connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); final Path path = ContentUtils.getTmpFilePath(name); @@ -311,9 +312,9 @@ void parquetTest(final boolean addPrefix) throws IOException { private Map<String, String> getAvroConfig(final String topicName, final InputFormat inputFormat, final boolean addPrefix, final String s3Prefix, final String prefixPattern, - final ObjectDistributionStrategy objectDistributionStrategy) { - final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 4, objectDistributionStrategy, - addPrefix, s3Prefix, prefixPattern, "-"); + final DistributionType distributionType) { + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 4, distributionType, addPrefix, + s3Prefix, prefixPattern, "-"); connectorConfig.put(INPUT_FORMAT_KEY, inputFormat.getValue()); connectorConfig.put(SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); connectorConfig.put(VALUE_CONVERTER_KEY, "io.confluent.connect.avro.AvroConverter"); @@ -325,8 +326,8 @@ private Map<String, String> getAvroConfig(final String topicName, final InputFor @Test void jsonTest(final TestInfo testInfo) { final var topicName = IntegrationBase.topicName(testInfo); - final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1, - ObjectDistributionStrategy.PARTITION_IN_FILENAME, false, "", "", "-"); + final Map<String, String> connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1, DistributionType.PARTITION, + false, "", "", "-"); connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.JSONL.getValue()); connectorConfig.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.json.JsonConverter"); @@ -355,15 +356,15 @@ void jsonTest(final TestInfo testInfo) { } private Map<String, String> getConfig(final String connectorName, final String topics, final int maxTasks, - final ObjectDistributionStrategy taskDistributionConfig, final boolean addPrefix, final String s3Prefix, + final DistributionType taskDistributionConfig, final boolean addPrefix, final String s3Prefix, final String prefixPattern, final String fileNameSeparator) { final Map<String, String> config = new HashMap<>(basicS3ConnectorConfig(addPrefix, s3Prefix)); config.put("name", connectorName); config.put(TARGET_TOPICS, topics); config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); config.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.converters.ByteArrayConverter"); - config.put("tasks.max", String.valueOf(maxTasks)); - config.put(OBJECT_DISTRIBUTION_STRATEGY, taskDistributionConfig.value()); + config.put(MAX_TASKS, String.valueOf(maxTasks)); + config.put(DISTRIBUTION_TYPE, taskDistributionConfig.value()); config.put(FILE_NAME_TEMPLATE_CONFIG, "{{topic}}" + fileNameSeparator + "{{partition}}" + fileNameSeparator + "{{start_offset}}"); if 
(addPrefix) { diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java index ca0d10a14..18d0f0adb 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java @@ -16,6 +16,8 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.common.config.CommonConfig.TASK_ID; + import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -62,7 +64,7 @@ public List<Map<String, String>> taskConfigs(final int maxTasks) { final var taskProps = new ArrayList<Map<String, String>>(); for (int i = 0; i < maxTasks; i++) { final var props = new HashMap<>(configProperties); // NOPMD - props.put("task.id", String.valueOf(i)); + props.put(TASK_ID, String.valueOf(i)); taskProps.add(props); } return taskProps; diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java index 3ed3fdafd..5466435af 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -19,18 +19,12 @@ import java.util.Iterator; import java.util.Map; import java.util.Objects; -import java.util.regex.Pattern; import org.apache.kafka.connect.source.SourceRecord; import io.aiven.kafka.connect.common.config.SourceCommonConfig; import io.aiven.kafka.connect.common.source.AbstractSourceTask; import io.aiven.kafka.connect.common.source.input.Transformer; -import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; -import io.aiven.kafka.connect.common.source.task.DistributionStrategy; -import io.aiven.kafka.connect.common.source.task.HashDistributionStrategy; -import io.aiven.kafka.connect.common.source.task.PartitionDistributionStrategy; -import io.aiven.kafka.connect.common.source.task.enums.ObjectDistributionStrategy; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient; import io.aiven.kafka.connect.s3.source.utils.OffsetManager; @@ -71,9 +65,6 @@ public class S3SourceTask extends AbstractSourceTask { private OffsetManager offsetManager; private S3SourceConfig s3SourceConfig; - private int taskId; - private Pattern filePattern; - public S3SourceTask() { super(LOGGER); } @@ -136,8 +127,8 @@ protected SourceCommonConfig configure(final Map<String, String> props) { this.transformer = s3SourceConfig.getTransformer(); offsetManager = new OffsetManager(context, s3SourceConfig); awsv2SourceClient = new AWSV2SourceClient(s3SourceConfig); - setS3SourceRecordIterator(new SourceRecordIterator(s3SourceConfig, offsetManager, this.transformer, - awsv2SourceClient, initializeObjectDistributionStrategy(), filePattern, taskId)); + setS3SourceRecordIterator( + new SourceRecordIterator(s3SourceConfig, offsetManager, this.transformer, awsv2SourceClient)); return s3SourceConfig; } @@ -179,22 +170,4 @@ public Transformer getTransformer() { return transformer; } - private DistributionStrategy initializeObjectDistributionStrategy() { - final ObjectDistributionStrategy objectDistributionStrategy = s3SourceConfig.getObjectDistributionStrategy(); - final 
int maxTasks = Integer.parseInt(s3SourceConfig.originals().get("tasks.max").toString()); - this.taskId = Integer.parseInt(s3SourceConfig.originals().get("task.id").toString()) % maxTasks; - DistributionStrategy distributionStrategy; - - if (objectDistributionStrategy == ObjectDistributionStrategy.PARTITION_IN_FILENAME) { - this.filePattern = FilePatternUtils - .configurePattern(s3SourceConfig.getS3FileNameFragment().getFilenameTemplate().toString()); - distributionStrategy = new PartitionDistributionStrategy(maxTasks); - } else { - this.filePattern = FilePatternUtils - .configurePattern(s3SourceConfig.getS3FileNameFragment().getFilenameTemplate().toString()); - distributionStrategy = new HashDistributionStrategy(maxTasks); - } - - return distributionStrategy; - } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java index 820be20aa..2eb31fff2 100644 --- a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -20,8 +20,8 @@ import java.util.Iterator; import java.util.Map; import java.util.Optional; +import java.util.function.Consumer; import java.util.function.Function; -import java.util.regex.Pattern; import java.util.stream.Stream; import org.apache.kafka.connect.data.SchemaAndValue; @@ -29,9 +29,13 @@ import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; +import io.aiven.kafka.connect.common.source.task.Context; import io.aiven.kafka.connect.common.source.task.DistributionStrategy; +import io.aiven.kafka.connect.common.source.task.DistributionType; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import software.amazon.awssdk.services.s3.model.S3Object; /** @@ -40,6 +44,7 @@ */ public final class SourceRecordIterator implements Iterator<S3SourceRecord> { public static final long BYTES_TRANSFORMATION_NUM_OF_RECS = 1L; + private static final Logger LOGGER = LoggerFactory.getLogger(SourceRecordIterator.class); private final OffsetManager offsetManager; @@ -51,20 +56,19 @@ public final class SourceRecordIterator implements Iterator<S3SourceRecord> { // At which point it will work for al our integrations. 
private final AWSV2SourceClient sourceClient; - private String topic; - private int partitionId; + private Context<String> context; private final DistributionStrategy distributionStrategy; - private final int taskId; + private int taskId; private final Iterator<S3Object> inner; private Iterator<S3SourceRecord> outer; - private final Pattern filePattern; + private FilePatternUtils filePattern; + private final Optional<String> targetTopics; public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final OffsetManager offsetManager, - final Transformer transformer, final AWSV2SourceClient sourceClient, - final DistributionStrategy distributionStrategy, final Pattern filePattern, final int taskId) { + final Transformer transformer, final AWSV2SourceClient sourceClient) { super(); this.s3SourceConfig = s3SourceConfig; this.offsetManager = offsetManager; @@ -72,13 +76,12 @@ public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final OffsetMan this.bucketName = s3SourceConfig.getAwsS3BucketName(); this.transformer = transformer; this.sourceClient = sourceClient; - this.filePattern = filePattern; - this.distributionStrategy = distributionStrategy; - this.taskId = taskId; + this.targetTopics = Optional.ofNullable(s3SourceConfig.getTargetTopics()); + this.distributionStrategy = initializeDistributionStrategy(); // Initialize predicates sourceClient.addPredicate(this::isFileMatchingPattern); - sourceClient.addPredicate(this::isFileAssignedToTask); + sourceClient.addPredicate(obj -> isFileAssignedToTask(context, taskId)); // call filters out bad file names and extracts topic/partition inner = sourceClient.getS3ObjectIterator(null); @@ -86,19 +89,16 @@ public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final OffsetMan } public boolean isFileMatchingPattern(final S3Object s3Object) { - final Optional<String> optionalTopic = FilePatternUtils.getTopic(filePattern, s3Object.key()); - final Optional<Integer> optionalPartitionId = FilePatternUtils.getPartitionId(filePattern, s3Object.key()); - - if (optionalTopic.isPresent() && optionalPartitionId.isPresent()) { - topic = optionalTopic.get(); - partitionId = optionalPartitionId.get(); + final Optional<Context<String>> optionalCtx = filePattern.process(s3Object.key()); + if (optionalCtx.isPresent()) { + context = optionalCtx.get(); return true; } return false; } - public boolean isFileAssignedToTask(final S3Object s3Object) { - return distributionStrategy.isPartOfTask(taskId, s3Object.key(), filePattern); + public boolean isFileAssignedToTask(final Context<String> ctx, final int taskId) { + return taskId == distributionStrategy.getTaskFor(ctx); } @Override @@ -127,8 +127,12 @@ public void remove() { * @return a stream of S3SourceRecords created from the input stream of the S3Object. */ private Stream<S3SourceRecord> convert(final S3Object s3Object) { - - final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(topic, partitionId, bucketName); + // Set the target topic in the context if it has been set from configuration. + if (targetTopics.isPresent()) { + overrideContextTopic(); + } + final Map<String, Object> partitionMap = ConnectUtils.getPartitionMap(context.getTopic().get(), + context.getPartition().get(), bucketName); final long recordCount = offsetManager.recordsProcessedForObjectKey(partitionMap, s3Object.key()); // Optimizing without reading stream again. 
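To make the new predicate wiring above easier to follow, here is a minimal sketch of how the pieces are assumed to compose: `new FilePatternUtils(template)` extracts a `Context` from an object key, and the `DistributionStrategy` obtained from a `DistributionType` maps that context to a task id. The class and helper names below (`TaskAssignmentSketch`, `shouldProcess`) are illustrative only, and the snippet assumes the API shapes visible in this diff rather than documenting them.

```java
import java.util.Optional;

import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils;
import io.aiven.kafka.connect.common.source.task.Context;
import io.aiven.kafka.connect.common.source.task.DistributionStrategy;
import io.aiven.kafka.connect.common.source.task.DistributionType;

public class TaskAssignmentSketch {

    /**
     * Decides whether this task owns the given object key: the key must match the
     * configured filename template, and the distribution strategy must map the
     * resulting context to this task id.
     */
    static boolean shouldProcess(final String objectKey, final String template, final int maxTasks,
            final int taskId) {
        final FilePatternUtils filePattern = new FilePatternUtils(template);
        final DistributionStrategy strategy = DistributionType.PARTITION.getDistributionStrategy(maxTasks);
        final Optional<Context<String>> ctx = filePattern.process(objectKey);
        // Keys that do not match the template are skipped outright.
        return ctx.map(c -> strategy.getTaskFor(c) == taskId).orElse(false);
    }

    public static void main(final String[] args) {
        // With 4 tasks, only the task whose id equals getTaskFor(ctx) picks up this object.
        System.out.println(shouldProcess("topics/logs/3/logs-0002.txt", "topics/{{topic}}/{{partition}}/.*$", 4, 3));
    }
}
```

A key that does not match the template yields an empty `Optional`, so it is filtered out before any task-assignment check is made, which mirrors the two predicates added to `sourceClient` in sequence.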
@@ -136,13 +140,32 @@ private Stream<S3SourceRecord> convert(final S3Object s3Object) { return Stream.empty(); } - final SchemaAndValue keyData = transformer.getKeyData(s3Object.key(), topic, s3SourceConfig); + final SchemaAndValue keyData = transformer.getKeyData(s3Object.key(), context.getTopic().get(), s3SourceConfig); return transformer - .getRecords(sourceClient.getObject(s3Object.key()), topic, partitionId, s3SourceConfig, recordCount) + .getRecords(sourceClient.getObject(s3Object.key()), context.getTopic().get(), + context.getPartition().get(), s3SourceConfig, recordCount) .map(new Mapper(partitionMap, recordCount, keyData, s3Object.key())); } + private Consumer<String> overrideContextTopic() { + if (context.getTopic().isPresent()) { + LOGGER.debug( + "Overriding topic '{}' extracted from S3 Object Key with topic '{}' from configuration 'topics'. ", + context.getTopic().get(), targetTopics.get()); + } + return context::setTopic; + } + + private DistributionStrategy initializeDistributionStrategy() { + final DistributionType distributionType = s3SourceConfig.getDistributionType(); + final int maxTasks = s3SourceConfig.getMaxTasks(); + this.taskId = s3SourceConfig.getTaskId() % maxTasks; + this.filePattern = new FilePatternUtils( + s3SourceConfig.getS3FileNameFragment().getFilenameTemplate().toString()); + return distributionType.getDistributionStrategy(maxTasks); + } + /** * maps the data from the @{link Transformer} stream to an S3SourceRecord given all the additional data required. */ @@ -175,7 +198,8 @@ public Mapper(final Map<String, Object> partitionMap, final long recordCount, fi @Override public S3SourceRecord apply(final SchemaAndValue valueData) { recordCount++; - return new S3SourceRecord(partitionMap, recordCount, topic, partitionId, objectKey, keyData, valueData); + return new S3SourceRecord(partitionMap, recordCount, context.getTopic().get(), context.getPartition().get(), + objectKey, keyData, valueData); } } } diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java index c915376c9..e7b958ab3 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -16,6 +16,8 @@ package io.aiven.kafka.connect.s3.source; +import static io.aiven.kafka.connect.common.config.CommonConfig.MAX_TASKS; +import static io.aiven.kafka.connect.common.config.CommonConfig.TASK_ID; import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; @@ -173,8 +175,8 @@ private void setBasicProperties() { properties.putIfAbsent("name", "test_source_connector"); properties.putIfAbsent("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); properties.putIfAbsent("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); - properties.putIfAbsent("tasks.max", "1"); - properties.put("task.id", "1"); + properties.putIfAbsent(MAX_TASKS, "1"); + properties.put(TASK_ID, "1"); properties.putIfAbsent("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); properties.putIfAbsent(TARGET_TOPIC_PARTITIONS, "0,1"); properties.putIfAbsent(TARGET_TOPICS, "testtopic"); diff --git 
a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java index f7559ddfd..e5e8ad613 100644 --- a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -16,10 +16,10 @@ package io.aiven.kafka.connect.s3.source.utils; -import static io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils.PATTERN_PARTITION_KEY; -import static io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils.PATTERN_TOPIC_KEY; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG; import static io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator.BYTES_TRANSFORMATION_NUM_OF_RECS; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.ArgumentMatchers.anyMap; import static org.mockito.Mockito.any; import static org.mockito.Mockito.anyInt; @@ -31,140 +31,191 @@ import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; -import java.io.ByteArrayInputStream; -import java.io.InputStream; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Queue; +import java.util.function.Consumer; import java.util.function.Predicate; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.stream.Stream; import org.apache.kafka.connect.data.SchemaAndValue; +import io.aiven.kafka.connect.common.config.FileNameFragment; import io.aiven.kafka.connect.common.source.input.AvroTransformer; import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; import io.aiven.kafka.connect.common.source.input.InputFormat; import io.aiven.kafka.connect.common.source.input.Transformer; import io.aiven.kafka.connect.common.source.input.TransformerFactory; import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; -import io.aiven.kafka.connect.common.source.task.HashDistributionStrategy; +import io.aiven.kafka.connect.common.source.task.DistributionType; +import io.aiven.kafka.connect.common.templating.Template; import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import org.apache.commons.lang3.tuple.Pair; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.CsvSource; +import software.amazon.awssdk.core.ResponseBytes; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.GetObjectRequest; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Response; import software.amazon.awssdk.services.s3.model.S3Object; - +@SuppressWarnings("PMD.ExcessiveImports") final class SourceRecordIteratorTest { private S3SourceConfig mockConfig; private OffsetManager mockOffsetManager; private Transformer mockTransformer; + private FileNameFragment mockFileNameFrag; - private AWSV2SourceClient mockSourceApiClient; + private 
AWSV2SourceClient sourceApiClient; @BeforeEach public void setUp() { mockConfig = mock(S3SourceConfig.class); mockOffsetManager = mock(OffsetManager.class); mockTransformer = mock(Transformer.class); - mockSourceApiClient = mock(AWSV2SourceClient.class); + mockFileNameFrag = mock(FileNameFragment.class); + } + + private S3SourceConfig getConfig(final Map<String, String> data) { + final Map<String, String> defaults = new HashMap<>(); + defaults.put(AWS_S3_BUCKET_NAME_CONFIG, "bucket-name"); + defaults.putAll(data); + return new S3SourceConfig(defaults); + } + + private void mockSourceConfig(final S3SourceConfig s3SourceConfig, final String filePattern, final int taskId, final int maxTasks,final String targetTopic ){ + when(s3SourceConfig.getDistributionType()).thenReturn(DistributionType.OBJECT_HASH); + when(s3SourceConfig.getTaskId()).thenReturn(taskId); + when(s3SourceConfig.getMaxTasks()).thenReturn(maxTasks); + when(s3SourceConfig.getS3FileNameFragment()).thenReturn(mockFileNameFrag); + when(mockFileNameFrag.getFilenameTemplate()).thenReturn(Template.of(filePattern)); + when(mockConfig.getTargetTopics()).thenReturn(targetTopic); } @Test void testIteratorProcessesS3Objects() throws Exception { final String key = "topic-00001-abc123.txt"; + final String filePattern = "{{topic}}-{{partition}}"; + final S3SourceConfig config = getConfig(Collections.emptyMap()); + final S3ClientBuilder builder = new S3ClientBuilder(); + sourceApiClient = new AWSV2SourceClient(builder.build(), config); - // Mock InputStream - try (InputStream mockInputStream = new ByteArrayInputStream(new byte[] {})) { - when(mockSourceApiClient.getObject(anyString())).thenReturn(() -> mockInputStream); + mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); - mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); - when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); - final Pattern filePattern = mock(Pattern.class); + mockSourceConfig(mockConfig, filePattern, 0, 1, null); - when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Collections.emptyIterator()); - Iterator<S3SourceRecord> iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, - mockSourceApiClient, new HashDistributionStrategy(1), - FilePatternUtils.configurePattern("{{topic}}-{{partition}}-{{start_offset}}"), 0); + final Iterator<S3SourceRecord> iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, sourceApiClient); + assertThat(iterator).isExhausted(); - assertThat(iterator.hasNext()).isFalse(); - mockPatternMatcher(filePattern); + builder.reset().addObject(key, "Hello World").endOfBlock(); + sourceApiClient = new AWSV2SourceClient(builder.build(), config); + final Iterator<S3SourceRecord> s3ObjectIterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, sourceApiClient); - final S3Object obj = S3Object.builder().key(key).build(); + assertThat(s3ObjectIterator).hasNext(); + assertThat(s3ObjectIterator.next()).isNotNull(); + assertThat(s3ObjectIterator).isExhausted(); - final ByteArrayInputStream bais = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8)); - when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(obj).iterator()); - when(mockSourceApiClient.getObject(any())).thenReturn(() -> bais); - iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, 
mockSourceApiClient, - new HashDistributionStrategy(1), filePattern, 0); + } + + @Test + void testIteratorExpectExceptionWhenGetsContextWithNoTopic() throws Exception { + + final String key = "topic-00001-abc123.txt"; + final String filePattern = "{{partition}}"; + final S3SourceConfig config = getConfig(Collections.emptyMap()); + final S3ClientBuilder builder = new S3ClientBuilder(); + sourceApiClient = new AWSV2SourceClient(builder.build(), config); + + mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); + + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + + mockSourceConfig(mockConfig, filePattern, 0, 1, null); + + final Iterator<S3SourceRecord> iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, sourceApiClient); + assertThat(iterator).isExhausted(); + + builder.reset().addObject(key, "Hello World").endOfBlock(); + sourceApiClient = new AWSV2SourceClient(builder.build(), config); + final Iterator<S3SourceRecord> s3ObjectIterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, sourceApiClient); + + assertThatThrownBy(s3ObjectIterator::hasNext).isInstanceOf(NoSuchElementException.class) + .hasMessage("No value present"); - assertThat(iterator.hasNext()).isTrue(); - assertThat(iterator.next()).isNotNull(); - } } @Test void testIteratorProcessesS3ObjectsForByteArrayTransformer() throws Exception { final String key = "topic-00001-abc123.txt"; - final S3Object s3Object = S3Object.builder().key(key).build(); + final String filePattern = "{{topic}}-{{partition}}"; + + final S3SourceConfig config = getConfig(Collections.emptyMap()); + final S3ClientBuilder builder = new S3ClientBuilder(); + + builder.reset().addObject(key, "Hello World").endOfBlock(); + sourceApiClient = new AWSV2SourceClient(builder.build(), config); + + mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); + + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + + mockSourceConfig(mockConfig, filePattern, 0, 1, null); // With ByteArrayTransformer - try (InputStream inputStream = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8))) { - when(mockSourceApiClient.getObject(key)).thenReturn(() -> inputStream); - final Pattern filePattern = mock(Pattern.class); - - when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(s3Object).iterator()); - - mockTransformer = mock(ByteArrayTransformer.class); - when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) - .thenReturn(Stream.of(SchemaAndValue.NULL)); - - when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); - - when(mockSourceApiClient.getListOfObjectKeys(any())) - .thenReturn(Collections.singletonList(key).listIterator()); - when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) - .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); - mockPatternMatcher(filePattern); - - // should skip if any records were produced by source record iterator. 
- final Iterator<S3SourceRecord> iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, - mockTransformer, mockSourceApiClient, new HashDistributionStrategy(1), filePattern, 0); - assertThat(iterator.hasNext()).isFalse(); - verify(mockSourceApiClient, never()).getObject(any()); - verify(mockTransformer, never()).getRecords(any(), anyString(), anyInt(), any(), anyLong()); - } + + mockTransformer = mock(ByteArrayTransformer.class); + when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) + .thenReturn(Stream.of(SchemaAndValue.NULL)); + + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + + when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) + .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); + + // should skip if any records were produced by source record iterator. + final Iterator<S3SourceRecord> byteArrayIterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, sourceApiClient); + + assertThat(byteArrayIterator).isExhausted(); + + verify(mockTransformer, never()).getRecords(any(), anyString(), anyInt(), any(), anyLong()); // With AvroTransformer - try (InputStream inputStream = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8))) { - when(mockSourceApiClient.getObject(key)).thenReturn(() -> inputStream); - final Pattern filePattern = mock(Pattern.class); - when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(s3Object).iterator()); - mockTransformer = mock(AvroTransformer.class); - when(mockSourceApiClient.getListOfObjectKeys(any())) - .thenReturn(Collections.singletonList(key).listIterator()); - - when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) - .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); - mockPatternMatcher(filePattern); - - when(mockTransformer.getKeyData(anyString(), anyString(), any())).thenReturn(SchemaAndValue.NULL); - when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) - .thenReturn(Arrays.asList(SchemaAndValue.NULL).stream()); - - final Iterator<S3SourceRecord> iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, - mockTransformer, mockSourceApiClient, new HashDistributionStrategy(1), filePattern, 0); - assertThat(iterator.hasNext()).isFalse(); - - verify(mockTransformer, times(0)).getRecords(any(), anyString(), anyInt(), any(), anyLong()); - } + + mockTransformer = mock(AvroTransformer.class); + + when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) + .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); + + when(mockTransformer.getKeyData(anyString(), anyString(), any())).thenReturn(SchemaAndValue.NULL); + when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) + .thenReturn(Arrays.asList(SchemaAndValue.NULL).stream()); + + final Iterator<S3SourceRecord> avroIterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, sourceApiClient); + assertThat(avroIterator).isExhausted(); + + verify(mockTransformer, times(0)).getRecords(any(), anyString(), anyInt(), any(), anyLong()); + } @ParameterizedTest @@ -174,52 +225,123 @@ void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdAssigned(final in mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); - final Pattern filePattern = mock(Pattern.class); - - mockPatternMatcher(filePattern); + final String key = "topic-00001-abc123.txt"; + final String 
filePattern = "{{partition}}"; + final String topic = "topic"; + final FilePatternUtils filePatternUtils = new FilePatternUtils(filePattern); + final S3SourceConfig config = getConfig(Collections.emptyMap()); + final S3ClientBuilder builder = new S3ClientBuilder(); + mockSourceConfig(mockConfig, filePattern, taskId, maxTasks, topic); final S3Object obj = S3Object.builder().key(objectKey).build(); - final ByteArrayInputStream bais = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8)); - when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(obj).iterator()); - when(mockSourceApiClient.getObject(any())).thenReturn(() -> bais); + // Build s3 Client + builder.reset().addObject(key, "Hello World").endOfBlock(); + sourceApiClient = new AWSV2SourceClient(builder.build(), config); + final SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, - mockSourceApiClient, new HashDistributionStrategy(maxTasks), filePattern, taskId); + sourceApiClient); final Predicate<S3Object> s3ObjectPredicate = s3Object -> iterator.isFileMatchingPattern(s3Object) - && iterator.isFileAssignedToTask(s3Object); + && iterator.isFileAssignedToTask(filePatternUtils.process(s3Object.key()).orElseThrow(), taskId); // Assert assertThat(s3ObjectPredicate).accepts(obj); } @ParameterizedTest - @CsvSource({ "4, 1, topic1-2-0", "4, 3, key1", "4, 0, key1", "4, 1, key2", "4, 2, key2", "4, 0, key2", "4, 1, key3", + @CsvSource({ "4, 1, topic1-2-0", "4, 3,key1", "4, 0, key1", "4, 1, key2", "4, 2, key2", "4, 0, key2", "4, 1, key3", "4, 2, key3", "4, 3, key3", "4, 0, key4", "4, 2, key4", "4, 3, key4" }) void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdUnassigned(final int maxTasks, final int taskId, final String objectKey) { mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); - final Pattern filePattern = mock(Pattern.class); - - mockPatternMatcher(filePattern); + final String filePattern = "{{partition}}"; + final String topic = "topic"; + mockSourceConfig(mockConfig, filePattern, taskId, maxTasks, topic); + final S3ClientBuilder builder = new S3ClientBuilder(); + final S3SourceConfig config = getConfig(Collections.emptyMap()); + final FilePatternUtils filePatternUtils = new FilePatternUtils(filePattern); final S3Object obj = S3Object.builder().key(objectKey).build(); - final ByteArrayInputStream bais = new ByteArrayInputStream("Hello World".getBytes(StandardCharsets.UTF_8)); - when(mockSourceApiClient.getS3ObjectIterator(any())).thenReturn(Arrays.asList(obj).iterator()); - when(mockSourceApiClient.getObject(any())).thenReturn(() -> bais); + builder.reset().addObject(objectKey, "Hello World").endOfBlock(); + sourceApiClient = new AWSV2SourceClient(builder.build(), config); + final SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, - mockSourceApiClient, new HashDistributionStrategy(maxTasks), filePattern, taskId); + sourceApiClient); + final Predicate<S3Object> stringPredicate = s3Object -> iterator.isFileMatchingPattern(s3Object) - && iterator.isFileAssignedToTask(s3Object); + && iterator.isFileAssignedToTask(filePatternUtils.process(s3Object.key()).orElseThrow(), taskId); // Assert assertThat(stringPredicate.test(obj)).as("Predicate should accept the objectKey: " + objectKey).isFalse(); } - private static void mockPatternMatcher(final Pattern filePattern) { - final Matcher fileMatcher 
= mock(Matcher.class); - when(filePattern.matcher(any())).thenReturn(fileMatcher); - when(fileMatcher.find()).thenReturn(true); - when(fileMatcher.group(PATTERN_TOPIC_KEY)).thenReturn("testtopic"); - when(fileMatcher.group(PATTERN_PARTITION_KEY)).thenReturn("0"); + @Test + void testS3ClientIteratorMock() { + final S3ClientBuilder builder = new S3ClientBuilder(); + builder.addObject("Key", "value"); + final S3Client client = builder.build(); // NOPMD is asking to close client is done so on line 254 + final ListObjectsV2Response response = client.listObjectsV2(ListObjectsV2Request.builder().build()); + client.close(); + assertThat(response.contents()).isNotEmpty(); + + sourceApiClient = new AWSV2SourceClient(builder.build(), getConfig(Collections.emptyMap())); + final Iterator<S3Object> iterator = sourceApiClient.getS3ObjectIterator(null); + assertThat(iterator.hasNext()).isTrue(); + + } + + static class S3ClientBuilder { + Queue<Pair<List<S3Object>, Map<String, byte[]>>> blocks = new LinkedList<>(); + List<S3Object> objects = new ArrayList<>(); + Map<String, byte[]> data = new HashMap<>(); + + public S3ClientBuilder addObject(final String key, final byte[] data) { + objects.add(S3Object.builder().key(key).size((long) data.length).build()); + this.data.put(key, data); + return this; + } + + public S3ClientBuilder endOfBlock() { + blocks.add(Pair.of(objects, data)); + return reset(); + } + + public S3ClientBuilder reset() { + objects = new ArrayList<>(); + data = new HashMap<>(); + return this; + } + + public S3ClientBuilder addObject(final String key, final String data) { + return addObject(key, data.getBytes(StandardCharsets.UTF_8)); + } + + private ResponseBytes getResponse(final String key) { + return ResponseBytes.fromByteArray(new byte[0], data.get(key)); + } + + private ListObjectsV2Response dequeueData() { + if (blocks.isEmpty()) { + objects = Collections.emptyList(); + data = Collections.emptyMap(); + } else { + final Pair<List<S3Object>, Map<String, byte[]>> pair = blocks.remove(); + objects = pair.getLeft(); + data = pair.getRight(); + } + return ListObjectsV2Response.builder().contents(objects).isTruncated(false).build(); + } + + public S3Client build() { + if (!objects.isEmpty()) { + endOfBlock(); + } + final S3Client result = mock(S3Client.class); + when(result.listObjectsV2(any(ListObjectsV2Request.class))).thenAnswer(env -> dequeueData()); + when(result.listObjectsV2(any(Consumer.class))).thenAnswer(env -> dequeueData()); + when(result.getObjectAsBytes(any(GetObjectRequest.class))) + .thenAnswer(env -> getResponse(env.getArgument(0, GetObjectRequest.class).key())); + return result; + } } } From 1ec8ed77cf43fe92279aea045edd2002875cccf1 Mon Sep 17 00:00:00 2001 From: Claude Warren <claude.warren@aiven.io> Date: Mon, 20 Jan 2025 13:32:53 +0000 Subject: [PATCH 90/90] Attempt to fix backoff testing issue (#389) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Attempt to fix the timing issue with the backoff delay in testing. 
---------

Co-authored-by: Claude <claude.warren@aiven.io>
---
 .../common/source/AbstractSourceTaskTest.java | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java
index 9b3a581eb..92fbddf46 100644
--- a/commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java
+++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java
@@ -28,6 +28,11 @@ class AbstractSourceTaskTest {
 
+    /**
+     * The amount of extra time that we will allow for timing errors.
+     */
+    private static final long TIMING_DELTA_MS = 250;
+
     @Test
     void timerTest() {
         final AbstractSourceTask.Timer timer = new AbstractSourceTask.Timer(Duration.ofSeconds(1));
@@ -92,7 +97,8 @@ void backoffTest() throws InterruptedException {
         backoff.delay();
         stopWatch.stop();
         assertThat(stopWatch.getTime()).as("Result without timer running")
-                .isBetween(estimatedDelay - backoff.getMaxJitter(), estimatedDelay + backoff.getMaxJitter());
+                .isBetween(estimatedDelay - backoff.getMaxJitter() - TIMING_DELTA_MS,
+                        estimatedDelay + backoff.getMaxJitter() + TIMING_DELTA_MS);
 
         timer.start();
         for (int i = 0; i < 9; i++) {
@@ -109,8 +115,8 @@ void backoffTest() throws InterruptedException {
             final int step = i;
             if (!timer.isExpired()) {
                 assertThat(stopWatch.getTime()).as(() -> String.format("Result with timer running at step %s", step))
-                        .isBetween(Duration.ofSeconds(1).toMillis() - backoff.getMaxJitter(),
-                                Duration.ofSeconds(1).toMillis() + backoff.getMaxJitter());
+                        .isBetween(Duration.ofSeconds(1).toMillis() - backoff.getMaxJitter() - TIMING_DELTA_MS,
+                                Duration.ofSeconds(1).toMillis() + backoff.getMaxJitter() + TIMING_DELTA_MS);
             }
         }
     }
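For readers skimming this last patch: the change simply widens the accepted timing window by a fixed slack on both sides of the expected delay, so scheduler jitter no longer fails the assertion. Below is a minimal, self-contained sketch of that bound check; the numbers and the name `withinTolerance` are illustrative, not taken from the test.

```java
public class BackoffWindowSketch {

    /** Accepts a measured delay if it falls within expected +/- (jitter + timing slack). */
    static boolean withinTolerance(final long measuredMs, final long expectedMs, final long maxJitterMs,
            final long timingDeltaMs) {
        final long lowerBound = expectedMs - maxJitterMs - timingDeltaMs;
        final long upperBound = expectedMs + maxJitterMs + timingDeltaMs;
        return measuredMs >= lowerBound && measuredMs <= upperBound;
    }

    public static void main(final String[] args) {
        // e.g. an expected 1000 ms delay, 500 ms of jitter, and a 250 ms slack like the one added above
        System.out.println(withinTolerance(1320, 1000, 500, 250)); // true: inside [250, 1750]
        System.out.println(withinTolerance(1800, 1000, 500, 250)); // false: above the upper bound
    }
}
```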