diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index b6a7393ee..9a5db94e7 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -13,10 +13,14 @@ name: "CodeQL" on: push: - branches: [main] + branches: + - main + - s3-source-release pull_request: # The branches below must be a subset of the branches above - branches: [main] + branches: + - main + - s3-source-release schedule: - cron: "42 20 * * 6" diff --git a/.github/workflows/main_push_workflow.yml b/.github/workflows/main_push_workflow.yml index 7db41ce21..393534842 100644 --- a/.github/workflows/main_push_workflow.yml +++ b/.github/workflows/main_push_workflow.yml @@ -2,9 +2,13 @@ name: Main and pull request checks on: push: - branches: [ main ] + branches: + - main + - s3-source-release pull_request: - branches: [ main ] + branches: + - main + - s3-source-release jobs: build: strategy: @@ -30,4 +34,4 @@ jobs: run: ./gradlew build test - name: Build in Linux if: runner.os == 'Linux' - run: ./gradlew build check test integrationTest + run: ./gradlew build check test integrationTest -i diff --git a/README.md b/README.md index b8bd950e8..b8f0ff2e2 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ - [Aiven GCS Sink Connector](./gcs-sink-connector/README.md) - [Aiven S3 Sink Connector](./s3-sink-connector/README.md) - [Aiven Azure Blob Sink Connector](./azure-sink-connector/README.md) +- [Aiven S3 Source Connector](./s3-source-connector/README.md) # Development diff --git a/commons/build.gradle.kts b/commons/build.gradle.kts index 9bdc06b78..101ef8db9 100644 --- a/commons/build.gradle.kts +++ b/commons/build.gradle.kts @@ -27,7 +27,7 @@ dependencies { implementation(confluent.kafka.connect.avro.data) { exclude(group = "org.apache.kafka", module = "kafka-clients") } - + implementation("commons-io:commons-io:2.18.0") implementation(tools.spotbugs.annotations) implementation(compressionlibs.snappy) implementation(compressionlibs.zstd.jni) @@ -41,6 +41,7 @@ dependencies { exclude(group = "org.slf4j", module = "slf4j-api") exclude(group = "org.apache.avro", module = "avro") } + implementation(apache.hadoop.common) { exclude(group = "org.apache.hadoop.thirdparty", module = "hadoop-shaded-protobuf_3_7") exclude(group = "com.google.guava", module = "guava") @@ -86,11 +87,12 @@ dependencies { testImplementation(jackson.databind) testImplementation(testinglibs.mockito.core) testImplementation(testinglibs.assertj.core) + testImplementation(testinglibs.awaitility) testImplementation(testFixtures(project(":commons"))) - testImplementation(testinglibs.woodstox.stax2.api) testImplementation(apache.hadoop.mapreduce.client.core) testImplementation(confluent.kafka.connect.avro.converter) + testImplementation("org.mockito:mockito-junit-jupiter:5.14.2") testRuntimeOnly(testinglibs.junit.jupiter.engine) testRuntimeOnly(logginglibs.logback.classic) diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/CommonConfig.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/CommonConfig.java index 8c4683a34..0242d40b7 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/CommonConfig.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/CommonConfig.java @@ -27,6 +27,8 @@ public class CommonConfig extends AbstractConfig { protected static final String GROUP_COMPRESSION = "File Compression"; protected static final String GROUP_FORMAT = "Format"; + public static final String TASK_ID = "task.id"; + public static final 
String MAX_TASKS = "tasks.max"; /** * @deprecated No longer needed. @@ -58,4 +60,25 @@ public Long getKafkaRetryBackoffMs() { return new BackoffPolicyConfig(this).getKafkaRetryBackoffMs(); } + /** + * + * Get the maximum number of tasks that should be run by this connector configuration Max Tasks is set within the + * Kafka Connect framework and so is retrieved slightly differently in ConnectorConfig.java + * + * @return The maximum number of tasks that should be run by this connector configuration + */ + public int getMaxTasks() { + // TODO when Connect framework is upgraded it will be possible to retrieve this information from the configDef + // as tasksMax + return Integer.parseInt(this.originalsStrings().get(MAX_TASKS)); + } + /** + * Get the task id for this configuration + * + * @return The task id for this configuration + */ + public int getTaskId() { + return Integer.parseInt(this.originalsStrings().get(TASK_ID)); + } + } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/FileNameFragment.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/FileNameFragment.java index 8d3156e22..467ea2cb2 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/FileNameFragment.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/FileNameFragment.java @@ -43,9 +43,12 @@ public final class FileNameFragment extends ConfigFragment { static final String FILE_MAX_RECORDS = "file.max.records"; static final String FILE_NAME_TIMESTAMP_TIMEZONE = "file.name.timestamp.timezone"; static final String FILE_NAME_TIMESTAMP_SOURCE = "file.name.timestamp.source"; - static final String FILE_NAME_TEMPLATE_CONFIG = "file.name.template"; + public static final String FILE_NAME_TEMPLATE_CONFIG = "file.name.template"; static final String DEFAULT_FILENAME_TEMPLATE = "{{topic}}-{{partition}}-{{start_offset}}"; + public static final String FILE_PATH_PREFIX_TEMPLATE_CONFIG = "file.prefix.template"; + static final String DEFAULT_FILE_PATH_PREFIX_TEMPLATE = "topics/{{topic}}/partition={{partition}}/"; + public FileNameFragment(final AbstractConfig cfg) { super(cfg); } @@ -109,9 +112,18 @@ public void ensureValid(final String name, final Object value) { configDef.define(FILE_NAME_TIMESTAMP_SOURCE, ConfigDef.Type.STRING, TimestampSource.Type.WALLCLOCK.name(), new TimestampSourceValidator(), ConfigDef.Importance.LOW, "Specifies the the timestamp variable source. Default is wall-clock.", GROUP_FILE, fileGroupCounter++, // NOPMD - // UnusedAssignment ConfigDef.Width.SHORT, FILE_NAME_TIMESTAMP_SOURCE); + configDef.define(FILE_PATH_PREFIX_TEMPLATE_CONFIG, ConfigDef.Type.STRING, DEFAULT_FILE_PATH_PREFIX_TEMPLATE, + new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, + "The template for file prefix on S3. " + + "Supports `{{ variable }}` placeholders for substituting variables. " + + "Currently supported variables are `topic` and `partition` " + + "and are mandatory to have these in the directory structure." 
+ + "Example prefix : topics/{{topic}}/partition/{{partition}}/", + GROUP_FILE, fileGroupCounter++, // NOPMD UnusedAssignment + ConfigDef.Width.LONG, FILE_PATH_PREFIX_TEMPLATE_CONFIG); + return configDef; } @@ -185,4 +197,8 @@ public int getMaxRecordsPerFile() { return cfg.getInt(FILE_MAX_RECORDS); } + public String getFilePathPrefixTemplateConfig() { + return cfg.getString(FILE_PATH_PREFIX_TEMPLATE_CONFIG); + } + } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SchemaRegistryFragment.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SchemaRegistryFragment.java new file mode 100644 index 000000000..8ea7b7f95 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SchemaRegistryFragment.java @@ -0,0 +1,76 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.config; + +import java.util.Locale; + +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.common.config.ConfigDef; + +import io.aiven.kafka.connect.common.source.input.InputFormat; + +public final class SchemaRegistryFragment extends ConfigFragment { + private static final String SCHEMAREGISTRY_GROUP = "Schema registry group"; + public static final String SCHEMA_REGISTRY_URL = "schema.registry.url"; + public static final String VALUE_CONVERTER_SCHEMA_REGISTRY_URL = "value.converter.schema.registry.url"; + public static final String AVRO_VALUE_SERIALIZER = "value.serializer"; + public static final String INPUT_FORMAT_KEY = "input.format"; + public static final String SCHEMAS_ENABLE = "schemas.enable"; + + /** + * Construct the ConfigFragment.. + * + * @param cfg + * the configuration that this fragment is associated with. 
+ */ + public SchemaRegistryFragment(final AbstractConfig cfg) { + super(cfg); + } + + public static ConfigDef update(final ConfigDef configDef) { + int srCounter = 0; + configDef.define(SCHEMA_REGISTRY_URL, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", SCHEMAREGISTRY_GROUP, srCounter++, + ConfigDef.Width.NONE, SCHEMA_REGISTRY_URL); + configDef.define(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, ConfigDef.Type.STRING, null, + new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, "SCHEMA REGISTRY URL", + SCHEMAREGISTRY_GROUP, srCounter++, ConfigDef.Width.NONE, VALUE_CONVERTER_SCHEMA_REGISTRY_URL); + configDef.define(INPUT_FORMAT_KEY, ConfigDef.Type.STRING, InputFormat.BYTES.getValue(), + new ConfigDef.NonEmptyString(), ConfigDef.Importance.MEDIUM, + "Input format of messages read from source avro/json/parquet/bytes", SCHEMAREGISTRY_GROUP, srCounter++, // NOPMD + ConfigDef.Width.NONE, INPUT_FORMAT_KEY); + + configDef.define(AVRO_VALUE_SERIALIZER, ConfigDef.Type.CLASS, null, ConfigDef.Importance.MEDIUM, + "Avro value serializer", SCHEMAREGISTRY_GROUP, srCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, AVRO_VALUE_SERIALIZER); + return configDef; + } + + public InputFormat getInputFormat() { + return InputFormat.valueOf(cfg.getString(INPUT_FORMAT_KEY).toUpperCase(Locale.ROOT)); + } + + public String getSchemaRegistryUrl() { + return cfg.getString(SCHEMA_REGISTRY_URL); + } + + public Class getAvroValueSerializer() { + return cfg.getClass(AVRO_VALUE_SERIALIZER); + } + +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java index e363d7c9a..68036bd68 100644 --- a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceCommonConfig.java @@ -20,8 +20,66 @@ import org.apache.kafka.common.config.ConfigDef; +import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; +import io.aiven.kafka.connect.common.source.input.InputFormat; +import io.aiven.kafka.connect.common.source.input.Transformer; +import io.aiven.kafka.connect.common.source.input.TransformerFactory; +import io.aiven.kafka.connect.common.source.task.DistributionType; + public class SourceCommonConfig extends CommonConfig { + + private final SchemaRegistryFragment schemaRegistryFragment; + private final SourceConfigFragment sourceConfigFragment; + private final FileNameFragment fileNameFragment; + private final OutputFormatFragment outputFormatFragment; + public SourceCommonConfig(ConfigDef definition, Map originals) {// NOPMD super(definition, originals); + // Construct Fragments + schemaRegistryFragment = new SchemaRegistryFragment(this); + sourceConfigFragment = new SourceConfigFragment(this); + fileNameFragment = new FileNameFragment(this); + outputFormatFragment = new OutputFormatFragment(this); + + validate(); // NOPMD ConstructorCallsOverridableMethod + } + + private void validate() { + schemaRegistryFragment.validate(); + sourceConfigFragment.validate(); + fileNameFragment.validate(); + outputFormatFragment.validate(); + } + + public InputFormat getInputFormat() { + return schemaRegistryFragment.getInputFormat(); + } + + public String getSchemaRegistryUrl() { + return schemaRegistryFragment.getSchemaRegistryUrl(); + } + + public String getTargetTopics() { + return sourceConfigFragment.getTargetTopics(); + } + public 
String getTargetTopicPartitions() { + return sourceConfigFragment.getTargetTopicPartitions(); + } + + public ErrorsTolerance getErrorsTolerance() { + return sourceConfigFragment.getErrorsTolerance(); + } + + public DistributionType getDistributionType() { + return sourceConfigFragment.getDistributionType(); } + + public int getMaxPollRecords() { + return sourceConfigFragment.getMaxPollRecords(); + } + + public Transformer getTransformer() { + return TransformerFactory.getTransformer(schemaRegistryFragment.getInputFormat()); + } + } diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java new file mode 100644 index 000000000..7f5d6276f --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/SourceConfigFragment.java @@ -0,0 +1,138 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.config; + +import static io.aiven.kafka.connect.common.source.task.DistributionType.OBJECT_HASH; + +import java.util.Arrays; +import java.util.stream.Collectors; + +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.common.config.ConfigDef; + +import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; +import io.aiven.kafka.connect.common.source.task.DistributionType; + +import org.apache.commons.lang3.StringUtils; + +public final class SourceConfigFragment extends ConfigFragment { + private static final String GROUP_OTHER = "OTHER_CFG"; + public static final String MAX_POLL_RECORDS = "max.poll.records"; + public static final String EXPECTED_MAX_MESSAGE_BYTES = "expected.max.message.bytes"; + private static final String GROUP_OFFSET_TOPIC = "OFFSET_TOPIC"; + public static final String TARGET_TOPIC_PARTITIONS = "topic.partitions"; + public static final String TARGET_TOPICS = "topics"; + public static final String ERRORS_TOLERANCE = "errors.tolerance"; + + public static final String DISTRIBUTION_TYPE = "distribution.type"; + + /** + * Construct the ConfigFragment.. + * + * @param cfg + * the configuration that this fragment is associated with. 
+ */ + public SourceConfigFragment(final AbstractConfig cfg) { + super(cfg); + } + + public static ConfigDef update(final ConfigDef configDef) { + int sourcePollingConfigCounter = 0; + + configDef.define(MAX_POLL_RECORDS, ConfigDef.Type.INT, 500, ConfigDef.Range.atLeast(1), + ConfigDef.Importance.MEDIUM, "Max poll records", GROUP_OTHER, sourcePollingConfigCounter++, + ConfigDef.Width.NONE, MAX_POLL_RECORDS); + // KIP-298 Error Handling in Connect + configDef.define(ERRORS_TOLERANCE, ConfigDef.Type.STRING, ErrorsTolerance.NONE.name(), + new ErrorsToleranceValidator(), ConfigDef.Importance.MEDIUM, + "Indicates to the connector what level of exceptions are allowed before the connector stops, supported values : none,all", + GROUP_OTHER, sourcePollingConfigCounter++, ConfigDef.Width.NONE, ERRORS_TOLERANCE); + + configDef.define(EXPECTED_MAX_MESSAGE_BYTES, ConfigDef.Type.INT, 1_048_588, ConfigDef.Importance.MEDIUM, + "The largest record batch size allowed by Kafka config max.message.bytes", GROUP_OTHER, + sourcePollingConfigCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, EXPECTED_MAX_MESSAGE_BYTES); + + // Offset Storage config group includes target topics + int offsetStorageGroupCounter = 0; + configDef.define(TARGET_TOPIC_PARTITIONS, ConfigDef.Type.STRING, "0", new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "eg : 0,1", GROUP_OFFSET_TOPIC, offsetStorageGroupCounter++, + ConfigDef.Width.NONE, TARGET_TOPIC_PARTITIONS); + configDef.define(TARGET_TOPICS, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "eg : connect-storage-offsets", GROUP_OFFSET_TOPIC, + offsetStorageGroupCounter++, ConfigDef.Width.NONE, TARGET_TOPICS); + configDef.define(DISTRIBUTION_TYPE, ConfigDef.Type.STRING, OBJECT_HASH.name(), + new ObjectDistributionStrategyValidator(), ConfigDef.Importance.MEDIUM, + "Based on tasks.max config and the type of strategy selected, objects are processed in distributed" + + " way by Kafka connect workers, supported values : " + + Arrays.stream(DistributionType.values()) + .map(DistributionType::value) + .collect(Collectors.joining(", ")), + GROUP_OTHER, offsetStorageGroupCounter++, ConfigDef.Width.NONE, DISTRIBUTION_TYPE); // NOPMD + // UnusedAssignment + + return configDef; + } + + public String getTargetTopics() { + return cfg.getString(TARGET_TOPICS); + } + + public String getTargetTopicPartitions() { + return cfg.getString(TARGET_TOPIC_PARTITIONS); + } + + public int getMaxPollRecords() { + return cfg.getInt(MAX_POLL_RECORDS); + } + + public int getExpectedMaxMessageBytes() { + return cfg.getInt(EXPECTED_MAX_MESSAGE_BYTES); + } + + public ErrorsTolerance getErrorsTolerance() { + return ErrorsTolerance.forName(cfg.getString(ERRORS_TOLERANCE)); + } + + public DistributionType getDistributionType() { + return DistributionType.forName(cfg.getString(DISTRIBUTION_TYPE)); + } + + private static class ErrorsToleranceValidator implements ConfigDef.Validator { + @Override + public void ensureValid(final String name, final Object value) { + final String errorsTolerance = (String) value; + if (StringUtils.isNotBlank(errorsTolerance)) { + // This will throw an Exception if not a valid value. 
+ ErrorsTolerance.forName(errorsTolerance); + } + } + } + + private static class ObjectDistributionStrategyValidator implements ConfigDef.Validator { + @Override + public void ensureValid(final String name, final Object value) { + final String objectDistributionStrategy = (String) value; + if (StringUtils.isNotBlank(objectDistributionStrategy)) { + // This will throw an Exception if not a valid value. + DistributionType.forName(objectDistributionStrategy); + } + } + } + +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/config/enums/ErrorsTolerance.java b/commons/src/main/java/io/aiven/kafka/connect/common/config/enums/ErrorsTolerance.java new file mode 100644 index 000000000..9c42c46d9 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/config/enums/ErrorsTolerance.java @@ -0,0 +1,44 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.config.enums; + +import java.util.Arrays; +import java.util.Objects; + +import org.apache.kafka.common.config.ConfigException; + +public enum ErrorsTolerance { + + NONE("none"), ALL("all"); + + private final String name; + + ErrorsTolerance(final String name) { + this.name = name; + } + + public static ErrorsTolerance forName(final String name) { + Objects.requireNonNull(name, "name cannot be null"); + for (final ErrorsTolerance errorsTolerance : ErrorsTolerance.values()) { + if (errorsTolerance.name.equalsIgnoreCase(name)) { + return errorsTolerance; + } + } + throw new ConfigException(String.format("Unknown errors.tolerance type: %s, allowed values %s ", name, + Arrays.toString(ErrorsTolerance.values()))); + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/AbstractSourceTask.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/AbstractSourceTask.java new file mode 100644 index 000000000..f55257f46 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/AbstractSourceTask.java @@ -0,0 +1,511 @@ +/* + * Copyright 2024-2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.source; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.apache.kafka.connect.source.SourceRecord; +import org.apache.kafka.connect.source.SourceTask; + +import io.aiven.kafka.connect.common.config.SourceCommonConfig; +import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; + +import org.apache.commons.lang3.time.StopWatch; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class handles extracting records from an iterator and returning them to Kafka. It uses an exponential backoff + * with jitter to reduce the number of calls to the backend when there is no data. This solution: + * + * + * + */ +public abstract class AbstractSourceTask extends SourceTask { + + public static final List NULL_RESULT = null; + + /** + * The maximum time to spend polling. This is set to 5 seconds as that is the time that is allotted to a system for + * shutdown. + */ + public static final Duration MAX_POLL_TIME = Duration.ofSeconds(5); + /** + * The boolean that indicates the connector is stopped. + */ + private final AtomicBoolean connectorStopped; + + /** + * The logger to use. Set from the class implementing AbstractSourceTask. + */ + private final Logger logger; + + /** + * The maximum number of records to put in a poll. Specified in the configuration. + */ + private int maxPollRecords; + + /** + * The Backoff implementation that executes the delay in the poll loop. + */ + private final Backoff backoff; + + private final Timer timer; + + /** + * The configuration + */ + private SourceCommonConfig config; + + private Iterator sourceRecordIterator; + + /** + * Constructor. + * + * @param logger + * the logger to use. + */ + protected AbstractSourceTask(final Logger logger) { + super(); + this.logger = logger; + connectorStopped = new AtomicBoolean(); + timer = new Timer(MAX_POLL_TIME); + backoff = new Backoff(timer.getBackoffConfig()); + } + + /** + * Gets the iterator of SourceRecords. The iterator that SourceRecords are extracted from during a poll event. When + * this iterator runs out of records it should attempt to reset and read more records from the backend on the next + * {@code hasNext()} call. In this way it should detect when new data has been added to the backend and continue + * processing. + *

+ * This method should handle any backend exception that can be retried. Any runtime exceptions that are thrown when + * this iterator executes may cause the task to abort. + *
+ * + * @param config + * the configuration for the Backoff. + * @return The iterator of SourceRecords. + */ + abstract protected Iterator getIterator(BackoffConfig config); + + /** + * Called by {@link #start} to allows the concrete implementation to configure itself based on properties. + * + * @param props + * the properties to use for configuration. + */ + abstract protected SourceCommonConfig configure(Map props); + + @Override + public final void start(final Map props) { + logger.debug("Starting"); + config = configure(props); + maxPollRecords = config.getMaxPollRecords(); + sourceRecordIterator = getIterator(timer.getBackoffConfig()); + } + + /** + * Try to add a SourceRecord to the results. + * + * @param results + * the result to add the record to. + * @param sourceRecordIterator + * the source record iterator. + * @return true if successful, false if the iterator is empty. + */ + private boolean tryAdd(final List results, final Iterator sourceRecordIterator) { + if (sourceRecordIterator.hasNext()) { + backoff.reset(); + final SourceRecord sourceRecord = sourceRecordIterator.next(); + if (logger.isDebugEnabled()) { + logger.debug("tryAdd() : read record {}", sourceRecord.sourceOffset()); + } + results.add(sourceRecord); + return true; + } + logger.info("No records found in tryAdd call"); + return false; + } + + /** + * Returns {@code true} if the connector is not stopped and the timer has not expired. + * + * @return {@code true} if the connector is not stopped and the timer has not expired. + */ + protected boolean stillPolling() { + final boolean result = !connectorStopped.get() && !timer.isExpired(); + logger.debug("Still polling: {}", result); + return result; + } + + @Override + public final List poll() { + logger.debug("Polling"); + if (connectorStopped.get()) { + logger.info("Stopping"); + closeResources(); + return NULL_RESULT; + } else { + timer.start(); + try { + final List result = populateList(); + if (logger.isDebugEnabled()) { + logger.debug("Poll() returning {} SourceRecords.", result == null ? null : result.size()); + } + return result; + } finally { + timer.stop(); + timer.reset(); + } + } + } + + /** + * Attempts to populate the return list. Will read as many records into the list as it can until the timer expires + * or the task is shut down. + * + * @return A list SourceRecords or {@code null} if the system hit a runtime exception. + */ + private List populateList() { + final List results = new ArrayList<>(); + try { + while (stillPolling() && results.size() < maxPollRecords) { + if (!tryAdd(results, sourceRecordIterator)) { + if (!results.isEmpty()) { + logger.debug("tryAdd() did not add to the list, returning current results."); + // if we could not get a record and the results are not empty return them + break; + } + logger.debug("Attempting {}", backoff); + backoff.cleanDelay(); + } + } + + } catch (RuntimeException e) { // NOPMD must catch runtime here. + logger.error("Error during poll(): {}", e.getMessage(), e); + if (config.getErrorsTolerance() == ErrorsTolerance.NONE) { + logger.error("Stopping Task"); + throw e; + } + } + return results.isEmpty() ? NULL_RESULT : results; + } + + @Override + public final void stop() { + logger.debug("Stopping"); + connectorStopped.set(true); + } + + /** + * Returns the running state of the task. + * + * @return {@code true} if the connector is running, {@code false} otherwise. + */ + public final boolean isRunning() { + return !connectorStopped.get(); + } + + /** + * Close any resources the source has open. 
Called by the IteratorRunnable when it is stopping. + */ + abstract protected void closeResources(); + + /** + * Calculates elapsed time and flags when expired. + */ + protected static class Timer extends StopWatch { + /** + * The length of time that the timer should run. + */ + private final long duration; + + /** + * The flag that indicates the timer has been aborted. + */ + private boolean hasAborted; + + /** + * Constructor. + * + * @param duration + * the length of time the timer should run. + */ + Timer(final Duration duration) { + super(); + this.duration = duration.toMillis(); + } + + /** + * Gets the maximum duration for this timer. + * + * @return the maximum duration for the timer. + */ + public long millisecondsRemaining() { + return super.isStarted() ? duration - super.getTime() : duration; + } + + /** + * Returns {@code true} if the timer has expired. + * + * @return {@code true} if the timer has expired. + */ + public boolean isExpired() { + return hasAborted || super.getTime() >= duration; + } + + /** + * Aborts the timer. Timer will report that it has expired until reset is called. + */ + public void abort() { + hasAborted = true; + } + + @Override + public void start() { + try { + hasAborted = false; + super.start(); + } catch (IllegalStateException e) { + throw new IllegalStateException("Timer: " + e.getMessage()); + } + } + + @Override + public void stop() { + try { + super.stop(); + } catch (IllegalStateException e) { + throw new IllegalStateException("Timer: " + e.getMessage()); + } + } + + @Override + public void reset() { + try { + hasAborted = false; + super.reset(); + } catch (IllegalStateException e) { + throw new IllegalStateException("Timer: " + e.getMessage()); + } + } + + /** + * Gets a Backoff Config for this timer. + * + * @return a backoff Configuration. + */ + public BackoffConfig getBackoffConfig() { + return new BackoffConfig() { + + @Override + public SupplierOfLong getSupplierOfTimeRemaining() { + return Timer.this::millisecondsRemaining; + } + + @Override + public AbortTrigger getAbortTrigger() { + return Timer.this::abort; + } + }; + } + } + + /** + * Performs a delay based on the number of successive {@link #delay()} or {@link #cleanDelay()} calls without a + * {@link #reset()}. Delay increases exponentially but never exceeds the time remaining by more than 0.512 seconds. + */ + public static class Backoff { + /** The logger to write to */ + private static final Logger LOGGER = LoggerFactory.getLogger(Backoff.class); + /** + * The maximum jitter random number. Should be a power of 2 for speed. + */ + public static final int MAX_JITTER = 1024; + + public static final int JITTER_SUBTRAHEND = MAX_JITTER / 2; + /** + * A supplier of the time remaining (in milliseconds) on the overriding timer. + */ + private final SupplierOfLong timeRemaining; + + /** + * A function to call to abort the timer. + */ + private final AbortTrigger abortTrigger; + + /** + * The maximum number of times {@link #delay()} will be called before maxWait is reached. + */ + private int maxCount; + /** + * The number of times {@link #delay()} has been called. + */ + private int waitCount; + + /** + * A random number generator to construct jitter. + */ + Random random = new Random(); + + /** + * Constructor. + * + * @param config + * The configuration for the backoff. 
+ */ + public Backoff(final BackoffConfig config) { + this.timeRemaining = config.getSupplierOfTimeRemaining(); + this.abortTrigger = config.getAbortTrigger(); + reset(); + } + + /** + * Reset the backoff time so that delay is again at the minimum. + */ + public final void reset() { + // if the reminaing time is 0 or negative the maxCount will be infinity + // so make sure that it is 0 in that case. + final long remainingTime = timeRemaining.get(); + maxCount = remainingTime < 1L ? 0 : (int) (Math.log10(remainingTime) / Math.log10(2)); + waitCount = 0; + LOGGER.debug("Reset {}", this); + } + + /** + * Handle adjustment when maxCount could not be set. + * + * @return the corrected maxCount + */ + private int getMaxCount() { + if (maxCount == 0) { + reset(); + } + return maxCount; + } + + /** + * Calculates the delay wihtout jitter. + * + * @return the number of milliseconds the delay will be. + */ + public long estimatedDelay() { + long sleepTime = timeRemaining.get(); + if (sleepTime > 0 && waitCount < maxCount) { + sleepTime = (long) Math.min(sleepTime, Math.pow(2, waitCount + 1)); + } + return sleepTime < 0 ? 0 : sleepTime; + } + + /** + * Calculates the range of jitter in milliseconds. + * + * @return the maximum jitter in milliseconds. jitter is +/- maximum jitter. + */ + public int getMaxJitter() { + return MAX_JITTER - JITTER_SUBTRAHEND; + } + + private long timeWithJitter() { + // generate approx +/- 0.512 seconds of jitter + final int jitter = random.nextInt(MAX_JITTER) - JITTER_SUBTRAHEND; + return (long) Math.pow(2, waitCount) + jitter; + } + + /** + * Delay execution based on the number of times this method has been called. + * + * @throws InterruptedException + * If any thread interrupts this thread. + */ + public void delay() throws InterruptedException { + final long sleepTime = timeRemaining.get(); + if (sleepTime > 0 && waitCount < (maxCount == 0 ? getMaxCount() : maxCount)) { + waitCount++; + final long nextSleep = timeWithJitter(); + // don't sleep negative time. Jitter can introduce negative tme. + if (nextSleep > 0) { + if (nextSleep >= sleepTime) { + LOGGER.debug("Backoff aborting timer"); + abortTrigger.apply(); + } else { + LOGGER.debug("Backoff sleepiing {}", nextSleep); + Thread.sleep(nextSleep); + } + } + } + } + + /** + * Like {@link #delay} but swallows the {@link InterruptedException}. + */ + public void cleanDelay() { + try { + delay(); + } catch (InterruptedException exception) { + // do nothing return results below + } + } + + @Override + public String toString() { + return String.format("Backoff %s/%s, %s milliseconds remaining.", waitCount, maxCount, timeRemaining.get()); + } + } + + /** + * A functional interface to return long values. + */ + @FunctionalInterface + public interface SupplierOfLong { + long get(); + } + + /** + * A functional interface that will abort the timer. After being called timer will indicate that it is expired, + * until it is reset. + */ + @FunctionalInterface + public interface AbortTrigger { + void apply(); + } + + /** + * An interface to define the Backoff configuration. Used for convenience with Timer. 
+ */ + public interface BackoffConfig { + SupplierOfLong getSupplierOfTimeRemaining(); + AbortTrigger getAbortTrigger(); + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java new file mode 100644 index 000000000..760d074d2 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/AvroTransformer.java @@ -0,0 +1,98 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.input; + +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.Map; +import java.util.function.Consumer; + +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaAndValue; + +import io.confluent.connect.avro.AvroData; +import org.apache.avro.file.DataFileStream; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumReader; +import org.apache.commons.io.function.IOSupplier; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class AvroTransformer extends Transformer { + + private final AvroData avroData; + + private static final Logger LOGGER = LoggerFactory.getLogger(AvroTransformer.class); + + AvroTransformer(final AvroData avroData) { + super(); + this.avroData = avroData; + } + + @Override + public void configureValueConverter(final Map config, final AbstractConfig sourceConfig) { + config.put(SCHEMA_REGISTRY_URL, sourceConfig.getString(SCHEMA_REGISTRY_URL)); + } + + @Override + public StreamSpliterator createSpliterator(final IOSupplier inputStreamIOSupplier, final String topic, + final int topicPartition, final AbstractConfig sourceConfig) { + return new StreamSpliterator(LOGGER, inputStreamIOSupplier) { + private DataFileStream dataFileStream; + private final DatumReader datumReader = new GenericDatumReader<>(); + + @Override + protected InputStream inputOpened(final InputStream input) throws IOException { + dataFileStream = new DataFileStream<>(input, datumReader); + return input; + } + + @Override + public void doClose() { + if (dataFileStream != null) { + try { + dataFileStream.close(); + } catch (IOException e) { + LOGGER.error("Error closing reader: {}", e.getMessage(), e); + } + } + } + + @Override + protected boolean doAdvance(final Consumer action) { + if (dataFileStream.hasNext()) { + final GenericRecord record = dataFileStream.next(); + action.accept(avroData.toConnectData(record.getSchema(), record)); + return true; + } + return false; + } + }; + } + + @Override + public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, + final AbstractConfig sourceConfig) { + return new 
SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, + ((String) cloudStorageKey).getBytes(StandardCharsets.UTF_8)); + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java new file mode 100644 index 000000000..232aaef24 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformer.java @@ -0,0 +1,85 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.input; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Map; +import java.util.function.Consumer; + +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.connect.data.SchemaAndValue; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.function.IOSupplier; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ByteArrayTransformer extends Transformer { + private static final Logger LOGGER = LoggerFactory.getLogger(ByteArrayTransformer.class); + + private static final int MAX_BUFFER_SIZE = 4096; + + @Override + public void configureValueConverter(final Map config, final AbstractConfig sourceConfig) { + // For byte array transformations, ByteArrayConverter is the converter which is the default config. + } + + @Override + public StreamSpliterator createSpliterator(final IOSupplier inputStreamIOSupplier, final String topic, + final int topicPartition, final AbstractConfig sourceConfig) { + return new StreamSpliterator(LOGGER, inputStreamIOSupplier) { + @Override + protected InputStream inputOpened(final InputStream input) { + return input; + } + + @Override + protected void doClose() { + // nothing to do. 
+ } + + @Override + protected boolean doAdvance(final Consumer action) { + final byte[] buffer = new byte[MAX_BUFFER_SIZE]; + try { + final int bytesRead = IOUtils.read(inputStream, buffer); + if (bytesRead == 0) { + return false; + } + if (bytesRead < MAX_BUFFER_SIZE) { + action.accept(new SchemaAndValue(null, Arrays.copyOf(buffer, bytesRead))); + } else { + action.accept(new SchemaAndValue(null, buffer)); + } + return true; + } catch (IOException e) { + LOGGER.error("Error trying to advance inputStream: {}", e.getMessage(), e); + return false; + } + } + }; + } + + @Override + public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, + final AbstractConfig sourceConfig) { + return new SchemaAndValue(null, ((String) cloudStorageKey).getBytes(StandardCharsets.UTF_8)); + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/InputFormat.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/InputFormat.java new file mode 100644 index 000000000..8234e2c03 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/InputFormat.java @@ -0,0 +1,38 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.input; + +import java.util.Locale; + +public enum InputFormat { + AVRO("avro"), PARQUET("parquet"), JSONL("jsonl"), BYTES("bytes"); + + private final String format; + + InputFormat(final String format) { + this.format = format; + } + + public String getValue() { + return format.toLowerCase(Locale.ROOT); + } + + @Override + public String toString() { + return format; + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java new file mode 100644 index 000000000..8069d08c1 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/JsonTransformer.java @@ -0,0 +1,105 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.source.input; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.Map; +import java.util.function.Consumer; + +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.json.JsonConverter; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.io.function.IOSupplier; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class JsonTransformer extends Transformer { + + private final JsonConverter jsonConverter; + + private static final Logger LOGGER = LoggerFactory.getLogger(JsonTransformer.class); + + final ObjectMapper objectMapper = new ObjectMapper(); + + JsonTransformer(final JsonConverter jsonConverter) { + super(); + this.jsonConverter = jsonConverter; + } + + @Override + public void configureValueConverter(final Map config, final AbstractConfig sourceConfig) { + } + + @Override + public StreamSpliterator createSpliterator(final IOSupplier inputStreamIOSupplier, final String topic, + final int topicPartition, final AbstractConfig sourceConfig) { + return new StreamSpliterator(LOGGER, inputStreamIOSupplier) { + BufferedReader reader; + + @Override + protected InputStream inputOpened(final InputStream input) throws IOException { + reader = new BufferedReader(new InputStreamReader(input, StandardCharsets.UTF_8)); + return input; + } + + @Override + public void doClose() { + if (reader != null) { + try { + reader.close(); + } catch (IOException e) { + LOGGER.error("Error closing reader: {}", e.getMessage(), e); + } + } + } + + @Override + public boolean doAdvance(final Consumer action) { + String line = null; + try { + // remove blank and empty lines. + while (StringUtils.isBlank(line)) { + line = reader.readLine(); + if (line == null) { + // end of file + return false; + } + } + line = line.trim(); + action.accept(jsonConverter.toConnectData(topic, line.getBytes(StandardCharsets.UTF_8))); + return true; + } catch (IOException e) { + LOGGER.error("Error reading input stream: {}", e.getMessage(), e); + return false; + } + } + }; + } + + @Override + public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, + final AbstractConfig sourceConfig) { + return new SchemaAndValue(null, ((String) cloudStorageKey).getBytes(StandardCharsets.UTF_8)); + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java new file mode 100644 index 000000000..2c47d5103 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/ParquetTransformer.java @@ -0,0 +1,135 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.source.input; + +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.util.Map; +import java.util.function.Consumer; + +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.connect.data.SchemaAndValue; + +import io.aiven.kafka.connect.common.source.input.parquet.LocalInputFile; + +import io.confluent.connect.avro.AvroData; +import org.apache.avro.generic.GenericRecord; +import org.apache.commons.compress.utils.IOUtils; +import org.apache.commons.io.function.IOSupplier; +import org.apache.parquet.avro.AvroParquetReader; +import org.apache.parquet.hadoop.ParquetReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ParquetTransformer extends Transformer { + + private final AvroData avroData; + + private static final Logger LOGGER = LoggerFactory.getLogger(ParquetTransformer.class); + + ParquetTransformer(final AvroData avroData) { + super(); + this.avroData = avroData; + } + + @Override + public void configureValueConverter(final Map config, final AbstractConfig sourceConfig) { + config.put(SCHEMA_REGISTRY_URL, sourceConfig.getString(SCHEMA_REGISTRY_URL)); + } + + @Override + public SchemaAndValue getKeyData(final Object cloudStorageKey, final String topic, + final AbstractConfig sourceConfig) { + return new SchemaAndValue(null, ((String) cloudStorageKey).getBytes(StandardCharsets.UTF_8)); + } + + @Override + public StreamSpliterator createSpliterator(final IOSupplier inputStreamIOSupplier, final String topic, + final int topicPartition, final AbstractConfig sourceConfig) { + + return new StreamSpliterator(LOGGER, inputStreamIOSupplier) { + + private ParquetReader reader; + private File parquetFile; + + @Override + protected InputStream inputOpened(final InputStream input) throws IOException { + final String timestamp = String.valueOf(Instant.now().toEpochMilli()); + + try { + // Create a temporary file for the Parquet data + parquetFile = File.createTempFile(topic + "_" + topicPartition + "_" + timestamp, ".parquet"); + } catch (IOException e) { + LOGGER.error("Error creating temp file for Parquet data: {}", e.getMessage(), e); + throw e; + } + + try (OutputStream outputStream = Files.newOutputStream(parquetFile.toPath())) { + IOUtils.copy(input, outputStream); // Copy input stream to temporary file + } + reader = AvroParquetReader.builder(new LocalInputFile(parquetFile.toPath())).build(); + return input; + } + + @Override + protected void doClose() { + if (reader != null) { + try { + reader.close(); // Close reader at end of file + } catch (IOException e) { + logger.error("Error closing reader: {}", e.getMessage(), e); + } + } + if (parquetFile != null) { + deleteTmpFile(parquetFile.toPath()); + } + } + + @Override + protected boolean doAdvance(final Consumer action) { + try { + final GenericRecord record = reader.read(); + if (record != null) { + action.accept(avroData.toConnectData(record.getSchema(), record)); // Pass record to the stream + return true; + } + } catch (IOException e) { + logger.error("Error reading record: {}", e.getMessage(), e); + } + return false; + } + }; + } + + static void deleteTmpFile(final Path parquetFile) { + if (Files.exists(parquetFile)) { + try { + 
Files.delete(parquetFile); + } catch (IOException e) { + LOGGER.error("Error in deleting tmp file {}", e.getMessage(), e); + } + } + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java new file mode 100644 index 000000000..09e8c0ca5 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/Transformer.java @@ -0,0 +1,183 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.input; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Map; +import java.util.Spliterator; +import java.util.function.Consumer; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.connect.data.SchemaAndValue; + +import org.apache.commons.io.function.IOSupplier; +import org.slf4j.Logger; + +public abstract class Transformer { + + public abstract void configureValueConverter(Map config, AbstractConfig sourceConfig); + + public final Stream getRecords(final IOSupplier inputStreamIOSupplier, + final String topic, final int topicPartition, final AbstractConfig sourceConfig, final long skipRecords) { + + final StreamSpliterator spliterator = createSpliterator(inputStreamIOSupplier, topic, topicPartition, + sourceConfig); + return StreamSupport.stream(spliterator, false).onClose(spliterator::close).skip(skipRecords); + } + + /** + * Creates the stream spliterator for this transformer. + * + * @param inputStreamIOSupplier + * the input stream supplier. + * @param topic + * the topic. + * @param topicPartition + * the partition. + * @param sourceConfig + * the source configuraiton. + * @return a StreamSpliterator instance. + */ + protected abstract StreamSpliterator createSpliterator(IOSupplier inputStreamIOSupplier, String topic, + int topicPartition, AbstractConfig sourceConfig); + + public abstract SchemaAndValue getKeyData(Object cloudStorageKey, String topic, AbstractConfig sourceConfig); + + /** + * A Spliterator that performs various checks on the opening/closing of the input stream. + */ + protected abstract static class StreamSpliterator implements Spliterator { + /** + * The input stream supplier. + */ + private final IOSupplier inputStreamIOSupplier; + /** + * The logger to be used by all instances of this class. This will be the Transformer logger. + */ + protected final Logger logger; + /** + * The input stream. Will be null until {@link #inputOpened} has completed. May be used for reading but should + * not be closed or otherwise made unreadable. + */ + protected InputStream inputStream; + + /** + * A flag indicate that the input stream has been closed. + */ + private boolean closed; + + /** + * Constructor. + * + * @param logger + * The logger for this Spliterator to use. 
+ * @param inputStreamIOSupplier + * the InputStream supplier + */ + protected StreamSpliterator(final Logger logger, final IOSupplier inputStreamIOSupplier) { + this.logger = logger; + this.inputStreamIOSupplier = inputStreamIOSupplier; + } + + /** + * Attempt to read the next record. If there is no record to read or an error occurred return false. If a record + * was created, call {@code action.accept()} with the record. + * + * @param action + * the Consumer to call if record is created. + * @return {@code true} if a record was processed, {@code false} otherwise. + */ + abstract protected boolean doAdvance(Consumer action); + + /** + * Method to close additional inputs if needed. + */ + abstract protected void doClose(); + + public final void close() { + doClose(); + try { + if (inputStream != null) { + inputStream.close(); + inputStream = null; // NOPMD setting null to release resources + closed = true; + } + } catch (IOException e) { + logger.error("Error trying to close inputStream: {}", e.getMessage(), e); + } + } + + /** + * Allows modification of input stream. Called immediatly after the input stream is opened. Implementations may + * modify the type of input stream by wrapping it with a specific implementation, or may create Readers from the + * input stream. The modified input stream must be returned. If a Reader or similar class is created from the + * input stream the input stream must be returned. + * + * @param input + * the input stream that was just opened. + * @return the input stream or modified input stream. + * @throws IOException + * on IO error. + */ + abstract protected InputStream inputOpened(InputStream input) throws IOException; + + @Override + public final boolean tryAdvance(final Consumer action) { + if (closed) { + return false; + } + boolean result = false; + try { + if (inputStream == null) { + try { + inputStream = inputStreamIOSupplier.get(); + inputOpened(inputStream); + } catch (IOException e) { + logger.error("Error trying to open inputStream: {}", e.getMessage(), e); + close(); + return false; + } + } + result = doAdvance(action); + } catch (RuntimeException e) { // NOPMD must catch runtime exception here. + logger.error("Error trying to advance data: {}", e.getMessage(), e); + } + if (!result) { + close(); + } + return result; + } + + @Override + public final Spliterator trySplit() { // NOPMD returning null is reqruied by API + return null; + } + + @Override + public long estimateSize() { + return Long.MAX_VALUE; + } + + @Override + public int characteristics() { + return Spliterator.ORDERED | Spliterator.NONNULL; + } + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java new file mode 100644 index 000000000..574604306 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/TransformerFactory.java @@ -0,0 +1,61 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.input; + +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMAS_ENABLE; + +import java.util.Map; + +import org.apache.kafka.connect.json.JsonConverter; + +import io.confluent.connect.avro.AvroData; + +/** + * A factory to create Transformers. + */ +public final class TransformerFactory { + /** The cache size for systems that read Avro data */ + public static final int CACHE_SIZE = 100; + + private TransformerFactory() { + // hidden + } + + /** + * Gets a configured Transformer. + * + * @param inputFormat + * The input format for the transformer. + * @return the Transformer for the specified input format. + */ + public static Transformer getTransformer(final InputFormat inputFormat) { + switch (inputFormat) { + case AVRO : + return new AvroTransformer(new AvroData(CACHE_SIZE)); + case PARQUET : + return new ParquetTransformer(new AvroData(CACHE_SIZE)); + case JSONL : + final JsonConverter jsonConverter = new JsonConverter(); + jsonConverter.configure(Map.of(SCHEMAS_ENABLE, "false"), false); + return new JsonTransformer(jsonConverter); + case BYTES : + return new ByteArrayTransformer(); + default : + throw new IllegalArgumentException("Unknown input format in configuration: " + inputFormat); + } + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/parquet/LocalInputFile.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/parquet/LocalInputFile.java new file mode 100644 index 000000000..bb1081ab2 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/parquet/LocalInputFile.java @@ -0,0 +1,103 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.input.parquet; + +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.file.Path; + +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.SeekableInputStream; + +/** + * {@code LocalInputFile} is an implementation needed by Parquet to read from local data files using + * {@link SeekableInputStream} instances. 
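As a rough sketch of how a local Parquet file might be read through this class, assuming parquet-avro's AvroParquetReader is layered on top of it (the file path is invented for illustration and this pairing is an assumption, not something this change states):

import java.nio.file.Paths;

import org.apache.avro.generic.GenericRecord;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.io.InputFile;

// Sketch only: wrap a local temporary file so Parquet can seek within it.
final InputFile inputFile = new LocalInputFile(Paths.get("/tmp/users.parquet"));
try (ParquetReader<GenericRecord> reader =
        AvroParquetReader.<GenericRecord>builder(inputFile).build()) {
    GenericRecord record;
    while ((record = reader.read()) != null) {
        // handle each record
    }
}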
+ */ +public class LocalInputFile implements InputFile { + + private final Path path; + private long length = -1; + + public LocalInputFile(final Path file) { + path = file; + } + + @Override + public long getLength() throws IOException { + if (length == -1) { + try (RandomAccessFile file = new RandomAccessFile(path.toFile(), "r")) { + length = file.length(); + } + } + return length; + } + + @Override + public SeekableInputStream newStream() throws IOException { + + return new SeekableInputStream() { + + private final RandomAccessFile randomAccessFile = new RandomAccessFile(path.toFile(), "r"); + + @Override + public int read() throws IOException { + return randomAccessFile.read(); + } + + @Override + public long getPos() throws IOException { + return randomAccessFile.getFilePointer(); + } + + @Override + public void seek(final long newPos) throws IOException { + randomAccessFile.seek(newPos); + } + + @Override + public void readFully(final byte[] bytes) throws IOException { + randomAccessFile.readFully(bytes); + } + + @Override + public void readFully(final byte[] bytes, final int start, final int len) throws IOException { + randomAccessFile.readFully(bytes, start, len); + } + + @Override + public int read(final ByteBuffer buf) throws IOException { + final byte[] buffer = new byte[buf.remaining()]; + final int code = read(buffer); + buf.put(buffer, buf.position() + buf.arrayOffset(), buf.remaining()); + return code; + } + + @Override + public void readFully(final ByteBuffer buf) throws IOException { + final byte[] buffer = new byte[buf.remaining()]; + readFully(buffer); + buf.put(buffer, buf.position() + buf.arrayOffset(), buf.remaining()); + } + + @Override + public void close() throws IOException { + randomAccessFile.close(); + } + }; + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtils.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtils.java new file mode 100644 index 000000000..3f78431ea --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtils.java @@ -0,0 +1,170 @@ +/* + * Copyright 2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.input.utils; + +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.kafka.common.config.ConfigException; + +import io.aiven.kafka.connect.common.source.task.Context; + +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * FilePatternUtils allows the construction of a regex pattern to extract the + * {@link io.aiven.kafka.connect.common.source.task.Context Context} from an Object Key. 
+ * + */ +public final class FilePatternUtils { + private static final Logger LOGGER = LoggerFactory.getLogger(FilePatternUtils.class); + public static final String PATTERN_PARTITION_KEY = "partition"; + public static final String PATTERN_TOPIC_KEY = "topic"; + public static final String PATTERN_START_OFFSET_KEY = "startOffset"; // no undercore allowed as it breaks the regex. + public static final String START_OFFSET_PATTERN = "{{start_offset}}"; + public static final String TIMESTAMP_PATTERN = "{{timestamp}}"; + public static final String PARTITION_PATTERN = "{{" + PATTERN_PARTITION_KEY + "}}"; + public static final String TOPIC_PATTERN = "{{" + PATTERN_TOPIC_KEY + "}}"; + + // Use a named group to return the partition in a complex string to always get the correct information for the + // partition number. + public static final String PARTITION_NAMED_GROUP_REGEX_PATTERN = "(?<" + PATTERN_PARTITION_KEY + ">\\d+)"; + public static final String START_OFFSET_NAMED_GROUP_REGEX_PATTERN = "(?<" + PATTERN_START_OFFSET_KEY + ">\\d+)"; + public static final String NUMBER_REGEX_PATTERN = "(?:\\d+)"; + public static final String TOPIC_NAMED_GROUP_REGEX_PATTERN = "(?<" + PATTERN_TOPIC_KEY + ">[a-zA-Z0-9\\-_.]+)"; + public static final String START_OFFSET = "Start offset"; + + final Pattern pattern; + private final boolean startOffsetConfigured; + private final boolean partitionConfigured; + private final boolean topicConfigured; + + /** + * Creates an instance of FilePatternUtils, this constructor is used to configure the Pattern that is used to + * extract Context from Object 'K'. + * + * @param pattern + */ + public FilePatternUtils(final String pattern) { + this.pattern = configurePattern(pattern); + startOffsetConfigured = pattern.contains(START_OFFSET_PATTERN); + partitionConfigured = pattern.contains(PARTITION_PATTERN); + topicConfigured = pattern.contains(TOPIC_PATTERN); + } + + /** + * Sets a Regex Pattern based on initial configuration that allows group regex to be used to extract information + * from the toString() of Object K which is passed in for Context extraction. + * + * @param expectedSourceNameFormat + * This is a string in the expected compatible format which will allow object name or keys to have unique + * information such as partition number, topic name, offset and timestamp information. + * @return A pattern which is configured to allow extraction of the key information from object names and keys. 
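A small usage sketch of the resulting pattern (the template and object key below are invented for illustration; only the placeholder syntax comes from this class):

import java.util.Optional;

import io.aiven.kafka.connect.common.source.task.Context;

// Sketch only: extract topic, partition and start offset from a made-up object key.
final FilePatternUtils utils = new FilePatternUtils("{{topic}}-{{partition}}-{{start_offset}}.txt");
final Optional<Context<String>> ctx = utils.process("orders-3-00042.txt");
ctx.flatMap(Context::getTopic);     // Optional[orders]
ctx.flatMap(Context::getPartition); // Optional[3]
ctx.flatMap(Context::getOffset);    // Optional[42]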
+ */ + private Pattern configurePattern(final String expectedSourceNameFormat) { + if (expectedSourceNameFormat == null) { + throw new ConfigException( + "Source name format is missing please configure the expected source to include the partition pattern."); + } + + // Build REGEX Matcher + String regexString = StringUtils.replace(expectedSourceNameFormat, START_OFFSET_PATTERN, + START_OFFSET_NAMED_GROUP_REGEX_PATTERN); + regexString = StringUtils.replace(regexString, TIMESTAMP_PATTERN, NUMBER_REGEX_PATTERN); + regexString = StringUtils.replace(regexString, TOPIC_PATTERN, TOPIC_NAMED_GROUP_REGEX_PATTERN); + regexString = StringUtils.replace(regexString, PARTITION_PATTERN, PARTITION_NAMED_GROUP_REGEX_PATTERN); + try { + return Pattern.compile(regexString); + } catch (IllegalArgumentException iae) { + throw new ConfigException( + String.format("Unable to compile the regex pattern %s to retrieve the partition id.", regexString), + iae); + } + } + + public > Optional> process(final K sourceName) { + final Optional matcher = fileMatches(sourceName.toString()); + if (matcher.isPresent()) { + final Context ctx = new Context<>(sourceName); + getTopic(matcher.get(), sourceName.toString()).ifPresent(ctx::setTopic); + getPartitionId(matcher.get(), sourceName.toString()).ifPresent(ctx::setPartition); + getOffset(matcher.get(), sourceName.toString()).ifPresent(ctx::setOffset); + return Optional.of(ctx); + } + return Optional.empty(); + + } + + private Optional fileMatches(final String sourceName) { + return matchPattern(sourceName); + } + + private Optional getTopic(final Matcher matcher, final String sourceName) { + + try { + return Optional.of(matcher.group(PATTERN_TOPIC_KEY)); + } catch (IllegalArgumentException ex) { + // It is possible that when checking for the group it does not match and returns an + // illegalArgumentException + if (topicConfigured) { + LOGGER.warn("Unable to extract Topic from {} and 'topics' not configured.", sourceName); + } + return Optional.empty(); + } + + } + + private Optional getPartitionId(final Matcher matcher, final String sourceName) { + try { + return Optional.of(Integer.parseInt(matcher.group(PATTERN_PARTITION_KEY))); + } catch (IllegalArgumentException e) { + // It is possible that when checking for the group it does not match and returns an + // illegalStateException, Number format exception is also covered by this in this case. + if (partitionConfigured) { + LOGGER.warn("Unable to extract Partition id from {}.", sourceName); + } + return Optional.empty(); + } + + } + + private Optional getOffset(final Matcher matcher, final String sourceName) { + try { + return Optional.of(Integer.parseInt(matcher.group(PATTERN_START_OFFSET_KEY))); + } catch (IllegalArgumentException e) { + // It is possible that when checking for the group it does not match and returns an + // illegalStateException, Number format exception is also covered by this in this case. + if (startOffsetConfigured) { + LOGGER.warn("Unable to extract start offset from {}.", sourceName); + } + return Optional.empty(); + } + + } + + private Optional matchPattern(final String sourceName) { + if (sourceName == null) { + throw new IllegalArgumentException("filePattern and sourceName must not be null"); + } + final Matcher matcher = pattern.matcher(sourceName); + return matcher.find() ? 
Optional.of(matcher) : Optional.empty(); + } + +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/Context.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/Context.java new file mode 100644 index 000000000..265ade6db --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/Context.java @@ -0,0 +1,71 @@ +/* + * Copyright 2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.task; + +import java.util.Optional; + +/** + * A Context which captures all the details about the source which are required to successfully send a source record + * onto Kafka + * + * @param + * is is the type/class of the key unique to the object the context is being created about + */ +public class Context> { + + private String topic; + private Integer partition; + private Integer offset; + private K storageKey; + + public Context(final K storageKey) { + + this.storageKey = storageKey; + } + + public Optional getTopic() { + return Optional.ofNullable(topic); + } + + public void setTopic(final String topic) { + this.topic = topic; + } + + public Optional getPartition() { + return Optional.ofNullable(partition); + } + + public void setPartition(final Integer partition) { + this.partition = partition; + } + + public Optional getStorageKey() { + return Optional.ofNullable(storageKey); + } + + public void setStorageKey(final K storageKey) { + this.storageKey = storageKey; + } + + public Optional getOffset() { + return Optional.ofNullable(offset); + } + + public void setOffset(final Integer offset) { + this.offset = offset; + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionStrategy.java new file mode 100644 index 000000000..8644889c0 --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionStrategy.java @@ -0,0 +1,70 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.task; + +import java.util.Optional; +import java.util.function.Function; + +/** + * An {@link DistributionStrategy} provides a mechanism to share the work of processing records from objects (or files) + * into tasks, which are subsequently processed (potentially in parallel) by Kafka Connect workers. + *

+ * The number of objects in cloud storage can be very high, selecting a distribution strategy allows the connector to + * know how to distribute the load across Connector tasks and in some cases using an appropriate strategy can also + * decide on maintaining a level of ordering between messages as well. + */ +public final class DistributionStrategy { + private int maxTasks; + private final Function, Optional> mutation; + public final static int UNDEFINED = -1; + + public DistributionStrategy(final Function, Optional> creator, final int maxTasks) { + assertPositiveInteger(maxTasks); + this.mutation = creator; + this.maxTasks = maxTasks; + } + + private static void assertPositiveInteger(final int sourceInt) { + if (sourceInt <= 0) { + throw new IllegalArgumentException("tasks.max must be set to a positive number and at least 1."); + } + } + + /** + * Retrieve the taskId that this object should be processed by. Any single object will be assigned deterministically + * to a single taskId, that will be always return the same taskId output given the same context is used. + * + * @param ctx + * This is the context which contains optional values for the partition, topic and storage key name + * @return the taskId which this particular task should be assigned to. + */ + public int getTaskFor(final Context ctx) { + return mutation.apply(ctx).map(aLong -> Math.floorMod(aLong, maxTasks)).orElse(UNDEFINED); + } + + /** + * When a connector receives a reconfigure event this method should be called to ensure that the distribution + * strategy is updated correctly. + * + * @param maxTasks + * The maximum number of tasks created for the Connector + */ + public void configureDistributionStrategy(final int maxTasks) { + assertPositiveInteger(maxTasks); + this.maxTasks = maxTasks; + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionType.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionType.java new file mode 100644 index 000000000..9010e8b8d --- /dev/null +++ b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/DistributionType.java @@ -0,0 +1,88 @@ +/* + * Copyright 2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.task; + +import java.util.Arrays; +import java.util.Objects; +import java.util.Optional; +import java.util.function.Function; + +import org.apache.kafka.common.config.ConfigException; + +public enum DistributionType { + + /** + * Object_Hash takes the context and uses the storage key implementation to get a hash value of the storage key and + * return a modulus of that relative to the number of maxTasks to decide which task should process a given object + */ + OBJECT_HASH("object_hash", + context -> context.getStorageKey().isPresent() + ? 
Optional.of((long) context.getStorageKey().get().hashCode()) + : Optional.empty()), + /** + * Partition takes the context and requires the context contain the partition id for it to be able to decide the + * distribution across the max tasks, using a modulus to ensure even distribution against the configured max tasks + */ + PARTITION("partition", + context -> context.getPartition().isPresent() + ? Optional.of((long) context.getPartition().get()) + : Optional.empty()); + + private final String name; + private final Function, Optional> mutation; + + public String value() { + return name; + } + + /** + * Get the Object distribution strategy for the configured ObjectDistributionStrategy + * + * @param name + * the name of the ObjectDistributionStrategy + * @param mutation + * the mutation required to get the correct details from the context for distribution + */ + DistributionType(final String name, final Function, Optional> mutation) { + this.name = name; + this.mutation = mutation; + } + + public static DistributionType forName(final String name) { + Objects.requireNonNull(name, "name cannot be null"); + for (final DistributionType distributionType : DistributionType.values()) { + if (distributionType.name.equalsIgnoreCase(name)) { + return distributionType; + } + } + throw new ConfigException(String.format("Unknown distribution.type : %s, allowed values %s ", name, + Arrays.toString(DistributionType.values()))); + } + + /** + * Returns a configured Distribution Strategy + * + * @param maxTasks + * the maximum number of configured tasks for this connector + * + * @return a configured Distribution Strategy with the correct mutation configured for proper distribution across + * tasks of objects being processed. + */ + public DistributionStrategy getDistributionStrategy(final int maxTasks) { + return new DistributionStrategy(mutation, maxTasks); + } +} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/HashObjectDistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/HashObjectDistributionStrategy.java deleted file mode 100644 index c39676ad0..000000000 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/HashObjectDistributionStrategy.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.common.source.task; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * {@link HashObjectDistributionStrategy} evenly distributes cloud storage objects between tasks using the hashcode of - * the object's filename, which is uniformly distributed and deterministic across workers. - *

- * This is well-suited to use cases where the order of events between records from objects is not important, especially - * when ingesting files into Kafka that were not previously created by a supported cloud storage Sink. - */ -public final class HashObjectDistributionStrategy implements ObjectDistributionStrategy { - private final static Logger LOG = LoggerFactory.getLogger(HashObjectDistributionStrategy.class); - private int maxTasks; - HashObjectDistributionStrategy(final int maxTasks) { - this.maxTasks = maxTasks; - } - - @Override - public boolean isPartOfTask(final int taskId, final String filenameToBeEvaluated) { - if (filenameToBeEvaluated == null) { - LOG.warn("Ignoring as it is not passing a correct filename to be evaluated."); - return false; - } - final int taskAssignment = Math.floorMod(filenameToBeEvaluated.hashCode(), maxTasks); - // floor mod returns the remainder of a division so will start at 0 and move up - // tasks start at 0 so there should be no issue. - return taskAssignment == taskId; - } - - @Override - public void reconfigureDistributionStrategy(final int maxTasks, final String expectedFormat) { - setMaxTasks(maxTasks); - } - - public void setMaxTasks(final int maxTasks) { - this.maxTasks = maxTasks; - } -} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/ObjectDistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/ObjectDistributionStrategy.java deleted file mode 100644 index 5925d880d..000000000 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/ObjectDistributionStrategy.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.common.source.task; - -/** - * An {@link ObjectDistributionStrategy} provides a mechanism to share the work of processing records from objects (or - * files) into tasks, which are subsequently processed (potentially in parallel) by Kafka Connect workers. - *

- * The number of objects in cloud storage can be very high, and they are distributed amongst tasks to minimize the - * overhead of assigning work to Kafka worker threads. All objects assigned to the same task will be processed together - * sequentially by the same worker, which can be useful for maintaining order between objects. There are usually fewer - * workers than tasks, and they will be assigned the remaining tasks as work completes. - */ -public interface ObjectDistributionStrategy { - - /** - * Check if the object should be processed by the task with the given {@code taskId}. Any single object should be - * assigned deterministically to a single taskId. - * - * @param taskId - * a task ID, usually for the currently running task - * @param valueToBeEvaluated - * The value to be evaluated to determine if it should be processed by the task. - * @return true if the task should process the object, false if it should not. - */ - boolean isPartOfTask(int taskId, String valueToBeEvaluated); - - /** - * When a connector receives a reconfigure event this method should be called to ensure that the distribution - * strategy is updated correctly. - * - * @param maxTasks - * The maximum number of tasks created for the Connector - * @param expectedFormat - * The expected format, of files, path, table names or other ways to partition the tasks. - */ - void reconfigureDistributionStrategy(int maxTasks, String expectedFormat); - - /** - * Check if the task is responsible for this set of files by checking if the given task matches the partition id. - * - * @param taskId - * the current running task - * @param partitionId - * The partitionId recovered from the file path. - * @return true if this task is responsible for this partition. false if it is not responsible for this task. - */ - default boolean taskMatchesPartition(final int taskId, final int partitionId) { - // The partition id and task id are both expected to start at 0 but if the task id is changed to start at 1 this - // will break. - return taskId == partitionId; - } - - /** - * In the event of more partitions existing then tasks configured, the task will be required to take up additional - * tasks that match. - * - * @param taskId - * the current running task. - * @param maxTasks - * The maximum number of configured tasks allowed to run for this connector. - * @param partitionId - * The partitionId recovered from the file path. - * @return true if the task supplied should handle the supplied partition - */ - default boolean taskMatchesModOfPartitionAndMaxTask(final int taskId, final int maxTasks, final int partitionId) { - - return taskMatchesPartition(taskId, partitionId % maxTasks); - } - - default boolean toBeProcessedByThisTask(final int taskId, final int maxTasks, final int partitionId) { - return partitionId < maxTasks - ? 
taskMatchesPartition(taskId, partitionId) - : taskMatchesModOfPartitionAndMaxTask(taskId, maxTasks, partitionId); - - } -} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategy.java deleted file mode 100644 index f74e56826..000000000 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategy.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.common.source.task; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.kafka.common.config.ConfigException; - -import org.codehaus.plexus.util.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * The {@link PartitionInFilenameDistributionStrategy} finds a partition in the object's filename by matching it to an - * expected format, and assigns all partitions to the same task. - *

- * This useful when a sink connector has created the object name in a format like - * {@code topicname-{{partition}}-{{start_offset}}}, and we want all objects with the same partition to be processed - * within a single task. - */ -public final class PartitionInFilenameDistributionStrategy implements ObjectDistributionStrategy { - private final static Logger LOG = LoggerFactory.getLogger(PartitionInFilenameDistributionStrategy.class); - private final static String NUMBER_REGEX_PATTERN = "(\\d)+"; - // Use a named group to return the partition in a complex string to always get the correct information for the - // partition number. - private final static String PARTITION_NAMED_GROUP_REGEX_PATTERN = "(?\\d)+"; - private final static String PARTITION_PATTERN = "\\{\\{partition}}"; - private final static String START_OFFSET_PATTERN = "\\{\\{start_offset}}"; - private final static String TIMESTAMP_PATTERN = "\\{\\{timestamp}}"; - public static final String PARTITION = "partition"; - private Pattern partitionPattern; - - private int maxTasks; - - PartitionInFilenameDistributionStrategy(final int maxTasks, final String expectedSourceNameFormat) { - configureDistributionStrategy(maxTasks, expectedSourceNameFormat); - } - - /** - * - * @param sourceNameToBeEvaluated - * is the filename/table name of the source for the connector. - * @return Predicate to confirm if the given source name matches - */ - @Override - public boolean isPartOfTask(final int taskId, final String sourceNameToBeEvaluated) { - if (sourceNameToBeEvaluated == null) { - LOG.warn("Ignoring as it is not passing a correct filename to be evaluated."); - return false; - } - final Matcher match = partitionPattern.matcher(sourceNameToBeEvaluated); - if (match.find()) { - return toBeProcessedByThisTask(taskId, maxTasks, Integer.parseInt(match.group(PARTITION))); - } - LOG.warn("Unable to find the partition from this file name {}", sourceNameToBeEvaluated); - return false; - } - - /** - * When a connector reconfiguration event is received this method should be called to ensure the correct strategy is - * being implemented by the connector. - * - * @param maxTasks - * maximum number of configured tasks for this connector - * @param expectedSourceNameFormat - * what the format of the source should appear like so to configure the task distribution. 
- */ - @Override - public void reconfigureDistributionStrategy(final int maxTasks, final String expectedSourceNameFormat) { - configureDistributionStrategy(maxTasks, expectedSourceNameFormat); - } - - private void configureDistributionStrategy(final int maxTasks, final String expectedSourceNameFormat) { - if (expectedSourceNameFormat == null || !expectedSourceNameFormat.contains(PARTITION_PATTERN)) { - throw new ConfigException(String.format( - "Source name format %s missing partition pattern {{partition}}, please configure the expected source to include the partition pattern.", - expectedSourceNameFormat)); - } - setMaxTasks(maxTasks); - // Build REGEX Matcher - String regexString = StringUtils.replace(expectedSourceNameFormat, START_OFFSET_PATTERN, NUMBER_REGEX_PATTERN); - regexString = StringUtils.replace(regexString, TIMESTAMP_PATTERN, NUMBER_REGEX_PATTERN); - regexString = StringUtils.replace(regexString, PARTITION_PATTERN, PARTITION_NAMED_GROUP_REGEX_PATTERN); - try { - partitionPattern = Pattern.compile(regexString); - } catch (IllegalArgumentException iae) { - throw new ConfigException( - String.format("Unable to compile the regex pattern %s to retrieve the partition id.", regexString), - iae); - } - } - - private void setMaxTasks(final int maxTasks) { - this.maxTasks = maxTasks; - } - -} diff --git a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategy.java b/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategy.java deleted file mode 100644 index 85e1c3e75..000000000 --- a/commons/src/main/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategy.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.common.source.task; - -import org.apache.kafka.common.config.ConfigException; -import org.apache.kafka.connect.errors.ConnectException; - -import org.apache.commons.lang3.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * The {@link PartitionInPathDistributionStrategy} finds a partition number in the path by matching a - * {@code {{partition}} } marker in the path. - *

- * This useful when a sink connector has created the object name in a path like - * {@code /PREFIX/partition={{partition}}/YYYY/MM/DD/mm/}}, and we want all objects with the same partition to be - * processed within a single task. - *

- * Partitions are evenly distributed between tasks. For example, in Connect with 10 Partitions and 3 tasks: - * - *

- *   | Task | Partitions |
- *   |------|------------|
- *   | 0    | 0, 3, 6, 9 |
- *   | 1    | 1, 4, 7    |
- *   | 2    | 2, 5, 8    |
- * 
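The same mapping can be reproduced with the DistributionStrategy and DistributionType API added earlier in this change; a minimal sketch (the storage key is invented for illustration):

import io.aiven.kafka.connect.common.source.task.Context;
import io.aiven.kafka.connect.common.source.task.DistributionStrategy;
import io.aiven.kafka.connect.common.source.task.DistributionType;

// Sketch only: with 3 tasks, partition 7 lands on task 1, matching the table above.
final DistributionStrategy strategy = DistributionType.PARTITION.getDistributionStrategy(3);
final Context<String> ctx = new Context<>("prefix/partition=7/2024/01/01/object.bin");
ctx.setPartition(7);
final int taskId = strategy.getTaskFor(ctx); // Math.floorMod(7, 3) == 1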
- */ -public final class PartitionInPathDistributionStrategy implements ObjectDistributionStrategy { - public static final String PARTITION_ID_PATTERN = "\\{\\{partition}}"; - private final static Logger LOG = LoggerFactory.getLogger(PartitionInPathDistributionStrategy.class); - - private String prefix; - private int maxTasks; - - PartitionInPathDistributionStrategy(final int maxTasks, final String expectedPathFormat) { - configureDistributionStrategy(maxTasks, expectedPathFormat); - } - - @Override - public boolean isPartOfTask(final int taskId, final String pathToBeEvaluated) { - if (pathToBeEvaluated == null || !pathToBeEvaluated.startsWith(prefix)) { - LOG.warn("Ignoring path {}, does not contain the preconfigured prefix {} set up at startup", - pathToBeEvaluated, prefix); - return false; - } - final String modifiedPath = StringUtils.substringAfter(pathToBeEvaluated, prefix); - if (!modifiedPath.contains("/")) { - LOG.warn("Ignoring path {}, does not contain any sub folders after partitionId prefix {}", - pathToBeEvaluated, prefix); - return false; - } - final String partitionId = StringUtils.substringBefore(modifiedPath, "/"); - - try { - return toBeProcessedByThisTask(taskId, maxTasks, Integer.parseInt(partitionId)); - } catch (NumberFormatException ex) { - throw new ConnectException(String - .format("Unexpected non integer value found parsing path for partitionId: %s", pathToBeEvaluated)); - } - } - - /** - * - * @param maxTasks - * The maximum number of configured tasks for this - * @param expectedPathFormat - * The format of the path and where to identify - */ - @Override - public void reconfigureDistributionStrategy(final int maxTasks, final String expectedPathFormat) { - configureDistributionStrategy(maxTasks, expectedPathFormat); - } - - private void configureDistributionStrategy(final int maxTasks, final String expectedPathFormat) { - setMaxTasks(maxTasks); - - if (StringUtils.isEmpty(expectedPathFormat) || !expectedPathFormat.contains(PARTITION_ID_PATTERN)) { - throw new ConfigException(String.format( - "Expected path format %s is missing the identifier '%s' to correctly select the partition", - expectedPathFormat, PARTITION_ID_PATTERN)); - } - prefix = StringUtils.substringBefore(expectedPathFormat, PARTITION_ID_PATTERN); - } - - private void setMaxTasks(final int maxTasks) { - this.maxTasks = maxTasks; - } - -} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java new file mode 100644 index 000000000..92fbddf46 --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/AbstractSourceTaskTest.java @@ -0,0 +1,151 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; +import static org.awaitility.Awaitility.await; + +import java.time.Duration; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.apache.commons.lang3.time.StopWatch; +import org.junit.jupiter.api.Test; + +class AbstractSourceTaskTest { + + /** + * The amount of extra time that we will allow for timing errors. + */ + private static final long TIMING_DELTA_MS = 250; + + @Test + void timerTest() { + final AbstractSourceTask.Timer timer = new AbstractSourceTask.Timer(Duration.ofSeconds(1)); + assertThat(timer.millisecondsRemaining()).isEqualTo(Duration.ofSeconds(1).toMillis()); + timer.start(); + await().atMost(Duration.ofSeconds(2)).until(timer::isExpired); + assertThat(timer.millisecondsRemaining()).isLessThan(0); + timer.stop(); + assertThat(timer.millisecondsRemaining()).isEqualTo(Duration.ofSeconds(1).toMillis()); + } + + @Test + void timerSequenceTest() { + final AbstractSourceTask.Timer timer = new AbstractSourceTask.Timer(Duration.ofSeconds(1)); + // stopped state does not allow stop + assertThatExceptionOfType(IllegalStateException.class).as("stop while not running") + .isThrownBy(timer::stop) + .withMessageStartingWith("Timer: "); + timer.reset(); // verify that an exception is not thrown. + + // started state does not allow start + timer.start(); + assertThatExceptionOfType(IllegalStateException.class).as("start while running") + .isThrownBy(timer::start) + .withMessageStartingWith("Timer: "); + timer.reset(); + timer.start(); // restart the timer. + timer.stop(); + + // stopped state does not allow stop or start + assertThatExceptionOfType(IllegalStateException.class).as("stop after stop") + .isThrownBy(timer::stop) + .withMessageStartingWith("Timer: "); + assertThatExceptionOfType(IllegalStateException.class).as("start after stop") + .isThrownBy(timer::start) + .withMessageStartingWith("Timer: "); + timer.reset(); + + // stopped + reset does not allow stop. + assertThatExceptionOfType(IllegalStateException.class).as("stop after reset (1)") + .isThrownBy(timer::stop) + .withMessageStartingWith("Timer: "); + timer.start(); + timer.reset(); + + // started + reset does not allow stop; + assertThatExceptionOfType(IllegalStateException.class).as("stop after reset (2)") + .isThrownBy(timer::stop) + .withMessageStartingWith("Timer: "); + } + + @Test + void backoffTest() throws InterruptedException { + final AbstractSourceTask.Timer timer = new AbstractSourceTask.Timer(Duration.ofSeconds(1)); + final AbstractSourceTask.Backoff backoff = new AbstractSourceTask.Backoff(timer.getBackoffConfig()); + final long estimatedDelay = backoff.estimatedDelay(); + assertThat(estimatedDelay).isLessThan(500); + + // execute delay without timer running. 
+ final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + backoff.delay(); + stopWatch.stop(); + assertThat(stopWatch.getTime()).as("Result without timer running") + .isBetween(estimatedDelay - backoff.getMaxJitter() - TIMING_DELTA_MS, + estimatedDelay + backoff.getMaxJitter() + TIMING_DELTA_MS); + + timer.start(); + for (int i = 0; i < 9; i++) { + stopWatch.reset(); + timer.reset(); + timer.start(); + stopWatch.start(); + await().atMost(Duration.ofSeconds(2)).until(() -> { + backoff.delay(); + return backoff.estimatedDelay() == 0 || timer.isExpired(); + }); + stopWatch.stop(); + timer.stop(); + final int step = i; + if (!timer.isExpired()) { + assertThat(stopWatch.getTime()).as(() -> String.format("Result with timer running at step %s", step)) + .isBetween(Duration.ofSeconds(1).toMillis() - backoff.getMaxJitter() - TIMING_DELTA_MS, + Duration.ofSeconds(1).toMillis() + backoff.getMaxJitter() + TIMING_DELTA_MS); + } + } + } + + @Test + void backoffIncrementalTimeTest() throws InterruptedException { + final AtomicBoolean abortTrigger = new AtomicBoolean(); + // delay increases in powers of 2. + final long maxDelay = 1000; // not a power of 2 + final AbstractSourceTask.BackoffConfig config = new AbstractSourceTask.BackoffConfig() { + @Override + public AbstractSourceTask.SupplierOfLong getSupplierOfTimeRemaining() { + return () -> maxDelay; + } + + @Override + public AbstractSourceTask.AbortTrigger getAbortTrigger() { + return () -> abortTrigger.set(true); + } + }; + + final AbstractSourceTask.Backoff backoff = new AbstractSourceTask.Backoff(config); + long expected = 2; + while (backoff.estimatedDelay() < maxDelay) { + assertThat(backoff.estimatedDelay()).isEqualTo(expected); + backoff.delay(); + expected *= 2; + } + assertThat(backoff.estimatedDelay()).isEqualTo(maxDelay); + assertThat(abortTrigger).isFalse(); + } +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java new file mode 100644 index 000000000..617dd290a --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/AvroTransformerTest.java @@ -0,0 +1,169 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.source.input; + +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.when; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.data.Struct; + +import io.aiven.kafka.connect.common.config.SourceCommonConfig; + +import io.confluent.connect.avro.AvroData; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumWriter; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +@ExtendWith(MockitoExtension.class) +final class AvroTransformerTest { + + @Mock + private SourceCommonConfig sourceCommonConfig; + + private AvroTransformer avroTransformer; + private Map config; + + @BeforeEach + void setUp() { + avroTransformer = new AvroTransformer(new AvroData(100)); + config = new HashMap<>(); + } + + @Test + void testConfigureValueConverter() { + final String value = "http://localhost:8081"; + when(sourceCommonConfig.getString(SCHEMA_REGISTRY_URL)).thenReturn(value); + avroTransformer.configureValueConverter(config, sourceCommonConfig); + assertThat(config.get(SCHEMA_REGISTRY_URL)).isEqualTo("http://localhost:8081") + .describedAs("The schema registry URL should be correctly set in the config."); + } + + @Test + void testReadAvroRecordsInvalidData() { + final InputStream inputStream = new ByteArrayInputStream("mock-avro-data".getBytes(StandardCharsets.UTF_8)); + + final Stream records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + 0); + + final List recs = records.collect(Collectors.toList()); + assertThat(recs).isEmpty(); + } + + @Test + void testReadAvroRecords() throws Exception { + final ByteArrayOutputStream avroData = generateMockAvroData(25); + final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); + + final List expected = new ArrayList<>(); + for (int i = 0; i < 25; i++) { + expected.add("Hello, Kafka Connect S3 Source! object " + i); + } + + final Stream records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + 0); + + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Struct) sv).getString("message")) + .containsExactlyElementsOf(expected); + } + + @Test + void testReadAvroRecordsSkipFew() throws Exception { + final ByteArrayOutputStream avroData = generateMockAvroData(20); + final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); + + final List expected = new ArrayList<>(); + for (int i = 5; i < 20; i++) { + expected.add("Hello, Kafka Connect S3 Source! 
object " + i); + } + final Stream records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + 5); + + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Struct) sv).getString("message")) + .containsExactlyElementsOf(expected); + } + + @Test + void testReadAvroRecordsSkipMoreRecordsThanExist() throws Exception { + final ByteArrayOutputStream avroData = generateMockAvroData(20); + final InputStream inputStream = new ByteArrayInputStream(avroData.toByteArray()); + + final Stream records = avroTransformer.getRecords(() -> inputStream, "", 0, sourceCommonConfig, + 25); + + assertThat(records).isEmpty(); + } + + static ByteArrayOutputStream generateMockAvroData(final int numRecs) throws IOException { + final String schemaJson = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" + + " \"fields\": [\n" + " {\"name\": \"message\", \"type\": \"string\"},\n" + + " {\"name\": \"id\", \"type\": \"int\"}\n" + " ]\n" + "}"; + final Schema.Parser parser = new Schema.Parser(); + final Schema schema = parser.parse(schemaJson); + + return getAvroRecords(schema, numRecs); + } + + private static ByteArrayOutputStream getAvroRecords(final Schema schema, final int numOfRecs) throws IOException { + // Create Avro records + final List avroRecords = new ArrayList<>(); + for (int i = 0; i < numOfRecs; i++) { + final GenericRecord avroRecord = new GenericData.Record(schema); // NOPMD AvoidInstantiatingObjectsInLoops + avroRecord.put("message", "Hello, Kafka Connect S3 Source! object " + i); + avroRecord.put("id", i); + avroRecords.add(avroRecord); + } + + // Serialize Avro records to byte arrays + final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + final DatumWriter datumWriter = new GenericDatumWriter<>(schema); + + // Append each record using a loop + try (DataFileWriter dataFileWriter = new DataFileWriter<>(datumWriter)) { + dataFileWriter.create(schema, outputStream); + for (final GenericRecord record : avroRecords) { + dataFileWriter.append(record); + } + dataFileWriter.flush(); + } + outputStream.close(); + return outputStream; + } +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java new file mode 100644 index 000000000..80820e13b --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ByteArrayTransformerTest.java @@ -0,0 +1,77 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.source.input; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.kafka.connect.data.SchemaAndValue; + +import io.aiven.kafka.connect.common.config.SourceCommonConfig; + +import org.apache.commons.io.function.IOSupplier; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +@ExtendWith(MockitoExtension.class) +final class ByteArrayTransformerTest { + + public static final String TEST_TOPIC = "test-topic"; + private ByteArrayTransformer byteArrayTransformer; + + @Mock + private SourceCommonConfig sourceCommonConfig; + + @BeforeEach + void setUp() { + byteArrayTransformer = new ByteArrayTransformer(); + } + + @Test + void testGetRecordsSingleChunk() { + final byte[] data = { 1, 2, 3, 4, 5 }; + final InputStream inputStream = new ByteArrayInputStream(data); + final IOSupplier inputStreamIOSupplier = () -> inputStream; + + final Stream records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, + sourceCommonConfig, 0); + + final List recs = records.collect(Collectors.toList()); + assertThat(recs).hasSize(1); + assertThat(recs.get(0).value()).isEqualTo(data); + } + + @Test + void testGetRecordsEmptyInputStream() { + final InputStream inputStream = new ByteArrayInputStream(new byte[] {}); + + final IOSupplier inputStreamIOSupplier = () -> inputStream; + + final Stream records = byteArrayTransformer.getRecords(inputStreamIOSupplier, TEST_TOPIC, 0, + sourceCommonConfig, 0); + + assertThat(records).hasSize(0); + } +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ContentUtils.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ContentUtils.java new file mode 100644 index 000000000..4b82f0a63 --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ContentUtils.java @@ -0,0 +1,99 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.common.source.input; + +import static org.apache.kafka.connect.data.Schema.INT32_SCHEMA; +import static org.apache.kafka.connect.data.Schema.STRING_SCHEMA; + +import java.io.IOException; +import java.net.ConnectException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.kafka.common.record.TimestampType; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.sink.SinkRecord; + +import io.aiven.kafka.connect.common.config.OutputField; +import io.aiven.kafka.connect.common.config.OutputFieldEncodingType; +import io.aiven.kafka.connect.common.config.OutputFieldType; +import io.aiven.kafka.connect.common.output.parquet.ParquetOutputWriter; + +public final class ContentUtils { + private ContentUtils() { + } + public static Path getTmpFilePath(final String name1) throws IOException { + final String tmpFile = "users.parquet"; + final Path parquetFileDir = Files.createTempDirectory("parquet_tests"); + final String parquetFilePath = parquetFileDir.toAbsolutePath() + "/" + tmpFile; + + writeParquetFile(parquetFilePath, name1); + return Paths.get(parquetFilePath); + } + + public static void writeParquetFile(final String tempFilePath, final String name1) throws IOException { + // Define the Avro schema + final Schema schema = SchemaBuilder.struct() + .field("name", STRING_SCHEMA) + .field("age", INT32_SCHEMA) + .field("email", STRING_SCHEMA) + .build(); + // Write the Parquet file + try { + writeParquetFile(tempFilePath, schema, name1, 100); + } catch (IOException e) { + throw new ConnectException("Error writing parquet file"); + } + } + + @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") + private static void writeParquetFile(final String outputPath, final Schema schema, final String name1, + final int numOfRecords) throws IOException { + + final List allParquetRecords = new ArrayList<>(); + // Write records to the Parquet file + for (int i = 0; i < numOfRecords; i++) { + allParquetRecords + .add(new Struct(schema).put("name", name1 + i).put("age", 30).put("email", name1 + "@test")); + } + + // Create a Parquet writer + final Path outputFilePath = Paths.get(outputPath); + try (var outputStream = Files.newOutputStream(outputFilePath.toAbsolutePath()); + var parquetWriter = new ParquetOutputWriter( + List.of(new OutputField(OutputFieldType.VALUE, OutputFieldEncodingType.NONE)), outputStream, + Collections.emptyMap(), false)) { + int counter = 0; + final var sinkRecords = new ArrayList(); + for (final var r : allParquetRecords) { + final var sinkRecord = new SinkRecord( // NOPMD AvoidInstantiatingObjectsInLoops + "some-topic", 1, STRING_SCHEMA, "some-key-" + counter, schema, r, 100L, 1000L + counter, + TimestampType.CREATE_TIME, null); + sinkRecords.add(sinkRecord); + counter++; + } + parquetWriter.writeRecords(sinkRecords); + } + + } +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java new file mode 100644 index 000000000..e482fd61c --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/JsonTransformerTest.java @@ -0,0 +1,152 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.input; + +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMAS_ENABLE; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; + +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.json.JsonConverter; + +import io.aiven.kafka.connect.common.config.SourceCommonConfig; + +import org.apache.commons.io.function.IOSupplier; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +@ExtendWith(MockitoExtension.class) +final class JsonTransformerTest { + + public static final String TESTTOPIC = "testtopic"; + JsonTransformer jsonTransformer; + + SourceCommonConfig sourceCommonConfig; + + @Mock + private IOSupplier inputStreamIOSupplierMock; + + JsonConverter jsonConverter; + + @BeforeEach + void setUp() { + jsonConverter = new JsonConverter(); + final Map config = new HashMap<>(); + config.put(SCHEMAS_ENABLE, "false"); + jsonConverter.configure(config, false); + + jsonTransformer = new JsonTransformer(jsonConverter); + sourceCommonConfig = mock(SourceCommonConfig.class); + } + + @AfterEach + void destroy() { + jsonConverter.close(); + } + + @Test + void testHandleValueDataWithValidJson() { + final InputStream validJsonInputStream = new ByteArrayInputStream( + getJsonRecs(100).getBytes(StandardCharsets.UTF_8)); + + final List expected = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + expected.add("value" + i); + } + + final Stream records = jsonTransformer.getRecords(() -> validJsonInputStream, TESTTOPIC, 1, + sourceCommonConfig, 0); + + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Map) sv).get("key")) + .containsExactlyElementsOf(expected); + } + + @Test + void testHandleValueDataWithValidJsonSkipFew() { + final InputStream validJsonInputStream = new ByteArrayInputStream( + getJsonRecs(100).getBytes(StandardCharsets.UTF_8)); + + final List expected = new ArrayList<>(); + for (int i = 25; i < 100; i++) { + expected.add("value" + i); + } + + final Stream records = jsonTransformer.getRecords(() -> validJsonInputStream, TESTTOPIC, 1, + sourceCommonConfig, 25L); + + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Map) sv).get("key")) + .containsExactlyElementsOf(expected); + + } + + @Test + void testHandleValueDataWithInvalidJson() { + final InputStream invalidJsonInputStream = new ByteArrayInputStream( + "invalid-json".getBytes(StandardCharsets.UTF_8)); + final IOSupplier 
inputStreamIOSupplier = () -> invalidJsonInputStream; + + final Stream jsonNodes = jsonTransformer.getRecords(inputStreamIOSupplier, TESTTOPIC, 1, + sourceCommonConfig, 0); + + assertThat(jsonNodes).isEmpty(); + + } + + @Test + void testGetRecordsWithIOException() throws IOException { + when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException")); + final Stream resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); + + assertThat(resultStream).isEmpty(); + } + + @Test + void testCustomSpliteratorWithIOExceptionDuringInitialization() throws IOException { + when(inputStreamIOSupplierMock.get()).thenThrow(new IOException("Test IOException during initialization")); + final Stream resultStream = jsonTransformer.getRecords(inputStreamIOSupplierMock, "topic", 0, null, 0); + + assertThat(resultStream).isEmpty(); + } + + static String getJsonRecs(final int recordCount) { + final StringBuilder jsonRecords = new StringBuilder(); + for (int i = 0; i < recordCount; i++) { + jsonRecords.append(String.format("{\"key\":\"value%d\"}", i)); + if (i < recordCount) { + jsonRecords.append("\n"); // NOPMD AppendCharacterWithChar + } + } + return jsonRecords.toString(); + } +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java new file mode 100644 index 000000000..2f7a405fe --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/ParquetTransformerTest.java @@ -0,0 +1,177 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
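// Illustrative sketch (not part of this change): the conversion step the JsonTransformer tests
// above rely on. With "schemas.enable" set to "false", Kafka Connect's JsonConverter turns a
// single JSONL line into a schemaless SchemaAndValue whose value is a Map, which is where the
// assertions above read the "key" field from.
import java.nio.charset.StandardCharsets;
import java.util.Map;

import org.apache.kafka.connect.data.SchemaAndValue;
import org.apache.kafka.connect.json.JsonConverter;

final class JsonLineConversionSketch {
    public static void main(final String[] args) {
        final JsonConverter converter = new JsonConverter();
        converter.configure(Map.of("schemas.enable", "false"), false);
        final SchemaAndValue schemaAndValue = converter.toConnectData("testtopic",
                "{\"key\":\"value0\"}".getBytes(StandardCharsets.UTF_8));
        // Prints a Map such as {key=value0}; the schema is null because schemas are disabled.
        System.out.println(schemaAndValue.value());
        converter.close();
    }
}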
+ */ + +package io.aiven.kafka.connect.common.source.input; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.data.Struct; + +import io.aiven.kafka.connect.common.config.SourceCommonConfig; + +import io.confluent.connect.avro.AvroData; +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.function.IOSupplier; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mockito; +import org.mockito.junit.jupiter.MockitoExtension; + +@ExtendWith(MockitoExtension.class) +final class ParquetTransformerTest { + private ParquetTransformer parquetTransformer; + + @BeforeEach + public void setUp() { + parquetTransformer = new ParquetTransformer(new AvroData(100)); + } + + @Test + void testHandleValueDataWithZeroBytes() { + final byte[] mockParquetData = new byte[0]; + final InputStream inputStream = new ByteArrayInputStream(mockParquetData); + final IOSupplier inputStreamIOSupplier = () -> inputStream; + final SourceCommonConfig s3SourceConfig = mock(SourceCommonConfig.class); + + final String topic = "test-topic"; + final int topicPartition = 0; + final Stream recs = parquetTransformer.getRecords(inputStreamIOSupplier, topic, topicPartition, + s3SourceConfig, 0L); + + assertThat(recs).isEmpty(); + } + + @Test + void testGetRecordsWithValidData() throws Exception { + final byte[] mockParquetData = generateMockParquetData(); + final InputStream inputStream = new ByteArrayInputStream(mockParquetData); + final IOSupplier inputStreamIOSupplier = () -> inputStream; + final SourceCommonConfig s3SourceConfig = mock(SourceCommonConfig.class); + + final String topic = "test-topic"; + final int topicPartition = 0; + final List expected = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + expected.add("name" + i); + } + final List records = parquetTransformer + .getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig, 0L) + .collect(Collectors.toList()); + + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Struct) sv).getString("name")) + .containsExactlyElementsOf(expected); + } + + @Test + void testGetRecordsWithValidDataSkipFew() throws Exception { + final byte[] mockParquetData = generateMockParquetData(); + final InputStream inputStream = new ByteArrayInputStream(mockParquetData); + final IOSupplier inputStreamIOSupplier = () -> inputStream; + final SourceCommonConfig s3SourceConfig = mock(SourceCommonConfig.class); + + final String topic = "test-topic"; + final int topicPartition = 0; + + final List expected = new ArrayList<>(); + for (int i = 25; i < 100; i++) { + expected.add("name" + i); + } + + final List records = parquetTransformer + .getRecords(inputStreamIOSupplier, topic, topicPartition, s3SourceConfig, 25L) + .collect(Collectors.toList()); + + assertThat(records).extracting(SchemaAndValue::value) + .extracting(sv -> ((Struct) 
sv).getString("name")) + .containsExactlyElementsOf(expected); + } + + @Test + void testGetRecordsWithInvalidData() { + final byte[] invalidData = "invalid data".getBytes(StandardCharsets.UTF_8); + final InputStream inputStream = new ByteArrayInputStream(invalidData); + final IOSupplier inputStreamIOSupplier = () -> inputStream; + + final SourceCommonConfig s3SourceConfig = mock(SourceCommonConfig.class); + + final String topic = "test-topic"; + final int topicPartition = 0; + + final Stream records = parquetTransformer.getRecords(inputStreamIOSupplier, topic, + topicPartition, s3SourceConfig, 0L); + assertThat(records).isEmpty(); + } + + @Test + void testTemporaryFileDeletion() throws Exception { + final Path tempFile = Files.createTempFile("test-file", ".parquet"); + assertThat(Files.exists(tempFile)).isTrue(); + + ParquetTransformer.deleteTmpFile(tempFile); + assertThat(Files.exists(tempFile)).isFalse(); + } + + static byte[] generateMockParquetData() throws IOException { + final Path path = ContentUtils.getTmpFilePath("name"); + return IOUtils.toByteArray(Files.newInputStream(path)); + } + + @Test + void testIOExceptionCreatingTempFile() { + try (var mockStatic = Mockito.mockStatic(File.class)) { + mockStatic.when(() -> File.createTempFile(anyString(), anyString())) + .thenThrow(new IOException("Test IOException for temp file")); + + final IOSupplier inputStreamSupplier = mock(IOSupplier.class); + final Stream resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", + 1, null, 0L); + + assertThat(resultStream).isEmpty(); + } + } + + @Test + void testIOExceptionDuringDataCopy() throws IOException { + try (InputStream inputStreamMock = mock(InputStream.class)) { + when(inputStreamMock.read(any(byte[].class))).thenThrow(new IOException("Test IOException during copy")); + + final IOSupplier inputStreamSupplier = () -> inputStreamMock; + final Stream resultStream = parquetTransformer.getRecords(inputStreamSupplier, "test-topic", + 1, null, 0L); + + assertThat(resultStream).isEmpty(); + } + } +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java new file mode 100644 index 000000000..73b27b01f --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/TransformerStreamingTest.java @@ -0,0 +1,151 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
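// Illustrative sketch (not part of this diff), assuming the signatures exercised by the streaming
// tests below: a Transformer obtained from TransformerFactory reads lazily from the supplied
// IOSupplier<InputStream> and yields a Stream of SchemaAndValue records; the tests verify that the
// underlying input is closed once the stream has been consumed.
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.stream.Stream;

import org.apache.kafka.common.config.ConfigDef;
import org.apache.kafka.connect.data.SchemaAndValue;

import io.aiven.kafka.connect.common.config.CommonConfig;
import io.aiven.kafka.connect.common.source.input.InputFormat;
import io.aiven.kafka.connect.common.source.input.Transformer;
import io.aiven.kafka.connect.common.source.input.TransformerFactory;

final class TransformerUsageSketch {
    public static void main(final String[] args) {
        final Transformer transformer = TransformerFactory.getTransformer(InputFormat.BYTES);
        final byte[] data = "Hello World".getBytes(StandardCharsets.UTF_8);
        // Anonymous subclass mirrors how the tests below build a minimal CommonConfig.
        final CommonConfig config = new CommonConfig(new ConfigDef(), new HashMap<>()) {
        };
        try (Stream<SchemaAndValue> records = transformer.getRecords(() -> new ByteArrayInputStream(data),
                "topic", 1, config, 0)) {
            // The BYTES transformer yields a single record for this input.
            System.out.println(records.count());
        }
    }
}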
+ */ + +package io.aiven.kafka.connect.common.source.input; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Stream; + +import org.apache.kafka.common.config.AbstractConfig; +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.connect.data.SchemaAndValue; + +import io.aiven.kafka.connect.common.config.CommonConfig; + +import org.apache.commons.io.function.IOSupplier; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +/** + * Abstract test class to verify that streaming data is closed properly. + */ +class TransformerStreamingTest { + + @ParameterizedTest + @MethodSource("testData") + void verifyExceptionDuringIOOpen(final Transformer transformer, final byte[] testData, final AbstractConfig config, + final int expectedCount) throws IOException { + final IOSupplier ioSupplier = mock(IOSupplier.class); + when(ioSupplier.get()).thenThrow(new IOException("Test IOException during initialization")); + final Stream objStream = transformer.getRecords(ioSupplier, "topic", 1, config, 0); + assertThat(objStream).isEmpty(); + } + + @ParameterizedTest + @MethodSource("testData") + void verifyExceptionDuringRead(final Transformer transformer, final byte[] testData, final AbstractConfig config, + final int expectedCount) throws IOException { + try (InputStream inputStream = mock(InputStream.class)) { + when(inputStream.read()).thenThrow(new IOException("Test IOException during read")); + when(inputStream.read(any())).thenThrow(new IOException("Test IOException during read")); + when(inputStream.read(any(), anyInt(), anyInt())) + .thenThrow(new IOException("Test IOException during read")); + when(inputStream.readNBytes(any(), anyInt(), anyInt())) + .thenThrow(new IOException("Test IOException during read")); + when(inputStream.readNBytes(anyInt())).thenThrow(new IOException("Test IOException during read")); + when(inputStream.readAllBytes()).thenThrow(new IOException("Test IOException during read")); + try (CloseTrackingStream stream = new CloseTrackingStream(inputStream)) { + final Stream objStream = transformer.getRecords(() -> stream, "topic", 1, config, 0); + assertThat(objStream).isEmpty(); + assertThat(stream.closeCount).isGreaterThan(0); + } + } + } + + @ParameterizedTest + @MethodSource("testData") + void verifyCloseCalledAtEnd(final Transformer transformer, final byte[] testData, final AbstractConfig config, + final int expectedCount) throws IOException { + final CloseTrackingStream stream = new CloseTrackingStream(new ByteArrayInputStream(testData)); + final Stream objStream = transformer.getRecords(() -> stream, "topic", 1, config, 0); + final long count = objStream.count(); + assertThat(count).isEqualTo(expectedCount); + assertThat(stream.closeCount).isGreaterThan(0); + } + + @ParameterizedTest + @MethodSource("testData") + void verifyCloseCalledAtIteratorEnd(final Transformer transformer, final byte[] testData, + final AbstractConfig config, final int expectedCount) throws IOException { + final 
CloseTrackingStream stream = new CloseTrackingStream(new ByteArrayInputStream(testData)); + final Stream objStream = transformer.getRecords(() -> stream, "topic", 1, config, 0); + final Iterator iter = objStream.iterator(); + long count = 0L; + while (iter.hasNext()) { + count += 1; + iter.next(); + } + assertThat(count).isEqualTo(expectedCount); + assertThat(stream.closeCount).isGreaterThan(0); + } + + static Stream testData() throws IOException { + final List lst = new ArrayList<>(); + lst.add(Arguments.of(TransformerFactory.getTransformer(InputFormat.AVRO), + AvroTransformerTest.generateMockAvroData(100).toByteArray(), + new CommonConfig(new ConfigDef(), new HashMap<>()) { + }, 100)); + lst.add(Arguments.of(TransformerFactory.getTransformer(InputFormat.BYTES), + "Hello World".getBytes(StandardCharsets.UTF_8), new CommonConfig(new ConfigDef(), new HashMap<>()) { + }, 1)); + lst.add(Arguments.of(TransformerFactory.getTransformer(InputFormat.JSONL), + JsonTransformerTest.getJsonRecs(100).getBytes(StandardCharsets.UTF_8), + new CommonConfig(new ConfigDef(), new HashMap<>()) { + }, 100)); + lst.add(Arguments.of(TransformerFactory.getTransformer(InputFormat.PARQUET), + ParquetTransformerTest.generateMockParquetData(), new CommonConfig(new ConfigDef(), new HashMap<>()) { + }, 100)); + return lst.stream(); + } + + private static class CloseTrackingStream extends InputStream { + InputStream delegate; + int closeCount; + + CloseTrackingStream(final InputStream stream) { + super(); + this.delegate = stream; + } + + @Override + public int read() throws IOException { + if (closeCount > 0) { + throw new IOException("ERROR Read after close"); + } + return delegate.read(); + } + + @Override + public void close() throws IOException { + closeCount++; + delegate.close(); + } + } +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtilsTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtilsTest.java new file mode 100644 index 000000000..70ed07e7f --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/input/utils/FilePatternUtilsTest.java @@ -0,0 +1,63 @@ +/* + * Copyright 2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
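// Illustrative sketch (not part of this diff) of the skip-records argument used in the "SkipFew"
// tests above: passing 25 as the final argument to getRecords() skips the first 25 records of the
// object, so only the remaining 75 JSONL lines are emitted.
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.stream.Stream;

import org.apache.kafka.common.config.ConfigDef;
import org.apache.kafka.connect.data.SchemaAndValue;

import io.aiven.kafka.connect.common.config.CommonConfig;
import io.aiven.kafka.connect.common.source.input.InputFormat;
import io.aiven.kafka.connect.common.source.input.TransformerFactory;

final class SkipRecordsSketch {
    public static void main(final String[] args) {
        final StringBuilder jsonLines = new StringBuilder();
        for (int i = 0; i < 100; i++) {
            jsonLines.append(String.format("{\"key\":\"value%d\"}", i)).append("\n");
        }
        final byte[] data = jsonLines.toString().getBytes(StandardCharsets.UTF_8);
        final CommonConfig config = new CommonConfig(new ConfigDef(), new HashMap<>()) {
        };
        try (Stream<SchemaAndValue> records = TransformerFactory.getTransformer(InputFormat.JSONL)
                .getRecords(() -> new ByteArrayInputStream(data), "testtopic", 1, config, 25L)) {
            // 75 records remain after skipping the first 25.
            System.out.println(records.count());
        }
    }
}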
+ */ + +package io.aiven.kafka.connect.common.source.input.utils; + +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + +import java.util.Optional; + +import io.aiven.kafka.connect.common.source.task.Context; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; + +class FilePatternUtilsTest { + + @ParameterizedTest + @CsvSource({ "{{topic}}-1.txt, logs-1.txt, logs", "{{topic}}-{{partition}}.txt,logs-1.txt, logs", + "{{topic}}-{{partition}}.txt,logs2-1.txt, logs2", "{{topic}}-{{partition}}.txt, logs2-1.txt, logs2" }) + void checkTopicDistribution(final String expectedSourceFormat, final String sourceName, + final String expectedTopic) { + + final FilePatternUtils utils = new FilePatternUtils(expectedSourceFormat); + final Optional> ctx = utils.process(sourceName); + assertThat(ctx.isPresent()).isTrue(); + assertThat(ctx.get().getTopic().isPresent()).isTrue(); + assertThat(ctx.get().getTopic().get()).isEqualTo(expectedTopic); + } + + @ParameterizedTest + @CsvSource({ "{{topic}}-{{partition}}-{{start_offset}}.txt, logs2-1-0001.txt, logs2,1,0001", + "{{topic}}-{{start_offset}}-{{partition}}.txt, logs2-0001-1.txt, logs2,0001,1", + "{{topic}}-{{start_offset}}-{{partition}}.txt, logs2-99999-1.txt, logs2,1,99999", + "{{partition}}-{{start_offset}}-{{topic}}.txt, logs2-1-logs2.txt, logs2,2,0001", + "{{partition}}-{{start_offset}}-{{topic}}.txt, logs2-1-logs2.txt, logs2,2,0001", }) + void checkTopicDistribution(final String expectedSourceFormat, final String sourceName, final String expectedTopic, + final int expectedPartition, final int expectedOffset) { + + final FilePatternUtils utils = new FilePatternUtils(expectedSourceFormat); + final Optional> ctx = utils.process(sourceName); + assertThat(ctx.isPresent()).isTrue(); + assertThat(ctx.get().getTopic().isPresent()).isTrue(); + assertThat(ctx.get().getTopic().get()).isEqualTo(expectedTopic); + assertThat(ctx.get().getPartition().isPresent()).isTrue(); + assertThat(ctx.get().getPartition().get()).isEqualTo(expectedPartition); + assertThat(ctx.get().getOffset().isPresent()).isTrue(); + assertThat(ctx.get().getOffset().get()).isEqualTo(expectedOffset); + } + +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategyTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategyTest.java new file mode 100644 index 000000000..c76eb1ce7 --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashDistributionStrategyTest.java @@ -0,0 +1,149 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
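// Illustrative sketch (not part of this diff), assuming the FilePatternUtils API exercised above:
// a pattern built from {{topic}}, {{partition}} and {{start_offset}} placeholders extracts those
// values from an object name into an optional Context.
import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils;

final class FilePatternSketch {
    public static void main(final String[] args) {
        final FilePatternUtils utils = new FilePatternUtils("{{topic}}-{{partition}}-{{start_offset}}.txt");
        // Prints the extracted topic, partition and start offset for a matching object name.
        utils.process("logs-3-0001.txt").ifPresent(ctx -> System.out.println(
                ctx.getTopic().orElse(null) + " / " + ctx.getPartition().orElse(null) + " / "
                        + ctx.getOffset().orElse(null)));
    }
}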
+ */ + +package io.aiven.kafka.connect.common.source.task; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; + +final class HashDistributionStrategyTest { + final DistributionType strategy = DistributionType.OBJECT_HASH; + @ParameterizedTest + @CsvSource({ "logs-0-0002.txt", "logs-1-0002.txt", "logs-2-0002.txt", "logs-3-0002.txt", "logs-4-0002.txt", + "logs-5-0002.txt", "logs-6-0002.txt", "logs-7-0002.txt", "logs-8-0002.txt", "logs-9-0002.txt", + "logs-1-0002.txt", "logs-3-0002.txt", "logs-5-0002.txt", "value-6-0002.txt", "logs-7-0002.txt" }) + void hashDistributionExactlyOnce(final String path) { + final int maxTaskId = 10; + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTaskId); + final Context ctx = getContext("{{topic}}-{{partition}}-{{start_offset}}", path); + + final List results = new ArrayList<>(); + for (int taskId = 0; taskId < maxTaskId; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + } + + @ParameterizedTest + @CsvSource({ "logs-0-0002.txt", "logs-1-0002.txt", "logs-2-0002.txt", "logs-3-0002.txt", "logs-4-0002.txt", + "logs-5-0002.txt", "logs-6-0002.txt", "logs-7-0002.txt", "logs-8-0002.txt", "logs-9-0002.txt", + "logs-1-0002.txt", "logs-3-0002.txt", "logs-5-0002.txt", "value-6-0002.txt", "logs-7-0002.txt" }) + void hashDistributionExactlyOnceWithReconfigureEvent(final String path) { + int maxTasks = 10; + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final Context ctx = getContext("{{topic}}-{{partition}}-{{start_offset}}", path); + + final List results = new ArrayList<>(); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + results.clear(); + maxTasks = 5; + taskDistribution.configureDistributionStrategy(maxTasks); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + } + + @ParameterizedTest + @CsvSource({ "key-0.txt", "key-0002.txt", "key-0002.txt", "anImage8-0002.png", + "reallylongfilenamecreatedonS3tohisdesomedata and alsohassome spaces.txt" }) + void hashDistributionExactlyOnceWithReconfigureEventAndMatchAllExpectedSource(final String path) { + int maxTasks = 10; + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final Context ctx = getContext(".*", path); + + final List results = new ArrayList<>(); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + results.clear(); + maxTasks = 5; + taskDistribution.configureDistributionStrategy(maxTasks); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + } + + @ParameterizedTest + @CsvSource({ "-0", "-1", "-999", "-01", "-2002020" }) + void hashDistributionWithNegativeValues(final int hashCode) { + final int maxTasks = 10; 
+ final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final FilePatternUtils utils = new FilePatternUtils(".*"); + final Optional<Context<HashCodeKey>> ctx = utils.process(new HashCodeKey(hashCode)); + + assertThat(ctx).isPresent(); + final int result = taskDistribution.getTaskFor(ctx.get()); + + assertThat(result).isLessThan(maxTasks); + assertThat(result).isGreaterThanOrEqualTo(0); + + } + + private Context<String> getContext(final String expectedSourceName, final String filename) { + final FilePatternUtils utils = new FilePatternUtils(expectedSourceName); + final Optional<Context<String>> ctx = utils.process(filename); + assertThat(ctx.isPresent()).isTrue(); + // Hash distribution can have an empty context + return ctx.get(); + } + + static class HashCodeKey implements Comparable<HashCodeKey> { + private final int hashCodeValue; + public HashCodeKey(final int hashCodeValue) { + this.hashCodeValue = hashCodeValue; + } + + private int getHashCodeValue() { + return hashCodeValue; + } + + @Override + public boolean equals(final Object other) { + if (this == other) { + return true; + } + if (other == null || getClass() != other.getClass()) { + return false; + } + final HashCodeKey that = (HashCodeKey) other; + return hashCodeValue == that.hashCodeValue; + } + + @Override + public int hashCode() { + return hashCodeValue; + } + + @Override + public int compareTo(final HashCodeKey hashCodeKey) { + return Integer.compare(this.hashCodeValue, hashCodeKey.getHashCodeValue()); + } + } +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashObjectDistributionStrategyTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashObjectDistributionStrategyTest.java deleted file mode 100644 index 63a6a76f5..000000000 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/HashObjectDistributionStrategyTest.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
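// Illustrative sketch (not part of this diff), assuming the API exercised above: a DistributionType
// produces a DistributionStrategy for a given task count, and getTaskFor(context) maps every object
// with the same context to the same task id in the range [0, maxTasks).
import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils;
import io.aiven.kafka.connect.common.source.task.DistributionStrategy;
import io.aiven.kafka.connect.common.source.task.DistributionType;

final class DistributionSketch {
    public static void main(final String[] args) {
        final int maxTasks = 4;
        final DistributionStrategy distribution = DistributionType.OBJECT_HASH.getDistributionStrategy(maxTasks);
        final FilePatternUtils utils = new FilePatternUtils("{{topic}}-{{partition}}-{{start_offset}}");
        // The same object name always resolves to the same task id; which id depends on the hash.
        utils.process("logs-1-0002.txt").ifPresent(ctx -> System.out.println(distribution.getTaskFor(ctx)));
    }
}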
- */ - -package io.aiven.kafka.connect.common.source.task; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.ArrayList; -import java.util.List; - -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.CsvSource; - -final class HashObjectDistributionStrategyTest { - - @ParameterizedTest - @CsvSource({ "logs-0-0002.txt", "logs-1-0002.txt", "logs-2-0002.txt", "logs-3-0002.txt", "logs-4-0002.txt", - "logs-5-0002.txt", "logs-6-0002.txt", "logs-7-0002.txt", "logs-8-0002.txt", "logs-9-0002.txt", "key-0.txt", - "logs-1-0002.txt", "key-0002.txt", "logs-3-0002.txt", "key-0002.txt", "logs-5-0002.txt", "value-6-0002.txt", - "logs-7-0002.txt", "anImage8-0002.png", - "reallylongfilenamecreatedonS3tohisdesomedata and alsohassome spaces.txt" }) - void hashDistributionExactlyOnce(final String path) { - final int maxTaskId = 10; - final ObjectDistributionStrategy taskDistribution = new HashObjectDistributionStrategy(maxTaskId); - final List results = new ArrayList<>(); - for (int taskId = 0; taskId < maxTaskId; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); - } - - @ParameterizedTest - @CsvSource({ "logs-0-0002.txt", "logs-1-0002.txt", "logs-2-0002.txt", "logs-3-0002.txt", "logs-4-0002.txt", - "logs-5-0002.txt", "logs-6-0002.txt", "logs-7-0002.txt", "logs-8-0002.txt", "logs-9-0002.txt", "key-0.txt", - "logs-1-0002.txt", "key-0002.txt", "logs-3-0002.txt", "key-0002.txt", "logs-5-0002.txt", "value-6-0002.txt", - "logs-7-0002.txt", "anImage8-0002.png", - "reallylongfilenamecreatedonS3tohisdesomedata and alsohassome spaces.txt" }) - void hashDistributionExactlyOnceWithReconfigureEvent(final String path) { - int maxTasks = 10; - final ObjectDistributionStrategy taskDistribution = new HashObjectDistributionStrategy(maxTasks); - final List results = new ArrayList<>(); - for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); - results.clear(); - maxTasks = 5; - taskDistribution.reconfigureDistributionStrategy(maxTasks, null); - for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE); - } -} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategyTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategyTest.java new file mode 100644 index 000000000..f5a46c0b5 --- /dev/null +++ b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionDistributionStrategyTest.java @@ -0,0 +1,228 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.common.source.task; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; + +final class PartitionDistributionStrategyTest { + final DistributionType strategy = DistributionType.PARTITION; + @Test + void partitionInFileNameDefaultAivenS3Sink() { + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(2); + final Context ctx = getContext("{{topic}}-{{partition}}-{{start_offset}}", "logs-1-00112.gz"); + assertThat(taskDistribution.getTaskFor(ctx)).isEqualTo(1); + } + + @ParameterizedTest(name = "[{index}] Pattern: {0}, Filename: {1}") + @CsvSource({ "{{topic}}-{{partition}}-{{start_offset}},logs-0-00112.gz", + "{{topic}}-2024-{{timestamp}}-{{partition}}-{{start_offset}},logs-2024-20220201-0-00112.gz", + "{{topic}}-2023-{{partition}}-{{start_offset}},logs-2023-0-00112.gz", + "logs-2023-{{partition}}-{{start_offset}},logs-2023-0-00112.gz", + "{{topic}}-{{timestamp}}-{{timestamp}}-{{timestamp}}-{{partition}}-{{start_offset}},logs1-2022-10-02-10-00112.gz", + "{{topic}}{{partition}}-{{start_offset}},89521-00112.gz", + "{{topic}}-{{partition}},Emergency-TEST1-00112.gz", + "Emergency-TEST1-{{partition}},Emergency-TEST1-00112.gz", + "{{topic}}-{{partition}}-{{start_offset}},PROD-logs-1-00112.gz", + "{{topic}}-{{partition}},DEV_team_1-00112.gz", + "{{topic}}-{{partition}}-{{start_offset}},timeseries-1-00112.gz" }) + void testPartitionFileNamesAndExpectedOutcomes(final String configuredFilenamePattern, final String filename) { + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(1); + // This test is testing the filename matching not the task allocation. + final Context ctx = getContext(configuredFilenamePattern, filename); + assertThat(taskDistribution.getTaskFor(ctx)).isEqualTo(0); + } + + @ParameterizedTest(name = "[{index}] Pattern: {0}, Filename: {1}") + @CsvSource({ "different-topic-{{partition}}-{{start_offset}},logs-1-00112.gz", + "no-seperator-in-date-partition-offset-{{timestamp}}-{{partition}}-{{start_offset}},no-seperator-in-date-partition-offset-202420220201100112.gz", + "logs-2024-{{timestamp}}-{{partition}}-{{start_offset}},logs-20201-1-00112.gz", + "logs-2024-{{timestamp}}{{partition}}-{{start_offset}},logs-202011-00112.gz", + "logs-2023-{{partition}}-{{start_offset}},logs-2023-one-00112.gz" }) + void expectFalseOnMalformedFilenames(final String configuredFilenamePattern, final String filename) { + // This test is testing the filename matching not the task allocation. 
+ final Optional> ctx = getOptionalContext(configuredFilenamePattern, filename); + assertThat(ctx).isEmpty(); + } + + @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1}, Filename: {1}") + @CsvSource({ "0,10,topics/logs/0/logs-0-0002.txt", "1,10,topics/logs/1/logs-1-0002.txt", + "2,10,topics/logs/2/logs-2-0002.txt", "3,10,topics/logs/3/logs-3-0002.txt", + "4,10,topics/logs/4/logs-4-0002.txt", "5,10,topics/logs/5/logs-5-0002.txt", + "6,10,topics/logs/6/logs-6-0002.txt", "7,10,topics/logs/7/logs-7-0002.txt", + "8,10,topics/logs/8/logs-8-0002.txt", "9,10,topics/logs/9/logs-9-0002.txt" }) + void checkCorrectDistributionAcrossTasksOnFileName(final int taskId, final int maxTasks, final String path) { + + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final Context ctx = getContext("logs-{{partition}}-{{start_offset}}", path); + assertThat(taskDistribution.getTaskFor(ctx)).isEqualTo(taskId); + } + + @ParameterizedTest(name = "[{index}] MaxTasks: {0}, Filename: {1}") + @CsvSource({ "10,topics/logs/0/logs-0002.txt", "10,topics/logs/1/logs-001.txt", "10,topics/logs/2/logs-0002.txt", + "10,topics/logs/3/logs-0002.txt", "10,topics/logs/4/logs-0002.txt", "10,topics/logs/5/logs-0002.txt", + "10,topics/logs/6/logs-0002.txt", "10,topics/logs/7/logs-0002.txt", "10,topics/logs/8/logs-0002.txt", + "10,topics/logs/9/logs-0002.txt" }) + void filenameDistributionExactlyOnceDistribution(final int maxTasks, final String path) { + + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final List results = new ArrayList<>(); + final Context ctx = getContext("logs-{{partition}}.txt", path); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + // TODO Double check this, they should all match the first task. 
+ assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + } + + @ParameterizedTest(name = "[{index}] MaxTasks: {0}, TaskId: {1}, Filename: {2}") + @CsvSource({ "10,5,topics/logs/0/logs-0002.txt", "10,5,topics/logs/1/logs-001.txt", + "10,5,topics/logs/2/logs-0002.txt", "10,5,topics/logs/3/logs-0002.txt", "10,5,topics/logs/4/logs-0002.txt", + "10,5,topics/logs/5/logs-0002.txt", "10,5,topics/logs/6/logs-0002.txt", "10,5,topics/logs/7/logs-0002.txt", + "10,5,topics/logs/8/logs-0002.txt", "10,5,topics/logs/9/logs-0002.txt" }) + void filenameDistributionExactlyOnceDistributionWithTaskReconfiguration(final int maxTasks, + final int maxTaskAfterReConfig, final String path) { + + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final Context ctx = getContext("logs-{{partition}}.txt", path); + + final List results = new ArrayList<>(); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + taskDistribution.configureDistributionStrategy(maxTaskAfterReConfig); + + results.clear(); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + } + + @ParameterizedTest + @CsvSource({ "10,5,topics/logs/0/logs-0002.txt", "10,5,topics/logs/1/logs-001.txt", + "10,5,topics/logs/2/logs-0002.txt", "10,5,topics/logs/3/logs-0002.txt", "10,5,topics/logs/4/logs-0002.txt", + "10,5,topics/logs/5/logs-0002.txt", "10,5,topics/logs/6/logs-0002.txt", "10,5,topics/logs/7/logs-0002.txt", + "10,5,topics/logs/8/logs-0002.txt", "10,5,topics/logs/9/logs-0002.txt" }) + void partitionPathDistributionExactlyOnceDistributionWithTaskReconfiguration(final int maxTasks, + final int maxTaskAfterReConfig, final String path) { + + final String expectedSourceNameFormat = "topics/{{topic}}/{{partition}}/.*$"; + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final Context ctx = getContext(expectedSourceNameFormat, path); + final List results = new ArrayList<>(); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + taskDistribution.configureDistributionStrategy(maxTaskAfterReConfig); + + results.clear(); + for (int taskId = 0; taskId < maxTaskAfterReConfig; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == taskDistribution.getTaskFor(ctx)); + } + + @ParameterizedTest + @CsvSource({ "10,topics/logs/0/logs-0002.txt", "10,topics/logs/1/logs-001.log", "10,topics/logs/2/logs-0002.txt", + "10,topics/logs/3/logs-0002.txt", "10,topics/logs/4/logs-0002.txt", "10,topics/logs/5/logs-0002.txt", + "10,topics/logs/6/logs-0002.txt", "10,topics/logs/7/logs-0002.txt", "10,topics/logs/8/logs-0002.txt", + "10,topics/logs/9/logs-0002.txt" }) + void partitionPathDistributionExactlyOnceDistribution(final int maxTasks, final String path) { + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final List results = new ArrayList<>(); + final Context ctx = getContext("topics/{{topic}}/{{partition}}/.*$", path); + for (int taskId = 0; taskId < maxTasks; taskId++) { + results.add(taskDistribution.getTaskFor(ctx)); + } + assertThat(results).allMatch(i -> i == 
taskDistribution.getTaskFor(ctx)); + } + + @Test + void expectEmptyContextOnNonIntPartitionSuppliedAsNoMatchOccurs() { + final String path = "topics/logs/one/test-001.txt"; + final Optional> ctx = getOptionalContext("topics/{{topic}}/{{partition}}/.*$", path); + assertThat(ctx).isEmpty(); + } + + @ParameterizedTest(name = "[{index}] Filename: {2}") + @CsvSource({ "topcs/logs/0/logs-0002.txt", "topics/logs/1", "S3/logs/2/logs-0002.txt", + "topicss/log/3/logs-0002.txt", "prod/logs/4/logs-0002.txt", "misspelt/logs/5/logs-0002.txt", + "test/logs/6/logs-0002.txt", "random/logs/7/logs-0002.txt", "DEV/logs/8/logs-0002.txt", + "poll/logs/9/logs-0002.txt" }) + void expectNoMatchOnUnconfiguredPaths(final String path) { + final Optional> ctx = getOptionalContext("topics/{{topic}}/{{partition}}/.*$", path); + assertThat(ctx).isEmpty(); + } + @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") + @CsvSource({ "0,10,topics/logs/0/logs-0002.txt", "1,10,topics/logs/1/logs-0002.txt", + "2,10,topics/logs/2/logs-0002.txt", "3,10,topics/logs/3/logs-0002.txt", "4,10,topics/logs/4/logs-0002.txt", + "5,10,topics/logs/5/logs-0002.txt", "6,10,topics/logs/6/logs-0002.txt", "7,10,topics/logs/7/logs-0002.txt", + "8,10,topics/logs/8/logs-0002.txt", "9,10,topics/logs/9/logs-0002.txt" }) + void checkCorrectDistributionAcrossTasks(final int taskId, final int maxTaskId, final String path) { + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTaskId); + final Context ctx = getContext("topics/{{topic}}/{{partition}}/.*$", path); + assertThat(taskDistribution.getTaskFor(ctx)).isEqualTo(taskId); + } + + @ParameterizedTest(name = "[{index}] MaxTasks: {1} Filename: {2}") + @CsvSource({ "1,bucket/topics/topic-1/5/logs+5+0002.txt,0", "4,bucket/topics/topic-1/5/logs+5+0002.txt,1", + "4,bucket/topics/topic-1/5/logs+5+0002.txt,1", "3,bucket/topics/topic-1/5/logs+5+0002.txt,2", + "5,bucket/topics/topic-1/5/logs+5+0002.txt,0", "3,bucket/topics/topic-1/5/logs+5+0002.txt,2" }) + void partitionInPathConvention(final int maxTaskId, final String path, final int expectedResult) { + + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTaskId); + final Context ctx = getContext("bucket/topics/{{topic}}/{{partition}}/.*$", path); + assertThat(taskDistribution.getTaskFor(ctx)).isEqualTo(expectedResult); + } + + @ParameterizedTest(name = "[{index}] MaxTasks: {1} Filename: {2}") + @CsvSource({ "1,topics/logs/partition=5/logs+5+0002.txt,0", "4,topics/logs/partition=5/logs+5+0002.txt,1", + "4,topics/logs/partition=5/logs+5+0002.txt,1", "3,topics/logs/partition=5/logs+5+0002.txt,2", + "5,topics/logs/partition=5/logs+5+0002.txt,0", "3,topics/logs/partition=5/logs+5+0002.txt,2" }) + void withLeadingStringPartitionNamingConvention(final int maxTasks, final String path, final int expectedResult) { + + final DistributionStrategy taskDistribution = strategy.getDistributionStrategy(maxTasks); + final Context ctx = getContext("topics/{{topic}}/partition={{partition}}/.*$", path); + + assertThat(taskDistribution.getTaskFor(ctx)).isEqualTo(expectedResult); + } + + public static Context getContext(final String configuredFilenamePattern, final String filename) { + final Optional> ctx = getOptionalContext(configuredFilenamePattern, filename); + assertThat(ctx.isPresent()).isTrue(); + return ctx.get(); + } + + public static Optional> getOptionalContext(final String configuredFilenamePattern, + final String filename) { + final FilePatternUtils utils = new 
FilePatternUtils(configuredFilenamePattern); + return utils.process(filename); + } + +} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategyTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategyTest.java deleted file mode 100644 index f1993ecba..000000000 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInFilenameDistributionStrategyTest.java +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.aiven.kafka.connect.common.source.task; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.kafka.common.config.ConfigException; - -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.CsvSource; - -final class PartitionInFilenameDistributionStrategyTest { - - @Test - void partitionInFileNameDefaultAivenS3Sink() { - final ObjectDistributionStrategy taskDistribution = new PartitionInFilenameDistributionStrategy(2, - "logs-\\{\\{partition}}-\\{\\{start_offset}}"); - assertThat(taskDistribution.isPartOfTask(1, "logs-1-00112.gz")).isTrue(); - } - - @Test - void partitionLocationNotSetExpectException() { - assertThatThrownBy(() -> new PartitionInFilenameDistributionStrategy(2, "logs-23--")) - .isInstanceOf(ConfigException.class) - .hasMessage( - "Source name format logs-23-- missing partition pattern {{partition}}, please configure the expected source to include the partition pattern."); - - } - - @ParameterizedTest(name = "[{index}] Pattern: {0}, Filename: {1}") - @CsvSource({ "logs-\\{\\{partition}}-\\{\\{start_offset}},logs-0-00112.gz", - "logs-2024-\\{\\{timestamp}}-\\{\\{partition}}-\\{\\{start_offset}},logs-2024-20220201-0-00112.gz", - "logs-2023-\\{\\{partition}}-\\{\\{start_offset}},logs-2023-0-00112.gz", - "logs1-\\{\\{timestamp}}-\\{\\{timestamp}}-\\{\\{timestamp}}-\\{\\{partition}}-\\{\\{start_offset}},logs1-2022-10-02-10-00112.gz", - "8952\\{\\{partition}}-\\{\\{start_offset}},89521-00112.gz", - "Emergency-TEST\\{\\{partition}}-\\{\\{start_offset}},Emergency-TEST1-00112.gz", - "PROD-logs-\\{\\{partition}}-\\{\\{start_offset}},PROD-logs-1-00112.gz", - "DEV_team_\\{\\{partition}}-\\{\\{start_offset}},DEV_team_1-00112.gz", - "timeseries-\\{\\{partition}}-\\{\\{start_offset}},timeseries-1-00112.gz" }) - void testPartitionFileNamesAndExpectedOutcomes(final String configuredFilenamePattern, final String filename) { - final ObjectDistributionStrategy taskDistribution = new PartitionInFilenameDistributionStrategy(1, - configuredFilenamePattern); - // This test is testing the filename matching not the task allocation. 
- assertThat(taskDistribution.isPartOfTask(0, filename)).isTrue(); - } - - @ParameterizedTest(name = "[{index}] Pattern: {0}, Filename: {1}") - @CsvSource({ "different-topic-\\{\\{partition}}-\\{\\{start_offset}},logs-1-00112.gz", - "no-seperator-in-date-partition-offset-\\{\\{timestamp}}-\\{\\{partition}}-\\{\\{start_offset}},no-seperator-in-date-partition-offset-202420220201100112.gz", - "logs-2024-\\{\\{timestamp}}-\\{\\{partition}}-\\{\\{start_offset}},logs-20201-1-00112.gz", - "logs-2024-\\{\\{timestamp}}\\{\\{partition}}-\\{\\{start_offset}},logs-202011-00112.gz", - "logs-2024-\\{\\{timestamp}}\\{\\{partition}}-\\{\\{start_offset}}, ", - "logs-2023-\\{\\{partition}}-\\{\\{start_offset}},logs-2023-one-00112.gz" }) - void expectFalseOnMalformedFilenames(final String configuredFilenamePattern, final String filename) { - final ObjectDistributionStrategy taskDistribution = new PartitionInFilenameDistributionStrategy(1, - configuredFilenamePattern); - // This test is testing the filename matching not the task allocation. - assertThat(taskDistribution.isPartOfTask(0, filename)).isFalse(); - } - - @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1}, Filename: {1}") - @CsvSource({ "0,10,topics/logs/0/logs-0-0002.txt", "1,10,topics/logs/1/logs-1-0002.txt", - "2,10,topics/logs/2/logs-2-0002.txt", "3,10,topics/logs/3/logs-3-0002.txt", - "4,10,topics/logs/4/logs-4-0002.txt", "5,10,topics/logs/5/logs-5-0002.txt", - "6,10,topics/logs/6/logs-6-0002.txt", "7,10,topics/logs/7/logs-7-0002.txt", - "8,10,topics/logs/8/logs-8-0002.txt", "9,10,topics/logs/9/logs-9-0002.txt" }) - void checkCorrectDistributionAcrossTasks(final int taskId, final int maxTasks, final String path) { - - final ObjectDistributionStrategy taskDistribution = new PartitionInFilenameDistributionStrategy(maxTasks, - "logs-\\{\\{partition}}-\\{\\{start_offset}}"); - - assertThat(taskDistribution.isPartOfTask(taskId, path)).isTrue(); - } - - @ParameterizedTest(name = "[{index}] MaxTasks: {0}, Filename: {1}") - @CsvSource({ "10,topics/logs/0/logs-0002.txt", "10,topics/logs/1/logs-001.txt", "10,topics/logs/2/logs-0002.txt", - "10,topics/logs/3/logs-0002.txt", "10,topics/logs/4/logs-0002.txt", "10,topics/logs/5/logs-0002.txt", - "10,topics/logs/6/logs-0002.txt", "10,topics/logs/7/logs-0002.txt", "10,topics/logs/8/logs-0002.txt", - "10,topics/logs/9/logs-0002.txt" }) - void filenameDistributionExactlyOnceDistribution(final int maxTasks, final String path) { - - final ObjectDistributionStrategy taskDistribution = new PartitionInFilenameDistributionStrategy(maxTasks, - "logs-\\{\\{partition}}.txt"); - final List results = new ArrayList<>(); - for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); - } - - @ParameterizedTest(name = "[{index}] MaxTasks: {0}, TaskId: {1}, Filename: {2}") - @CsvSource({ "10,5,topics/logs/0/logs-0002.txt", "10,5,topics/logs/1/logs-001.txt", - "10,5,topics/logs/2/logs-0002.txt", "10,5,topics/logs/3/logs-0002.txt", "10,5,topics/logs/4/logs-0002.txt", - "10,5,topics/logs/5/logs-0002.txt", "10,5,topics/logs/6/logs-0002.txt", "10,5,topics/logs/7/logs-0002.txt", - "10,5,topics/logs/8/logs-0002.txt", "10,5,topics/logs/9/logs-0002.txt" }) - void filenameDistributionExactlyOnceDistributionWithTaskReconfiguration(final int maxTasks, - final int 
maxTaskAfterReConfig, final String path) { - - final String expectedSourceNameFormat = "logs-\\{\\{partition}}.txt"; - final ObjectDistributionStrategy taskDistribution = new PartitionInFilenameDistributionStrategy(maxTasks, - expectedSourceNameFormat); - final List results = new ArrayList<>(); - for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); - taskDistribution.reconfigureDistributionStrategy(maxTaskAfterReConfig, expectedSourceNameFormat); - - results.clear(); - for (int taskId = 0; taskId < maxTaskAfterReConfig; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE); - } - - @ParameterizedTest - @CsvSource({ - "logs-{{partition}}.txt,'Source name format logs-{{partition}}.txt missing partition pattern {{partition}}, please configure the expected source to include the partition pattern.'", - " ,'Source name format null missing partition pattern {{partition}}, please configure the expected source to include the partition pattern.'", - "empty-pattern,'Source name format empty-pattern missing partition pattern {{partition}}, please configure the expected source to include the partition pattern.'" }) - void malformedFilenameSetup(final String expectedSourceFormat, final String expectedErrorMessage) { - final int maxTaskId = 1; - - assertThatThrownBy(() -> new PartitionInFilenameDistributionStrategy(maxTaskId, expectedSourceFormat)) - .isInstanceOf(ConfigException.class) - .hasMessage(expectedErrorMessage); - } - - @Test - void errorExpectedNullGivenForSourceNameFormat() { - final int maxTaskId = 1; - - assertThatThrownBy(() -> new PartitionInFilenameDistributionStrategy(maxTaskId, null)) - .isInstanceOf(ConfigException.class) - .hasMessage( - "Source name format null missing partition pattern {{partition}}, please configure the expected source to include the partition pattern."); - } - -} diff --git a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategyTest.java b/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategyTest.java deleted file mode 100644 index 4c2a6fede..000000000 --- a/commons/src/test/java/io/aiven/kafka/connect/common/source/task/PartitionInPathDistributionStrategyTest.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright 2024 Aiven Oy - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.aiven.kafka.connect.common.source.task; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.kafka.common.config.ConfigException; -import org.apache.kafka.connect.errors.ConnectException; - -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.CsvSource; - -final class PartitionInPathDistributionStrategyTest { - - @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") - @CsvSource({ "0,1,topics/logs/partition=5/logs+5+0002.txt,true", - "0,4,topics/logs/partition=5/logs+5+0002.txt,false", "1,4,topics/logs/partition=5/logs+5+0002.txt,true", - "0,3,topics/logs/partition=5/logs+5+0002.txt,false", "0,5,topics/logs/partition=5/logs+5+0002.txt,true", - "2,3,topics/logs/partition=5/logs+5+0002.txt,true" }) - void withLeadingStringPartitionNamingConvention(final int taskId, final int maxTasks, final String path, - final boolean expectedResult) { - - final PartitionInPathDistributionStrategy taskDistribution = new PartitionInPathDistributionStrategy(maxTasks, - "topics/logs/partition=\\{\\{partition}}/"); - - assertThat(taskDistribution.isPartOfTask(taskId, path)).isEqualTo(expectedResult); - } - - @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") - @CsvSource({ "0,1,bucket/topics/topic-1/5/logs+5+0002.txt,true", - "0,4,bucket/topics/topic-1/5/logs+5+0002.txt,false", "1,4,bucket/topics/topic-1/5/logs+5+0002.txt,true", - "0,3,bucket/topics/topic-1/5/logs+5+0002.txt,false", "0,5,bucket/topics/topic-1/5/logs+5+0002.txt,true", - "2,3,bucket/topics/topic-1/5/logs+5+0002.txt,true" }) - void partitionInPathConvention(final int taskId, final int maxTaskId, final String path, - final boolean expectedResult) { - - final PartitionInPathDistributionStrategy taskDistribution = new PartitionInPathDistributionStrategy(maxTaskId, - "bucket/topics/topic-1/\\{\\{partition}}/"); - - assertThat(taskDistribution.isPartOfTask(taskId, path)).isEqualTo(expectedResult); - } - - @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") - @CsvSource({ "0,10,topics/logs/0/logs-0002.txt", "1,10,topics/logs/1/logs-0002.txt", - "2,10,topics/logs/2/logs-0002.txt", "3,10,topics/logs/3/logs-0002.txt", "4,10,topics/logs/4/logs-0002.txt", - "5,10,topics/logs/5/logs-0002.txt", "6,10,topics/logs/6/logs-0002.txt", "7,10,topics/logs/7/logs-0002.txt", - "8,10,topics/logs/8/logs-0002.txt", "9,10,topics/logs/9/logs-0002.txt" }) - void checkCorrectDistributionAcrossTasks(final int taskId, final int maxTaskId, final String path) { - - final PartitionInPathDistributionStrategy taskDistribution = new PartitionInPathDistributionStrategy(maxTaskId, - "topics/logs/\\{\\{partition}}/"); - - assertThat(taskDistribution.isPartOfTask(taskId, path)).isTrue(); - } - - @ParameterizedTest(name = "[{index}] TaskId: {0}, MaxTasks: {1} Filename: {2}") - @CsvSource({ "1,10,topcs/logs/0/logs-0002.txt", "2,10,topics/logs/1", "3,10,S3/logs/2/logs-0002.txt", - "4,10,topics/log/3/logs-0002.txt", "5,10,prod/logs/4/logs-0002.txt", "6,10,misspelt/logs/5/logs-0002.txt", - "7,10,test/logs/6/logs-0002.txt", "8,10,random/logs/7/logs-0002.txt", "9,10,DEV/logs/8/logs-0002.txt", - "10,10,poll/logs/9/logs-0002.txt" }) - void expectNoMatchOnUnconfiguredPaths(final int taskId, final int maxTaskId, final String path) { - - final 
PartitionInPathDistributionStrategy taskDistribution = new PartitionInPathDistributionStrategy(maxTaskId, - "topics/logs/\\{\\{partition}}/"); - - assertThat(taskDistribution.isPartOfTask(taskId, path)).isFalse(); - } - - @Test - void expectExceptionOnNonIntPartitionSupplied() { - final int taskId = 1; - final int maxTaskId = 1; - final String path = "topics/logs/one/test-001.txt"; - - final PartitionInPathDistributionStrategy taskDistribution = new PartitionInPathDistributionStrategy(maxTaskId, - "topics/logs/\\{\\{partition}}/"); - assertThatThrownBy(() -> taskDistribution.isPartOfTask(taskId, path)).isInstanceOf(ConnectException.class) - .hasMessage( - "Unexpected non integer value found parsing path for partitionId: topics/logs/one/test-001.txt"); - } - - @Test - void malformedRegexSetup() { - final int maxTaskId = 1; - - assertThatThrownBy(() -> new PartitionInPathDistributionStrategy(maxTaskId, "topics/logs/{{partition}}/")) - .isInstanceOf(ConfigException.class) - .hasMessage( - "Expected path format topics/logs/{{partition}}/ is missing the identifier '\\{\\{partition}}' to correctly select the partition"); - } - - @ParameterizedTest - @CsvSource({ - ",Expected path format null is missing the identifier '\\{\\{partition}}' to correctly select the partition", - "@adsfs,Expected path format @adsfs is missing the identifier '\\{\\{partition}}' to correctly select the partition", - "empty-path,Expected path format empty-path is missing the identifier '\\{\\{partition}}' to correctly select the partition" }) - void malformedPathSetup(final String expectedPathFormat, final String expectedErrorMessage) { - final int maxTaskId = 1; - - assertThatThrownBy(() -> new PartitionInPathDistributionStrategy(maxTaskId, expectedPathFormat)) - .isInstanceOf(ConfigException.class) - .hasMessage(expectedErrorMessage); - } - - @ParameterizedTest - @CsvSource({ "10,topics/logs/0/logs-0002.txt", "10,topics/logs/1/logs-001.log", "10,topics/logs/2/logs-0002.txt", - "10,topics/logs/3/logs-0002.txt", "10,topics/logs/4/logs-0002.txt", "10,topics/logs/5/logs-0002.txt", - "10,topics/logs/6/logs-0002.txt", "10,topics/logs/7/logs-0002.txt", "10,topics/logs/8/logs-0002.txt", - "10,topics/logs/9/logs-0002.txt" }) - void partitionPathDistributionExactlyOnceDistribution(final int maxTasks, final String path) { - - final ObjectDistributionStrategy taskDistribution = new PartitionInPathDistributionStrategy(maxTasks, - "topics/logs/\\{\\{partition}}"); - final List results = new ArrayList<>(); - for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); - } - - @ParameterizedTest - @CsvSource({ "10,5,topics/logs/0/logs-0002.txt", "10,5,topics/logs/1/logs-001.txt", - "10,5,topics/logs/2/logs-0002.txt", "10,5,topics/logs/3/logs-0002.txt", "10,5,topics/logs/4/logs-0002.txt", - "10,5,topics/logs/5/logs-0002.txt", "10,5,topics/logs/6/logs-0002.txt", "10,5,topics/logs/7/logs-0002.txt", - "10,5,topics/logs/8/logs-0002.txt", "10,5,topics/logs/9/logs-0002.txt" }) - void partitionPathDistributionExactlyOnceDistributionWithTaskReconfiguration(final int maxTasks, - final int maxTaskAfterReConfig, final String path) { - - final String expectedSourceNameFormat = "topics/logs/\\{\\{partition}}"; - final ObjectDistributionStrategy taskDistribution = new 
PartitionInPathDistributionStrategy(maxTasks, - expectedSourceNameFormat); - final List results = new ArrayList<>(); - for (int taskId = 0; taskId < maxTasks; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE); - taskDistribution.reconfigureDistributionStrategy(maxTaskAfterReConfig, expectedSourceNameFormat); - - results.clear(); - for (int taskId = 0; taskId < maxTaskAfterReConfig; taskId++) { - results.add(taskDistribution.isPartOfTask(taskId, path)); - } - assertThat(results).containsExactlyInAnyOrder(Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, - Boolean.FALSE); - } - -} diff --git a/gcs-sink-connector/build.gradle.kts b/gcs-sink-connector/build.gradle.kts index 2c33f4c67..4af195ba7 100644 --- a/gcs-sink-connector/build.gradle.kts +++ b/gcs-sink-connector/build.gradle.kts @@ -98,7 +98,7 @@ dependencies { testImplementation(apache.kafka.connect.api) testImplementation(apache.kafka.connect.runtime) testImplementation(apache.kafka.connect.json) - testImplementation("com.google.cloud:google-cloud-nio:0.127.26") + testImplementation("com.google.cloud:google-cloud-nio:0.127.27") testImplementation(compressionlibs.snappy) testImplementation(compressionlibs.zstd.jni) diff --git a/gradle-config/aiven-pmd-test-ruleset.xml b/gradle-config/aiven-pmd-test-ruleset.xml index 0cc9ca531..65267db4b 100644 --- a/gradle-config/aiven-pmd-test-ruleset.xml +++ b/gradle-config/aiven-pmd-test-ruleset.xml @@ -78,7 +78,7 @@ - + diff --git a/gradle-config/spotbugs-exclude.xml b/gradle-config/spotbugs-exclude.xml index 3f007d6b8..48e98ab56 100644 --- a/gradle-config/spotbugs-exclude.xml +++ b/gradle-config/spotbugs-exclude.xml @@ -19,7 +19,10 @@ - + + + + diff --git a/s3-commons/build.gradle.kts b/s3-commons/build.gradle.kts index 0e3d825aa..5e54c05ef 100644 --- a/s3-commons/build.gradle.kts +++ b/s3-commons/build.gradle.kts @@ -18,10 +18,13 @@ plugins { id("aiven-apache-kafka-connectors-all.java-conventions") } val amazonS3Version by extra("1.12.777") val amazonSTSVersion by extra("1.12.777") +val amazonV2Version by extra("2.29.34") dependencies { implementation("com.amazonaws:aws-java-sdk-s3:$amazonS3Version") implementation("com.amazonaws:aws-java-sdk-sts:$amazonSTSVersion") + implementation("software.amazon.awssdk:auth:$amazonV2Version") + implementation("software.amazon.awssdk:sts:$amazonV2Version") implementation(project(":commons")) diff --git a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java b/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java index 4371f4658..2ece623bf 100644 --- a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java +++ b/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3ConfigFragment.java @@ -41,11 +41,13 @@ import com.amazonaws.services.s3.internal.BucketNameUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider; /** * The configuration fragment that defines the S3 specific characteristics. 
*/ -@SuppressWarnings({ "PMD.TooManyMethods", "PMD.ExcessiveImports", "PMD.TooManyStaticImports" }) +@SuppressWarnings({ "PMD.TooManyMethods", "PMD.ExcessiveImports", "PMD.TooManyStaticImports", "PMD.GodClass" }) public final class S3ConfigFragment extends ConfigFragment { private static final Logger LOGGER = LoggerFactory.getLogger(S3ConfigFragment.class); @@ -110,6 +112,8 @@ public final class S3ConfigFragment extends ConfigFragment { public static final String AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_CONFIG = "aws.s3.backoff.max.delay.ms"; public static final String AWS_S3_RETRY_BACKOFF_MAX_RETRIES_CONFIG = "aws.s3.backoff.max.retries"; + public static final String FETCH_PAGE_SIZE = "aws.s3.fetch.page.size"; + private static final String GROUP_AWS = "AWS"; private static final String GROUP_AWS_STS = "AWS STS"; @@ -211,9 +215,17 @@ static void addAwsConfigGroup(final ConfigDef configDef) { awsGroupCounter++, ConfigDef.Width.NONE, AWS_S3_ENDPOINT_CONFIG); configDef.define(AWS_S3_REGION_CONFIG, ConfigDef.Type.STRING, null, new AwsRegionValidator(), - ConfigDef.Importance.MEDIUM, "AWS S3 Region, e.g. us-east-1", GROUP_AWS, awsGroupCounter++, // NOPMD - // UnusedAssignment + ConfigDef.Importance.MEDIUM, "AWS S3 Region, e.g. us-east-1", GROUP_AWS, awsGroupCounter++, ConfigDef.Width.NONE, AWS_S3_REGION_CONFIG); + + configDef.define(AWS_S3_PREFIX_CONFIG, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), + ConfigDef.Importance.MEDIUM, "Prefix for stored objects, e.g. cluster-1/", GROUP_AWS, awsGroupCounter++, + ConfigDef.Width.NONE, AWS_S3_PREFIX_CONFIG); + + configDef.define(FETCH_PAGE_SIZE, ConfigDef.Type.INT, 10, ConfigDef.Range.atLeast(1), + ConfigDef.Importance.MEDIUM, "AWS S3 Fetch page size", GROUP_AWS, awsGroupCounter++, // NOPMD + // UnusedAssignment + ConfigDef.Width.NONE, FETCH_PAGE_SIZE); } static void addAwsStsConfigGroup(final ConfigDef configDef) { @@ -246,10 +258,6 @@ static void addAwsStsConfigGroup(final ConfigDef configDef) { } static void addDeprecatedConfiguration(final ConfigDef configDef) { - configDef.define(AWS_S3_PREFIX_CONFIG, ConfigDef.Type.STRING, null, new ConfigDef.NonEmptyString(), - ConfigDef.Importance.MEDIUM, - "[Deprecated] Use `file.name.template` instead. Prefix for stored objects, e.g. 
cluster-1/", GROUP_AWS, - 0, ConfigDef.Width.NONE, AWS_S3_PREFIX_CONFIG); configDef.define(AWS_ACCESS_KEY_ID, ConfigDef.Type.PASSWORD, null, new NonEmptyPassword() { @Override @@ -339,7 +347,8 @@ public void validateCredentials() { } } else { final BasicAWSCredentials awsCredentials = getAwsCredentials(); - if (awsCredentials == null) { + final AwsBasicCredentials awsCredentialsV2 = getAwsCredentialsV2(); + if (awsCredentials == null && awsCredentialsV2 == null) { LOGGER.info( "Connector use {} as credential Provider, " + "when configuration for {{}, {}} OR {{}, {}} are absent", @@ -404,11 +413,13 @@ public AwsStsEndpointConfig getStsEndpointConfig() { return new AwsStsEndpointConfig(cfg.getString(AWS_STS_CONFIG_ENDPOINT), cfg.getString(AWS_S3_REGION_CONFIG)); } + @Deprecated public AwsClientBuilder.EndpointConfiguration getAwsEndpointConfiguration() { final AwsStsEndpointConfig config = getStsEndpointConfig(); return new AwsClientBuilder.EndpointConfiguration(config.getServiceEndpoint(), config.getSigningRegion()); } + @Deprecated public BasicAWSCredentials getAwsCredentials() { if (Objects.nonNull(cfg.getPassword(AWS_ACCESS_KEY_ID_CONFIG)) && Objects.nonNull(cfg.getPassword(AWS_SECRET_ACCESS_KEY_CONFIG))) { @@ -424,12 +435,26 @@ public BasicAWSCredentials getAwsCredentials() { return null; } + public AwsBasicCredentials getAwsCredentialsV2() { + if (Objects.nonNull(cfg.getPassword(AWS_ACCESS_KEY_ID_CONFIG)) + && Objects.nonNull(cfg.getPassword(AWS_SECRET_ACCESS_KEY_CONFIG))) { + + return AwsBasicCredentials.create(cfg.getPassword(AWS_ACCESS_KEY_ID_CONFIG).value(), + cfg.getPassword(AWS_SECRET_ACCESS_KEY_CONFIG).value()); + } else if (Objects.nonNull(cfg.getPassword(AWS_ACCESS_KEY_ID)) + && Objects.nonNull(cfg.getPassword(AWS_SECRET_ACCESS_KEY))) { + LOGGER.warn("Config options {} and {} are not supported for this Connector", AWS_ACCESS_KEY_ID, + AWS_SECRET_ACCESS_KEY); + } + return null; + } + public String getAwsS3EndPoint() { return Objects.nonNull(cfg.getString(AWS_S3_ENDPOINT_CONFIG)) ? cfg.getString(AWS_S3_ENDPOINT_CONFIG) : cfg.getString(AWS_S3_ENDPOINT); } - + @Deprecated public Region getAwsS3Region() { // we have priority of properties if old one not set or both old and new one set // the new property value will be selected @@ -442,6 +467,18 @@ public Region getAwsS3Region() { } } + public software.amazon.awssdk.regions.Region getAwsS3RegionV2() { + // we have priority of properties if old one not set or both old and new one set + // the new property value will be selected + if (Objects.nonNull(cfg.getString(AWS_S3_REGION_CONFIG))) { + return software.amazon.awssdk.regions.Region.of(cfg.getString(AWS_S3_REGION_CONFIG)); + } else if (Objects.nonNull(cfg.getString(AWS_S3_REGION))) { + return software.amazon.awssdk.regions.Region.of(cfg.getString(AWS_S3_REGION)); + } else { + return software.amazon.awssdk.regions.Region.of(Regions.US_EAST_1.getName()); + } + } + public String getAwsS3BucketName() { return Objects.nonNull(cfg.getString(AWS_S3_BUCKET_NAME_CONFIG)) ? 
cfg.getString(AWS_S3_BUCKET_NAME_CONFIG) @@ -477,4 +514,13 @@ public int getS3RetryBackoffMaxRetries() { public AWSCredentialsProvider getCustomCredentialsProvider() { return cfg.getConfiguredInstance(AWS_CREDENTIALS_PROVIDER_CONFIG, AWSCredentialsProvider.class); } + + public AwsCredentialsProvider getCustomCredentialsProviderV2() { + return cfg.getConfiguredInstance(AWS_CREDENTIALS_PROVIDER_CONFIG, AwsCredentialsProvider.class); + } + + public int getFetchPageSize() { + return cfg.getInt(FETCH_PAGE_SIZE); + } + } diff --git a/s3-commons/src/main/java/io/aiven/kafka/connect/iam/AwsCredentialProviderFactory.java b/s3-commons/src/main/java/io/aiven/kafka/connect/iam/AwsCredentialProviderFactory.java index 2a5089726..167d872a7 100644 --- a/s3-commons/src/main/java/io/aiven/kafka/connect/iam/AwsCredentialProviderFactory.java +++ b/s3-commons/src/main/java/io/aiven/kafka/connect/iam/AwsCredentialProviderFactory.java @@ -26,6 +26,11 @@ import com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider; import com.amazonaws.services.securitytoken.AWSSecurityTokenService; import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClientBuilder; +import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider; +import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; +import software.amazon.awssdk.services.sts.auth.StsAssumeRoleCredentialsProvider; +import software.amazon.awssdk.services.sts.model.AssumeRoleRequest; public class AwsCredentialProviderFactory { @@ -58,4 +63,33 @@ private AWSSecurityTokenService securityTokenService(final S3ConfigFragment conf } return AWSSecurityTokenServiceClientBuilder.defaultClient(); } + + public AwsCredentialsProvider getAwsV2Provider(final S3ConfigFragment config) { + + if (config.hasAwsStsRole()) { + return getV2StsProvider(config); + } + final AwsBasicCredentials awsCredentials = config.getAwsCredentialsV2(); + if (Objects.isNull(awsCredentials)) { + return config.getCustomCredentialsProviderV2(); + } + return StaticCredentialsProvider.create(awsCredentials); + + } + + private StsAssumeRoleCredentialsProvider getV2StsProvider(final S3ConfigFragment config) { + if (config.hasAwsStsRole()) { + return StsAssumeRoleCredentialsProvider.builder() + .refreshRequest(() -> AssumeRoleRequest.builder() + .roleArn(config.getStsRole().getArn()) + // Maker this a unique identifier + .roleSessionName("AwsV2SDKConnectorSession") + .build()) + .build(); + } + + return StsAssumeRoleCredentialsProvider.builder().build(); + + } + } diff --git a/s3-source-connector/README.md b/s3-source-connector/README.md new file mode 100644 index 000000000..3c236d4d0 --- /dev/null +++ b/s3-source-connector/README.md @@ -0,0 +1,328 @@ +# Aiven's S3 Source Connector for Apache Kafka + +This is a source Apache Kafka Connect connector that stores AWS S3 bucket objects onto an Apache Kafka topic. + +**Table of Contents** + +- [How it works](#how-it-works) +- [Data Format](#data-format) +- [Usage](#usage) +- [Configuration](#configuration) +- [Development](#development) + + +## How it works + +The connector connects to Amazon S3 and periodically queries its data +sources. Each object from the s3 response is transformed into a record and +published into the corresponding Kafka topic. + +### Requirements + +The connector requires Java 11 or newer for development and production. 
+
+#### Authorization
+
+The connector needs the following permissions to the specified bucket:
+* ``s3:GetObject``
+* ``s3:ListObjectsV2``
+
+In case of an ``Access Denied`` error, see https://aws.amazon.com/premiumsupport/knowledge-center/s3-troubleshoot-403/
+
+#### Authentication
+
+To make the connector work, a user has to specify AWS credentials that allow reading from S3.
+There are three ways to specify AWS credentials in this connector:
+
+1) Long term credentials.
+
+   It requires both `aws.access.key.id` and `aws.secret.access.key` to be specified.
+2) Short term credentials.
+
+   The connector will request a temporary token from the AWS STS service and assume a role from another AWS account.
+   It requires `aws.sts.role.arn` and `aws.sts.role.session.name` to be specified.
+3) Use the default provider chain or a custom provider.
+
+   If you prefer to use the AWS default provider chain, leave {`aws.access.key.id`, `aws.secret.access.key`} and
+   {`aws.sts.role.arn`, `aws.sts.role.session.name`} blank. If you prefer to build your own custom
+   provider, pass the custom provider class as a parameter to `aws.credential.provider`.
+
+It is important not to use options 1 and 2 simultaneously.
+When using option 2, it is recommended to specify the S3 bucket region in `aws.s3.region` and the
+corresponding AWS STS endpoint in `aws.sts.config.endpoint`. It is better to specify both or none.
+It is also important to specify `aws.sts.role.external.id` for security reasons
+(see some details [here](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-user_externalid.html)).
+
+### File name format
+
+> File name format is tightly related to [Record Grouping](#record-grouping)
+
+The connector uses the following format for input files (blobs): `<prefix><filename>`.
+
+`<prefix>` is the optional prefix that can be used, for example, for
+subdirectories in the bucket.
+`<filename>` is the file name. The connector has a fixed
+template for file names.
+
+   Fixed template for file names: `{{topic}}-{{partition}}-{{start_offset}}`
+
+Example object name: `customertopic-00001-1734445664111.txt`
+
+## Data Format
+
+Connector class name, in this case: `io.aiven.kafka.connect.s3.AivenKafkaConnectS3SourceConnector`.
+
+### S3 Object Names
+
+The S3 connector reads a series of objects from the specified bucket.
+Each object key is expected to follow the fixed file name template above, optionally preceded by a prefix,
+i.e. `[<prefix>]<topic>-<partition>-<start_offset>`.
+
+### Kafka topic names
+S3 object keys contain the topic name, which is used as the target Kafka topic.
+
+### Data File Format
+
+S3 object files are text files that contain one record per line (i.e.,
+records are separated by `\n`), except for the `PARQUET` format.
+
+There are four types of data format available:
+
+- Complex structure, where the file is in [JSON lines](https://jsonlines.org/) format.
+  It contains one record per line and each line is a valid JSON object (`jsonl`).
+
+  Configuration: `input.format=jsonl`.
+
+- Complex structure, where the file is a valid Avro file with multiple records.
+
+  Configuration: `input.format=avro`.
+
+- Complex structure, where the file is in Apache Parquet file format.
+
+  Configuration: `input.format=parquet`.
+
+- Raw format, where the file is read as bytes.
+
+  Configuration: `input.format=bytes`.
+
+The connector can output the following fields from records into the
+output: the key, the value, the timestamp, the offset and headers. The set of
+these output fields is configurable. The field values are separated by a comma.
+
+#### JSONL Format example
+
+For example, if we output `key,value,offset,timestamp`, a record line might look like:
+
+```json
+ { "key": "k1", "value": "v0", "offset": 1232155, "timestamp":"2020-01-01T00:00:01Z" }
+```
+
+`org.apache.kafka.connect.json.JsonConverter` is used internally to convert this data and make the output human-readable.
+
+**NB!**
+
+- The key/value schema will not be present in the output Kafka event, even if the `value.converter.schemas.enable`
+  property is `true`; at the moment setting it to `true` has no effect.
+
+#### Parquet or Avro format example
+
+For example, for the input fields `key,offset,timestamp,headers,value`, the Parquet schema of an input S3 object might look like this:
+```json
+{
+  "type": "record", "fields": [
+    {"name": "key", "type": "RecordKeySchema"},
+    {"name": "offset", "type": "long"},
+    {"name": "timestamp", "type": "long"},
+    {"name": "headers", "type": "map"},
+    {"name": "value", "type": "RecordValueSchema"}
+  ]
+}
+```
+where `RecordKeySchema` is the key schema and `RecordValueSchema` is the record value schema.
+This means that if you have a key schema and a record schema like:
+
+Key schema:
+```json
+{
+  "type": "string"
+}
+```
+
+Record schema:
+```json
+{
+  "type": "record", "fields": [
+    {"name": "foo", "type": "string"},
+    {"name": "bar", "type": "long"}
+  ]
+}
+```
+the final `Avro` schema for `Parquet` is:
+```json
+{
+  "type": "record", "fields": [
+    {"name": "key", "type": "string"},
+    {"name": "offset", "type": "long"},
+    {"name": "timestamp", "type": "long"},
+    {"name": "headers", "type": "map", "values": "long"},
+    { "name": "value",
+      "type": "record",
+      "fields": [
+        {"name": "foo", "type": "string"},
+        {"name": "bar", "type": "long"}
+      ]
+    }
+  ]
+}
+```
+**NB!**
+
+- The connector works both with and without Schema Registry.
+
+## Usage
+
+### Connector Configuration
+
+> **Important Note** This connector is developed in alignment with the S3 sink connector. Since version `2.6`,
+> all existing S3 sink configuration is deprecated and will be replaced with the new configuration during a
+> transition period (within 2-3 releases). Most of the configuration parameters remain the same.
+
+List of new configuration parameters:
+- `aws.access.key.id` - AWS Access Key ID for accessing the S3 bucket.
+- `aws.secret.access.key` - AWS S3 Secret Access Key.
+- `aws.s3.bucket.name` - Name of an existing bucket that stores the records. Mandatory. See the S3 bucket naming rules.
+- `aws.s3.endpoint` - The endpoint configuration (service endpoint & signing region) to be used for requests.
+- `aws.s3.prefix` - The prefix that will be added to the file name in the bucket. Can be used to read objects from a subdirectory.
+- `aws.s3.region` - Name of the region of the bucket used for storing the records. Defaults to `us-east-1`.
+- `aws.sts.role.arn` - AWS role ARN, for cross-account access role, used instead of `aws.access.key.id` and `aws.secret.access.key`.
+- `aws.sts.role.external.id` - AWS ExternalId for cross-account access role.
+- `aws.sts.role.session.name` - AWS session name for cross-account access role.
+- `aws.sts.role.session.duration` - Session duration for cross-account access role, in seconds. Minimum value is 900.
+- `aws.sts.config.endpoint` - AWS STS endpoint for cross-account access role.
+
+## Configuration
+
+[Here](https://kafka.apache.org/documentation/#connect_running) you can
+read about the Connect workers configuration and
+[here](https://kafka.apache.org/documentation/#connect_resuming) about
+the connector configuration.
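+
+A configuration like the example below is usually submitted to a running Connect worker over its REST interface.
+A minimal sketch follows; the REST port `8083`, the connector name `s3-source` and the file name
+`s3-source-config.json` are assumptions for illustration only, and the JSON file simply holds the same settings as
+the properties example below expressed as a JSON object:
+
+```bash
+# Create or update the connector on a local Connect worker (hypothetical connector name and file)
+curl -X PUT -H "Content-Type: application/json" \
+  --data @s3-source-config.json \
+  http://localhost:8083/connectors/s3-source/config
+```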
+
+Here is an example connector configuration with descriptions:
+
+```properties
+### Standard connector configuration
+
+## Fill in your values in these:
+
+## These must have exactly these values:
+
+# The Java class for the connector
+connector.class=io.aiven.kafka.connect.s3.AivenKafkaConnectS3SourceConnector
+
+# Number of worker tasks to run concurrently
+tasks.max=1
+
+# The key converter for this connector
+key.converter=org.apache.kafka.connect.storage.StringConverter
+
+# The data format of the input objects, used to produce Kafka events.
+# The supported values are: `jsonl`, `avro`, `parquet` and `bytes`.
+input.format=jsonl
+
+# A comma-separated list of topics to use as output for this connector
+# Also a regular expression version `topics.regex` is supported.
+# See https://kafka.apache.org/documentation/#connect_configuring
+topics=topic1,topic2
+
+# A comma-separated list of topic partitions where the connector's offset storage reader
+# can read the stored offsets for those partitions. If not set, S3 objects will be read again if
+# available in the bucket.
+topic.partitions=1,2,3
+
+### Connector-specific configuration
+### Fill in your values
+# AWS Access Key ID
+aws.access.key.id=YOUR_AWS_KEY_ID
+
+# AWS Access Secret Key
+aws.secret.access.key=YOUR_AWS_SECRET_ACCESS_KEY
+
+# AWS Region
+aws.s3.region=us-east-1
+
+# The name of the S3 bucket to use
+# Required.
+aws.s3.bucket.name=my-bucket
+
+# The prefix of the S3 bucket to use
+# Optional.
+aws.s3.prefix=file-prefix
+
+# Errors tolerance
+# Possible values: 'none' or 'all'. Default is 'none'.
+# With 'all', errors are logged and ignored.
+errors.tolerance=none
+```
+
+### Retry strategy configuration
+
+#### Apache Kafka Connect retry strategy configuration property
+
+- `kafka.retry.backoff.ms` - The retry backoff in milliseconds. This config is used to notify Apache Kafka Connect to retry delivering a message batch or
+  performing recovery in case of transient exceptions. Maximum value is `24` hours.
+
+In total there are four configuration properties to configure the retry strategy.
+
+#### AWS S3 retry strategy configuration properties
+
+- `aws.s3.backoff.delay.ms` - S3 default base sleep time
+for non-throttled exceptions in milliseconds.
+Default is `100` ms.
+- `aws.s3.backoff.max.delay.ms` - S3 maximum back-off
+time before retrying a request in milliseconds.
+Default is `20000` ms.
+- `aws.s3.backoff.max.retries` - Maximum retry limit
+(if the value is greater than 30, there can be
+integer overflow issues during delay calculation).
+Default is `3`.
+
+### AWS S3 server side encryption properties
+
+- `aws.s3.sse.algorithm` - The name of the server-side encryption algorithm to use. If unset, the default SSE-S3 is used.
+  - To use SSE-S3, set to `AES256` or leave empty.
+  - To use SSE-KMS, set to `aws:kms`.
+  - To use DSSE-KMS, set to `aws:kms:dsse`.
+
+## Development
+
+### Developing together with Commons library
+
+This project depends on [Common Module for Apache Kafka Connect](../commons/README.md).
+
+### Integration testing
+
+Integration tests are implemented using JUnit, Gradle and Docker.
+
+To run them, you need:
+- Docker installed.
+
+Integration testing doesn't require valid AWS credentials.
+
+To simulate AWS S3 behaviour, tests use [LocalStack](https://github.com/localstack/localstack-java-utils).
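+
+For manual experiments against a local S3-compatible endpoint, the same connector settings can be pointed at
+LocalStack. A minimal sketch, assuming a local setup with the LocalStack edge port `4566`, dummy `test`/`test`
+credentials and a hypothetical bucket name (none of these are defaults of this connector):
+
+```properties
+# Point the connector at a local S3-compatible endpoint instead of AWS
+aws.s3.endpoint=http://localhost:4566
+aws.s3.region=us-east-1
+aws.access.key.id=test
+aws.secret.access.key=test
+aws.s3.bucket.name=my-local-bucket
+```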
+ +In order to run the integration tests, execute from the project root +directory: + +```bash +./gradlew clean integrationTest +``` + +## License + +This project is licensed under the [Apache License, Version 2.0](LICENSE). + +## Trademarks + +Apache Kafka, Apache Kafka Connect are either registered trademarks or trademarks of the Apache Software Foundation in the United States and/or other countries. AWS S3 is a trademark and property of their respective owners. All product and service names used in this website are for identification purposes only and do not imply endorsement. diff --git a/s3-source-connector/build.gradle.kts b/s3-source-connector/build.gradle.kts new file mode 100644 index 000000000..5d8e44ac7 --- /dev/null +++ b/s3-source-connector/build.gradle.kts @@ -0,0 +1,313 @@ +import com.github.spotbugs.snom.SpotBugsTask + +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +plugins { id("aiven-apache-kafka-connectors-all.java-conventions") } + +val amazonS3Version by extra("2.29.34") +val amazonSTSVersion by extra("2.29.34") +val s3mockVersion by extra("0.2.6") +val kafkaVersion by extra("3.3.0") + +val integrationTest: SourceSet = + sourceSets.create("integrationTest") { + java { srcDir("src/integration-test/java") } + resources { srcDir("src/integration-test/resources") } + compileClasspath += sourceSets.main.get().output + configurations.testRuntimeClasspath.get() + runtimeClasspath += output + compileClasspath + } + +val integrationTestImplementation: Configuration by + configurations.getting { extendsFrom(configurations.implementation.get()) } + +tasks.register("integrationTest") { + description = "Runs the integration tests." + group = "verification" + testClassesDirs = integrationTest.output.classesDirs + classpath = integrationTest.runtimeClasspath + + // defines testing order + shouldRunAfter("test") + // requires archive for connect runner + dependsOn("distTar") + useJUnitPlatform() + + // Run always. 
+ outputs.upToDateWhen { false } + + val distTarTask = tasks.get("distTar") as Tar + val distributionFilePath = distTarTask.archiveFile.get().asFile.path + systemProperty("integration-test.distribution.file.path", distributionFilePath) +} + +idea { + module { + testSources.from(integrationTest.java.srcDirs) + testSources.from(integrationTest.resources.srcDirs) + } +} + +dependencies { + compileOnly("org.apache.kafka:connect-api:$kafkaVersion") + compileOnly("org.apache.kafka:connect-runtime:$kafkaVersion") + + implementation(apache.commons.collection4) + implementation(project(":commons")) + implementation(project(":s3-commons")) + implementation("software.amazon.awssdk:s3:$amazonS3Version") + implementation("software.amazon.awssdk:sts:$amazonSTSVersion") + + implementation(tools.spotbugs.annotations) + implementation(logginglibs.slf4j) + implementation(apache.avro) + implementation(confluent.kafka.connect.avro.converter) { + exclude(group = "org.apache.kafka", module = "kafka-clients") + } + integrationTestImplementation(apache.parquet.hadoop) + testImplementation(compressionlibs.snappy) + testImplementation(compressionlibs.zstd.jni) + + testImplementation("org.apache.kafka:connect-api:$kafkaVersion") + testImplementation("org.apache.kafka:connect-runtime:$kafkaVersion") + testImplementation("org.apache.kafka:connect-json:$kafkaVersion") + + testImplementation(testinglibs.junit.jupiter) + testImplementation(testinglibs.assertj.core) + + testImplementation(testinglibs.mockito.core) + testImplementation("io.findify:s3mock_2.11:$s3mockVersion") + + testRuntimeOnly(testinglibs.junit.jupiter.engine) + testImplementation(testinglibs.mockito.junit.jupiter) + // implementation(apache.hadoop.common) + + implementation(apache.hadoop.common) { + exclude(group = "org.apache.hadoop", module = "hadoop-yarn-client") + exclude(group = "org.apache.hadoop.thirdparty", module = "hadoop-shaded-protobuf_3_7") + exclude(group = "com.google.guava", module = "guava") + exclude(group = "commons-cli", module = "commons-cli") + exclude(group = "org.apache.commons", module = "commons-math3") + exclude(group = "org.apache.httpcomponents", module = "httpclient") + exclude(group = "commons-codec", module = "commons-codec") + exclude(group = "commons-net", module = "commons-net") + exclude(group = "org.eclipse.jetty") + exclude(group = "org.eclipse.jetty.websocket") + exclude(group = "javax.servlet") + exclude(group = "javax.servlet.jsp") + exclude(group = "javax.activation") + exclude(group = "com.sun.jersey") + exclude(group = "log4j") + exclude(group = "org.apache.commons", module = "commons-text") + exclude(group = "org.slf4j", module = "slf4j-api") + // exclude(group = "org.apache.hadoop", module = "hadoop-auth") + exclude(group = "org.apache.hadoop", module = "hadoop-yarn-api") + exclude(group = "com.google.re2j") + exclude(group = "com.google.protobuf") + exclude(group = "com.google.code.gson") + exclude(group = "com.jcraft") + exclude(group = "org.apache.curator") + exclude(group = "org.apache.zookeeper") + exclude(group = "org.apache.htrace") + exclude(group = "com.google.code.findbugs") + exclude(group = "org.apache.kerby") + exclude(group = "com.fasterxml.jackson.core") + exclude(group = "com.fasterxml.woodstox", module = "woodstox-core:5.0.3") + exclude(group = "org.apache.avro", module = "avro") + exclude(group = "org.apache.hadoop", module = "hadoop-yarn-common") + exclude(group = "com.google.inject.extensions", module = "guice-servlet") + exclude(group = "io.netty", module = "netty") + } + + 
testRuntimeOnly(logginglibs.logback.classic) + + integrationTestImplementation(testinglibs.localstack) { + exclude(group = "io.netty", module = "netty-transport-native-epoll") + } + integrationTestImplementation(testcontainers.junit.jupiter) + integrationTestImplementation(testcontainers.kafka) // this is not Kafka version + integrationTestImplementation(testcontainers.localstack) + integrationTestImplementation(testinglibs.wiremock) + + // TODO: add avro-converter to ConnectRunner via plugin.path instead of on worker classpath + integrationTestImplementation(confluent.kafka.connect.avro.converter) { + exclude(group = "org.apache.kafka", module = "kafka-clients") + } + + testImplementation(apache.hadoop.mapreduce.client.core) { + exclude(group = "org.apache.hadoop", module = "hadoop-yarn-client") + exclude(group = "org.apache.hadoop.thirdparty", module = "hadoop-shaded-protobuf_3_7") + exclude(group = "com.google.guava", module = "guava") + exclude(group = "commons-cli", module = "commons-cli") + exclude(group = "org.apache.commons", module = "commons-math3") + exclude(group = "org.apache.httpcomponents", module = "httpclient") + exclude(group = "commons-codec", module = "commons-codec") + exclude(group = "commons-net", module = "commons-net") + exclude(group = "org.eclipse.jetty") + exclude(group = "org.eclipse.jetty.websocket") + exclude(group = "javax.servlet") + exclude(group = "javax.servlet.jsp") + exclude(group = "javax.activation") + exclude(group = "com.sun.jersey") + exclude(group = "log4j") + exclude(group = "org.apache.commons", module = "commons-text") + exclude(group = "org.slf4j", module = "slf4j-api") + exclude(group = "org.apache.hadoop", module = "hadoop-auth") + exclude(group = "org.apache.hadoop", module = "hadoop-yarn-api") + exclude(group = "com.google.re2j") + exclude(group = "com.google.protobuf") + exclude(group = "com.google.code.gson") + exclude(group = "com.jcraft") + exclude(group = "org.apache.curator") + exclude(group = "org.apache.zookeeper") + exclude(group = "org.apache.htrace") + exclude(group = "com.google.code.findbugs") + exclude(group = "org.apache.kerby") + exclude(group = "com.fasterxml.jackson.core") + exclude(group = "com.fasterxml.woodstox", module = "woodstox-core:5.0.3") + exclude(group = "org.apache.avro", module = "avro") + exclude(group = "org.apache.hadoop", module = "hadoop-yarn-common") + exclude(group = "com.google.inject.extensions", module = "guice-servlet") + exclude(group = "io.netty", module = "netty") + } + + integrationTestImplementation("org.apache.kafka:connect-runtime:${kafkaVersion}:test") + integrationTestImplementation("org.apache.kafka:connect-runtime:${kafkaVersion}") + integrationTestImplementation("org.apache.kafka:kafka-clients:${kafkaVersion}:test") + integrationTestImplementation("org.apache.kafka:kafka_2.13:${kafkaVersion}:test") + integrationTestImplementation("org.apache.kafka:kafka_2.13:${kafkaVersion}") + + // Make test utils from 'test' available in 'integration-test' + integrationTestImplementation(sourceSets["test"].output) + integrationTestImplementation(testinglibs.awaitility) +} + +tasks.named("pmdIntegrationTest") { + ruleSetFiles = files("${project.rootDir}/gradle-config/aiven-pmd-test-ruleset.xml") + ruleSets = emptyList() // Clear the default rulesets +} + +tasks.named("spotbugsIntegrationTest") { + reports.create("html") { setStylesheet("fancy-hist.xsl") } +} + +tasks.processResources { + filesMatching("s3-source-connector-for-apache-kafka-version.properties") { + expand(mapOf("version" to version)) + 
} +} + +tasks.jar { manifest { attributes(mapOf("Version" to project.version)) } } + +tasks.distTar { dependsOn(":commons:jar") } + +tasks.distZip { dependsOn(":commons:jar") } + +distributions { + main { + contents { + from("jar") + from(configurations.runtimeClasspath.get().map { if (it.isDirectory) it else zipTree(it) }) + + into("/") { + from("$projectDir") + include("version.txt", "README*", "LICENSE*", "NOTICE*", "licenses/") + include("config/") + } + } + } +} + +publishing { + publications { + create("publishMavenJavaArtifact") { + groupId = group.toString() + artifactId = "s3-source-connector-for-apache-kafka" + version = version.toString() + + from(components["java"]) + + pom { + name = "Aiven's S3 Source Connector for Apache Kafka" + description = "Aiven's S3 Source Connector for Apache Kafka" + url = "https://github.com/aiven-open/s3-source-connector-for-apache-kafka" + organization { + name = "Aiven Oy" + url = "https://aiven.io" + } + + licenses { + license { + name = "Apache 2.0" + url = "http://www.apache.org/licenses/LICENSE-2.0" + distribution = "repo" + } + } + + developers { + developer { + id = "aiven" + name = "Aiven Opensource" + email = "opensource@aiven.io" + } + } + + scm { + connection = "scm:git:git://github.com:aiven/s3-source-connector-for-apache-kafka.git" + developerConnection = + "scm:git:ssh://github.com:aiven/s3-source-connector-for-apache-kafka.git" + url = "https://github.com/aiven-open/s3-source-connector-for-apache-kafka" + } + } + } + } + + repositories { + maven { + name = "sonatype" + + val releasesRepoUrl = uri("https://oss.sonatype.org/service/local/staging/deploy/maven2") + val snapshotsRepoUrl = uri("https://oss.sonatype.org/content/repositories/snapshots") + url = if (version.toString().endsWith("SNAPSHOT")) snapshotsRepoUrl else releasesRepoUrl + + credentials(PasswordCredentials::class) + } + } +} + +signing { + sign(publishing.publications["publishMavenJavaArtifact"]) + useGpgCmd() + // Some issue in the plugin: + // GPG outputs already armored signatures. The plugin also does armoring for `asc` files. + // This results in double armored signatures, i.e. garbage. + // Override the signature type provider to use unarmored output for `asc` files, which works well + // with GPG. + class ASCSignatureProvider() : AbstractSignatureTypeProvider() { + val binary = + object : BinarySignatureType() { + override fun getExtension(): String { + return "asc" + } + } + + init { + register(binary) + setDefaultType(binary.extension) + } + } + signatureTypes = ASCSignatureProvider() +} diff --git a/s3-source-connector/gradle.properties b/s3-source-connector/gradle.properties new file mode 100644 index 000000000..e1c4d767e --- /dev/null +++ b/s3-source-connector/gradle.properties @@ -0,0 +1,4 @@ +version=0.0.1-SNAPSHOT + +sonatypeUsername= +sonatypePassword= diff --git a/s3-source-connector/licenses/LICENSE-aws.txt b/s3-source-connector/licenses/LICENSE-aws.txt new file mode 100644 index 000000000..aeea99958 --- /dev/null +++ b/s3-source-connector/licenses/LICENSE-aws.txt @@ -0,0 +1,63 @@ +Apache License +Version 2.0, January 2004 + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
+ +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + + 1. You must give any other recipients of the Work or Derivative Works a copy of this License; and + 2. You must cause any modified files to carry prominent notices stating that You changed the files; and + 3. You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and + 4. If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. + +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +Note: Other license terms may apply to certain, identified software files contained within or distributed with the accompanying software if such terms are included in the directory containing the accompanying software. Such other license terms will then apply in lieu of the terms of the software license above. + +JSON processing code subject to the JSON License from JSON.org: + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +The Software shall be used for Good, not Evil. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/s3-source-connector/notices/NOTICE-aws.txt b/s3-source-connector/notices/NOTICE-aws.txt new file mode 100644 index 000000000..979460ec7 --- /dev/null +++ b/s3-source-connector/notices/NOTICE-aws.txt @@ -0,0 +1,13 @@ +AWS IoT Device SDK for Java +Copyright 2010-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +This product includes software developed by +Amazon Technologies, Inc (http://www.amazon.com/). + +********************** +THIRD PARTY COMPONENTS +********************** +This software includes third party software subject to the following copyrights: +- PKCS#1 and PKCS#8 PEM encoded private key parsing and utility functions from oauth.googlecode.com - Copyright 1998-2010 AOL Inc. + +The licenses for these third party components are included in LICENSE.txt \ No newline at end of file diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java new file mode 100644 index 000000000..39a6f7f2d --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/AwsIntegrationTest.java @@ -0,0 +1,305 @@ +/* + * Copyright 2025 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source; + +import static io.aiven.kafka.connect.common.config.CommonConfig.MAX_TASKS; +import static io.aiven.kafka.connect.common.config.CommonConfig.TASK_ID; +import static io.aiven.kafka.connect.common.config.FileNameFragment.FILE_NAME_TEMPLATE_CONFIG; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.AVRO_VALUE_SERIALIZER; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; +import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.kafka.connect.source.SourceTaskContext; +import org.apache.kafka.connect.storage.OffsetStorageReader; + +import io.aiven.kafka.connect.common.source.input.InputFormat; +import io.aiven.kafka.connect.common.source.input.TransformerFactory; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; +import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; +import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; +import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; + +import org.apache.avro.Schema; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; +import org.testcontainers.containers.localstack.LocalStackContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.S3Object; + +@Testcontainers +class AwsIntegrationTest implements IntegrationBase { + + private static final String COMMON_PREFIX = "s3-source-connector-for-apache-kafka-AWS-test-"; + + @Container + public static final LocalStackContainer LOCALSTACK = IntegrationBase.createS3Container(); + + private S3Client s3Client; + private String s3Endpoint; + + private BucketAccessor testBucketAccessor; + + @Override + public S3Client getS3Client() { + return s3Client; + } + + @BeforeEach + void setupAWS() { + s3Client = IntegrationBase.createS3Client(LOCALSTACK); + s3Endpoint = LOCALSTACK.getEndpoint().toString(); + testBucketAccessor = new BucketAccessor(s3Client, TEST_BUCKET_NAME); + testBucketAccessor.createBucket(); + } + + @AfterEach + void tearDownAWS() { + 
testBucketAccessor.removeBucket(); + s3Client.close(); + } + + private Map getConfig(final String topics, final int maxTasks) { + final Map config = new HashMap<>(); + config.put(AWS_ACCESS_KEY_ID_CONFIG, S3_ACCESS_KEY_ID); + config.put(AWS_SECRET_ACCESS_KEY_CONFIG, S3_SECRET_ACCESS_KEY); + config.put(AWS_S3_ENDPOINT_CONFIG, s3Endpoint); + config.put(AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET_NAME); + config.put(TARGET_TOPIC_PARTITIONS, "0,1"); + config.put(TARGET_TOPICS, topics); + config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + config.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.converters.ByteArrayConverter"); + config.put(MAX_TASKS, String.valueOf(maxTasks)); + return config; + } + + /** + * Test the integration with the Amazon connector + * + * @param testInfo + * The testing configuration. + */ + @Test + void sourceRecordIteratorBytesTest(final TestInfo testInfo) { + final var topicName = IntegrationBase.topicName(testInfo); + final int maxTasks = 1; + final int taskId = 0; + final Map configData = getConfig(topicName, maxTasks); + + configData.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); + configData.put(FILE_NAME_TEMPLATE_CONFIG, "{{topic}}-{{partition}}-{{start_offset}}"); + configData.put(TASK_ID, String.valueOf(taskId)); + configData.put(MAX_TASKS, String.valueOf(maxTasks)); + final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; + final String testData2 = "Hello, Kafka Connect S3 Source! object 2"; + + final List offsetKeys = new ArrayList<>(); + final List expectedKeys = new ArrayList<>(); + // write 2 objects to s3 + expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "0")); + expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "0")); + expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "1")); + expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "1")); + + // we don't expext the empty one. 
+ offsetKeys.addAll(expectedKeys); + offsetKeys.add(writeToS3(topicName, new byte[0], "3")); + + assertThat(testBucketAccessor.listObjects()).hasSize(5); + + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configData); + final SourceTaskContext context = mock(SourceTaskContext.class); + final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); + when(context.offsetStorageReader()).thenReturn(offsetStorageReader); + when(offsetStorageReader.offsets(any())).thenReturn(new HashMap<>()); + + final OffsetManager offsetManager = new OffsetManager(context, s3SourceConfig); + + final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig); + + final Iterator sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, + TransformerFactory.getTransformer(InputFormat.BYTES), sourceClient); + + final HashSet seenKeys = new HashSet<>(); + while (sourceRecordIterator.hasNext()) { + final S3SourceRecord s3SourceRecord = sourceRecordIterator.next(); + final String key = OBJECT_KEY + SEPARATOR + s3SourceRecord.getObjectKey(); + assertThat(offsetKeys).contains(key); + seenKeys.add(key); + } + assertThat(seenKeys).containsAll(expectedKeys); + } + + @Test + void sourceRecordIteratorAvroTest(final TestInfo testInfo) throws IOException { + final var topicName = IntegrationBase.topicName(testInfo); + final int maxTasks = 1; + final int taskId = 0; + + final Map configData = getConfig(topicName, maxTasks); + + configData.put(INPUT_FORMAT_KEY, InputFormat.AVRO.getValue()); + configData.put(VALUE_CONVERTER_KEY, "io.confluent.connect.avro.AvroConverter"); + configData.put(AVRO_VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); + configData.put(FILE_NAME_TEMPLATE_CONFIG, "{{topic}}-{{partition}}-{{start_offset}}"); + configData.put(TASK_ID, String.valueOf(taskId)); + configData.put(MAX_TASKS, String.valueOf(maxTasks)); + + // Define Avro schema + final String schemaJson = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" + + " \"fields\": [\n" + " {\"name\": \"message\", \"type\": \"string\"},\n" + + " {\"name\": \"id\", \"type\": \"int\"}\n" + " ]\n" + "}"; + final Schema.Parser parser = new Schema.Parser(); + final Schema schema = parser.parse(schemaJson); + + final int numOfRecsFactor = 5000; + + final byte[] outputStream1 = IntegrationBase.generateNextAvroMessagesStartingFromId(1, numOfRecsFactor, schema); + final byte[] outputStream2 = IntegrationBase.generateNextAvroMessagesStartingFromId(numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream3 = IntegrationBase.generateNextAvroMessagesStartingFromId(2 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream4 = IntegrationBase.generateNextAvroMessagesStartingFromId(3 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream5 = IntegrationBase.generateNextAvroMessagesStartingFromId(4 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + + final Set offsetKeys = new HashSet<>(); + + offsetKeys.add(writeToS3(topicName, outputStream1, "1")); + offsetKeys.add(writeToS3(topicName, outputStream2, "1")); + + offsetKeys.add(writeToS3(topicName, outputStream3, "2")); + offsetKeys.add(writeToS3(topicName, outputStream4, "2")); + offsetKeys.add(writeToS3(topicName, outputStream5, "2")); + + assertThat(testBucketAccessor.listObjects()).hasSize(5); + + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configData); + final SourceTaskContext context = mock(SourceTaskContext.class); + final 
OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); + when(context.offsetStorageReader()).thenReturn(offsetStorageReader); + when(offsetStorageReader.offsets(any())).thenReturn(new HashMap<>()); + + final OffsetManager offsetManager = new OffsetManager(context, s3SourceConfig); + + final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig); + + final Iterator sourceRecordIterator = new SourceRecordIterator(s3SourceConfig, offsetManager, + TransformerFactory.getTransformer(InputFormat.AVRO), sourceClient); + + final HashSet seenKeys = new HashSet<>(); + final Map> seenRecords = new HashMap<>(); + while (sourceRecordIterator.hasNext()) { + final S3SourceRecord s3SourceRecord = sourceRecordIterator.next(); + final String key = OBJECT_KEY + SEPARATOR + s3SourceRecord.getObjectKey(); + seenRecords.compute(key, (k, v) -> { + final List lst = v == null ? new ArrayList<>() : v; // NOPMD new object inside loop + lst.add(s3SourceRecord.getRecordNumber()); + return lst; + }); + assertThat(offsetKeys).contains(key); + seenKeys.add(key); + } + assertThat(seenKeys).containsAll(offsetKeys); + assertThat(seenRecords).hasSize(5); + final List expected = new ArrayList<>(); + for (long l = 0; l < numOfRecsFactor; l++) { + expected.add(l + 1); + } + for (final String key : offsetKeys) { + final List seen = seenRecords.get(key); + assertThat(seen).as("Count for " + key).containsExactlyInAnyOrderElementsOf(expected); + } + } + + @Test + void verifyIteratorRehydration(final TestInfo testInfo) { + // create 2 files. + final var topicName = IntegrationBase.topicName(testInfo); + final Map configData = getConfig(topicName, 1); + + configData.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); + + final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; + final String testData2 = "Hello, Kafka Connect S3 Source! object 2"; + final String testData3 = "Hello, Kafka Connect S3 Source! 
object 3"; + + final List expectedKeys = new ArrayList<>(); + + final List actualKeys = new ArrayList<>(); + + // write 2 objects to s3 + expectedKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "0") + .substring((OBJECT_KEY + SEPARATOR).length())); + expectedKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "0") + .substring((OBJECT_KEY + SEPARATOR).length())); + + assertThat(testBucketAccessor.listObjects()).hasSize(2); + + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configData); + final AWSV2SourceClient sourceClient = new AWSV2SourceClient(s3SourceConfig); + final Iterator iter = sourceClient.getS3ObjectIterator(null); + + assertThat(iter).hasNext(); + S3Object object = iter.next(); + actualKeys.add(object.key()); + assertThat(iter).hasNext(); + object = iter.next(); + actualKeys.add(object.key()); + assertThat(iter).isExhausted(); + assertThat(actualKeys).containsAll(expectedKeys); + + // write 3rd object to s3 + expectedKeys.add(writeToS3(topicName, testData3.getBytes(StandardCharsets.UTF_8), "0") + .substring((OBJECT_KEY + SEPARATOR).length())); + assertThat(testBucketAccessor.listObjects()).hasSize(3); + + assertThat(iter).hasNext(); + object = iter.next(); + actualKeys.add(object.key()); + assertThat(iter).isExhausted(); + assertThat(actualKeys).containsAll(expectedKeys); + + } +} diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java new file mode 100644 index 000000000..d746405da --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/ConnectRunner.java @@ -0,0 +1,93 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source; + +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; + +import org.apache.kafka.connect.util.clusters.EmbeddedConnectCluster; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +final class ConnectRunner { + private static final Logger LOGGER = LoggerFactory.getLogger(ConnectRunner.class); + + private EmbeddedConnectCluster connectCluster; + + private final int offsetFlushIntervalMs; + + public ConnectRunner(final int offsetFlushIntervalMs) { + this.offsetFlushIntervalMs = offsetFlushIntervalMs; + } + + void startConnectCluster(final String connectorName, final int localPort, final int containerPort) { + + final Properties brokerProperties = new Properties(); + brokerProperties.put("advertised.listeners", "PLAINTEXT://localhost:" + localPort + + ",TESTCONTAINERS://host.testcontainers.internal:" + containerPort); + brokerProperties.put("listeners", + "PLAINTEXT://localhost:" + localPort + ",TESTCONTAINERS://localhost:" + containerPort); + brokerProperties.put("listener.security.protocol.map", "PLAINTEXT:PLAINTEXT,TESTCONTAINERS:PLAINTEXT"); + + connectCluster = new EmbeddedConnectCluster.Builder().name(connectorName) + .brokerProps(brokerProperties) + .workerProps(getWorkerProperties()) + .build(); + connectCluster.start(); + LOGGER.info("connectCluster started"); + } + + String getBootstrapServers() { + return connectCluster.kafka().bootstrapServers(); + } + + void deleteConnector(final String connectorName) { + connectCluster.deleteConnector(connectorName); + } + + void stopConnectCluster() { + // stop all Connect, Kafka and Zk threads. + if (connectCluster != null) { + connectCluster.stop(); + } + LOGGER.info("connectCluster stopped"); + } + + String configureConnector(final String connName, final Map connConfig) { + return connectCluster.configureConnector(connName, connConfig); + } + + private Map getWorkerProperties() { + final Map workerProps = new HashMap<>(); + + workerProps.put("offset.flush.interval.ms", Integer.toString(offsetFlushIntervalMs)); + + // These don't matter much (each connector sets its own converters), but need to be filled with valid classes. + workerProps.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + workerProps.put("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + workerProps.put("internal.key.converter", "org.apache.kafka.connect.json.JsonConverter"); + workerProps.put("internal.key.converter.schemas.enable", "true"); + workerProps.put("internal.value.converter", "org.apache.kafka.connect.json.JsonConverter"); + workerProps.put("internal.value.converter.schemas.enable", "true"); + + workerProps.put("plugin.discovery", "hybrid_warn"); + + return workerProps; + } +} diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java new file mode 100644 index 000000000..fa4f60b76 --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationBase.java @@ -0,0 +1,286 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source; + +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; +import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; +import static org.assertj.core.api.Assertions.assertThat; +import static org.awaitility.Awaitility.await; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.net.ServerSocket; +import java.net.URI; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.concurrent.ExecutionException; +import java.util.stream.Collectors; + +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.AdminClientConfig; +import org.apache.kafka.clients.admin.NewTopic; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; +import org.apache.kafka.common.serialization.Deserializer; +import org.apache.kafka.common.serialization.StringDeserializer; +import org.apache.kafka.connect.json.JsonDeserializer; + +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import io.confluent.kafka.serializers.KafkaAvroDeserializer; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumWriter; +import org.junit.jupiter.api.TestInfo; +import org.testcontainers.containers.Container; +import org.testcontainers.containers.localstack.LocalStackContainer; +import org.testcontainers.utility.DockerImageName; +import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; +import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; +import software.amazon.awssdk.core.sync.RequestBody; +import software.amazon.awssdk.regions.Region; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.PutObjectRequest; + +@SuppressWarnings("PMD.ExcessiveImports") +public interface IntegrationBase { + String PLUGINS_S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA = "plugins/s3-source-connector-for-apache-kafka/"; + String S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA_TEST = "s3-source-connector-for-apache-kafka-test-"; + ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + String TEST_BUCKET_NAME = "test-bucket0"; + String S3_ACCESS_KEY_ID = "test-key-id0"; + String VALUE_CONVERTER_KEY = "value.converter"; + String S3_SECRET_ACCESS_KEY = "test_secret_key0"; + + static byte[] generateNextAvroMessagesStartingFromId(final int messageId, final 
int noOfAvroRecs, + final Schema schema) throws IOException { + final DatumWriter datumWriter = new GenericDatumWriter<>(schema); + try (DataFileWriter dataFileWriter = new DataFileWriter<>(datumWriter); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) { + dataFileWriter.create(schema, outputStream); + for (int i = messageId; i < messageId + noOfAvroRecs; i++) { + final GenericRecord avroRecord = new GenericData.Record(schema); // NOPMD + avroRecord.put("message", "Hello, Kafka Connect S3 Source! object " + i); + avroRecord.put("id", i); + dataFileWriter.append(avroRecord); + } + + dataFileWriter.flush(); + return outputStream.toByteArray(); + } + } + + S3Client getS3Client(); + + /** + * Write file to s3 with the specified key and data. + * + * @param objectKey + * the key. + * @param testDataBytes + * the data. + */ + default void writeToS3WithKey(final String objectKey, final byte[] testDataBytes) { + final PutObjectRequest request = PutObjectRequest.builder() + .bucket(IntegrationTest.TEST_BUCKET_NAME) + .key(objectKey) + .build(); + getS3Client().putObject(request, RequestBody.fromBytes(testDataBytes)); + + } + + /** + * Writes to S3 using a key of the form {@code [prefix]topicName-partitionId-systemTime.txt}. + * + * @param topicName + * the topic name to use + * @param testDataBytes + * the data. + * @param partitionId + * the partition id. + * @return the key prefixed by {@link S3SourceTask#OBJECT_KEY} and + * {@link io.aiven.kafka.connect.s3.source.utils.OffsetManager#SEPARATOR} + */ + default String writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId) { + final String objectKey = topicName + "-" + partitionId + "-" + System.currentTimeMillis() + ".txt"; + writeToS3WithKey(objectKey, testDataBytes); + return OBJECT_KEY + SEPARATOR + objectKey; + } + + default AdminClient newAdminClient(final String bootstrapServers) { + final Properties adminClientConfig = new Properties(); + adminClientConfig.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); + return AdminClient.create(adminClientConfig); + } + + static void extractConnectorPlugin(Path pluginDir) throws IOException, InterruptedException { + final File distFile = new File(System.getProperty("integration-test.distribution.file.path")); + assertThat(distFile).exists(); + + final String cmd = String.format("tar -xf %s --strip-components=1 -C %s", distFile, pluginDir.toString()); + final Process process = Runtime.getRuntime().exec(cmd); + assert process.waitFor() == 0; + } + + static Path getPluginDir() throws IOException { + final Path testDir = Files.createTempDirectory(S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA_TEST); + return Files.createDirectories(testDir.resolve(PLUGINS_S3_SOURCE_CONNECTOR_FOR_APACHE_KAFKA)); + } + + static String topicName(final TestInfo testInfo) { + return testInfo.getTestMethod().get().getName(); + } + + static void createTopics(final AdminClient adminClient, final List topicNames) + throws ExecutionException, InterruptedException { + final var newTopics = topicNames.stream().map(s -> new NewTopic(s, 4, (short) 1)).collect(Collectors.toList()); + adminClient.createTopics(newTopics).all().get(); + } + + static void waitForRunningContainer(final Container container) { + await().atMost(Duration.ofMinutes(1)).until(container::isRunning); + } + + static S3Client createS3Client(final LocalStackContainer localStackContainer) { + return S3Client.builder() + .endpointOverride( + 
URI.create(localStackContainer.getEndpointOverride(LocalStackContainer.Service.S3).toString())) + .region(Region.of(localStackContainer.getRegion())) + .credentialsProvider(StaticCredentialsProvider.create(AwsBasicCredentials + .create(localStackContainer.getAccessKey(), localStackContainer.getSecretKey()))) + .build(); + } + + static LocalStackContainer createS3Container() { + return new LocalStackContainer(DockerImageName.parse("localstack/localstack:2.0.2")) + .withServices(LocalStackContainer.Service.S3); + } + + /** + * Finds 2 simultaneously free port for Kafka listeners + * + * @return list of 2 ports + * @throws IOException + * when port allocation failure happens + */ + static List getKafkaListenerPorts() throws IOException { + try (ServerSocket socket = new ServerSocket(0); ServerSocket socket2 = new ServerSocket(0)) { + return Arrays.asList(socket.getLocalPort(), socket2.getLocalPort()); + } catch (IOException e) { + throw new IOException("Failed to allocate port for test", e); + } + } + + static List consumeByteMessages(final String topic, final int expectedMessageCount, + String bootstrapServers) { + final Properties consumerProperties = getConsumerProperties(bootstrapServers, ByteArrayDeserializer.class, + ByteArrayDeserializer.class); + final List objects = consumeMessages(topic, expectedMessageCount, Duration.ofSeconds(60), + consumerProperties); + return objects.stream().map(String::new).collect(Collectors.toList()); + } + + static List consumeAvroMessages(final String topic, final int expectedMessageCount, + final Duration expectedMaxDuration, final String bootstrapServers, final String schemaRegistryUrl) { + final Properties consumerProperties = getConsumerProperties(bootstrapServers, StringDeserializer.class, + KafkaAvroDeserializer.class, schemaRegistryUrl); + return consumeMessages(topic, expectedMessageCount, expectedMaxDuration, consumerProperties); + } + + static List consumeJsonMessages(final String topic, final int expectedMessageCount, + final String bootstrapServers) { + final Properties consumerProperties = getConsumerProperties(bootstrapServers, StringDeserializer.class, + JsonDeserializer.class); + return consumeMessages(topic, expectedMessageCount, Duration.ofSeconds(60), consumerProperties); + } + + static List consumeMessages(final String topic, final int expectedMessageCount, + final Duration expectedMaxDuration, final Properties consumerProperties) { + try (KafkaConsumer consumer = new KafkaConsumer<>(consumerProperties)) { + consumer.subscribe(Collections.singletonList(topic)); + + final List recordValues = new ArrayList<>(); + await().atMost(expectedMaxDuration).pollInterval(Duration.ofSeconds(1)).untilAsserted(() -> { + assertThat(consumeRecordsInProgress(consumer, recordValues)).hasSize(expectedMessageCount); + }); + return recordValues; + } + } + + private static List consumeRecordsInProgress(KafkaConsumer consumer, List recordValues) { + int recordsRetrieved; + do { + final ConsumerRecords records = consumer.poll(Duration.ofMillis(500L)); + recordsRetrieved = records.count(); + for (final ConsumerRecord record : records) { + recordValues.add(record.value()); + } + // Choosing 10 records as it allows for integration tests with a smaller max poll to be added + // while maintaining efficiency, a slightly larger number could be added but this is slightly more efficient + // than larger numbers. 
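+ // In effect the loop keeps polling while a poll still returns a sizeable batch (more than 10 records), draining any backlog before the surrounding awaitility assertion re-checks the accumulated total.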
+ } while (recordsRetrieved > 10); + return recordValues; + } + + static Map consumeOffsetMessages(KafkaConsumer consumer) throws IOException { + // Poll messages from the topic + final Map messages = new HashMap<>(); + final ConsumerRecords records = consumer.poll(Duration.ofSeconds(1)); + for (final ConsumerRecord record : records) { + Map offsetRec = OBJECT_MAPPER.readValue(record.value(), new TypeReference<>() { // NOPMD + }); + messages.putAll(offsetRec); + } + return messages; + } + + static Properties getConsumerProperties(String bootstrapServers, + Class> keyDeserializer, Class> valueDeserializer, + String schemaRegistryUrl) { + final Properties props = getConsumerProperties(bootstrapServers, keyDeserializer, valueDeserializer); + props.put("specific.avro.reader", "false"); // Use GenericRecord instead of specific Avro classes + props.put("schema.registry.url", schemaRegistryUrl); // URL of the schema registry + return props; + } + + static Properties getConsumerProperties(String bootstrapServers, + Class> keyDeserializer, Class> valueDeserializer) { + final Properties props = new Properties(); + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); + props.put(ConsumerConfig.GROUP_ID_CONFIG, "test-consumer-group"); + props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, keyDeserializer.getName()); + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, valueDeserializer.getName()); + props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + return props; + } +} diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java new file mode 100644 index 000000000..387a6105d --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/IntegrationTest.java @@ -0,0 +1,412 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source; + +import static io.aiven.kafka.connect.common.config.CommonConfig.MAX_TASKS; +import static io.aiven.kafka.connect.common.config.FileNameFragment.FILE_NAME_TEMPLATE_CONFIG; +import static io.aiven.kafka.connect.common.config.FileNameFragment.FILE_PATH_PREFIX_TEMPLATE_CONFIG; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.AVRO_VALUE_SERIALIZER; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.VALUE_CONVERTER_SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.DISTRIBUTION_TYPE; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_PREFIX_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; +import static io.aiven.kafka.connect.s3.source.utils.OffsetManager.SEPARATOR; +import static java.util.Map.entry; +import static org.assertj.core.api.Assertions.assertThat; +import static org.awaitility.Awaitility.await; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; + +import io.aiven.kafka.connect.common.source.input.InputFormat; +import io.aiven.kafka.connect.common.source.task.DistributionType; +import io.aiven.kafka.connect.s3.source.testutils.BucketAccessor; +import io.aiven.kafka.connect.s3.source.testutils.ContentUtils; + +import com.fasterxml.jackson.databind.JsonNode; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.commons.lang3.StringUtils; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.localstack.LocalStackContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.PutObjectRequest; + +@Testcontainers 
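+// End-to-end tests: objects are written directly to a LocalStack-backed S3 bucket, the connector runs in an embedded Connect cluster, and the resulting records are consumed from Kafka, with Karapace acting as the schema registry.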
+@SuppressWarnings("PMD.ExcessiveImports") +final class IntegrationTest implements IntegrationBase { + + private static final Logger LOGGER = LoggerFactory.getLogger(IntegrationTest.class); + private static final String CONNECTOR_NAME = "aiven-s3-source-connector"; + private static final int OFFSET_FLUSH_INTERVAL_MS = 500; + + private static String s3Endpoint; + private static String s3Prefix; + private static BucketAccessor testBucketAccessor; + + @Container + public static final LocalStackContainer LOCALSTACK = IntegrationBase.createS3Container(); + private SchemaRegistryContainer schemaRegistry; + + private AdminClient adminClient; + private ConnectRunner connectRunner; + + private static S3Client s3Client; + private TestInfo testInfo; + + @Override + public S3Client getS3Client() { + return s3Client; + } + + public + + @BeforeAll static void setUpAll() throws IOException, InterruptedException { + s3Client = IntegrationBase.createS3Client(LOCALSTACK); + s3Endpoint = LOCALSTACK.getEndpoint().toString(); + testBucketAccessor = new BucketAccessor(s3Client, TEST_BUCKET_NAME); + + final Path pluginDir = IntegrationBase.getPluginDir(); + IntegrationBase.extractConnectorPlugin(pluginDir); + } + + @BeforeEach + void setUp(final TestInfo testInfo) throws Exception { + testBucketAccessor.createBucket(); + this.testInfo = testInfo; + + connectRunner = new ConnectRunner(OFFSET_FLUSH_INTERVAL_MS); + final List ports = IntegrationBase.getKafkaListenerPorts(); + final int localListenerPort = ports.get(0); + final int containerListenerPort = ports.get(1); + connectRunner.startConnectCluster(CONNECTOR_NAME, localListenerPort, containerListenerPort); + + adminClient = newAdminClient(connectRunner.getBootstrapServers()); + final String topicName = IntegrationBase.topicName(testInfo); + final var topics = List.of(topicName); + IntegrationBase.createTopics(adminClient, topics); + + // This should be done after the process listening the port is already started by host but + // before the container that will access it is started. + org.testcontainers.Testcontainers.exposeHostPorts(containerListenerPort); + schemaRegistry = new SchemaRegistryContainer("host.testcontainers.internal:" + containerListenerPort); + schemaRegistry.start(); + IntegrationBase.waitForRunningContainer(schemaRegistry); + } + + @AfterEach + void tearDown() { + adminClient.close(); + connectRunner.deleteConnector(CONNECTOR_NAME); + connectRunner.stopConnectCluster(); + schemaRegistry.stop(); + testBucketAccessor.removeBucket(); + } + + @ParameterizedTest + @ValueSource(booleans = { true, false }) + void bytesTest(final boolean addPrefix) { + final var topicName = IntegrationBase.topicName(testInfo); + final DistributionType distributionType; + final int partitionId = 0; + final String prefixPattern = "topics/{{topic}}/partition={{partition}}/"; + String s3Prefix = ""; + if (addPrefix) { + distributionType = DistributionType.PARTITION; + s3Prefix = "topics/" + topicName + "/partition=" + partitionId + "/"; + } else { + distributionType = DistributionType.PARTITION; + } + + final String fileNamePatternSeparator = "_"; + + final Map connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1, distributionType, addPrefix, + s3Prefix, prefixPattern, fileNamePatternSeparator); + + connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); + connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); + + final String testData1 = "Hello, Kafka Connect S3 Source! object 1"; + final String testData2 = "Hello, Kafka Connect S3 Source! 
object 2"; + + final List offsetKeys = new ArrayList<>(); + + // write 2 objects to s3 + offsetKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "0", s3Prefix, + fileNamePatternSeparator)); + offsetKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "0", s3Prefix, + fileNamePatternSeparator)); + offsetKeys.add(writeToS3(topicName, testData1.getBytes(StandardCharsets.UTF_8), "1", s3Prefix, + fileNamePatternSeparator)); + offsetKeys.add(writeToS3(topicName, testData2.getBytes(StandardCharsets.UTF_8), "1", s3Prefix, + fileNamePatternSeparator)); + offsetKeys.add(writeToS3(topicName, new byte[0], "3", s3Prefix, "-")); + + assertThat(testBucketAccessor.listObjects()).hasSize(5); + + // Poll messages from the Kafka topic and verify the consumed data + final List records = IntegrationBase.consumeByteMessages(topicName, 4, + connectRunner.getBootstrapServers()); + + // Verify that the correct data is read from the S3 bucket and pushed to Kafka + assertThat(records).containsOnly(testData1, testData2); + + // Verify offset positions + final Map expectedOffsetRecords = offsetKeys.subList(0, offsetKeys.size() - 1) + .stream() + .collect(Collectors.toMap(Function.identity(), s -> 1)); + verifyOffsetPositions(expectedOffsetRecords, connectRunner.getBootstrapServers()); + } + + @Test + void avroTest(final TestInfo testInfo) throws IOException { + final var topicName = IntegrationBase.topicName(testInfo); + final boolean addPrefix = false; + final Map connectorConfig = getAvroConfig(topicName, InputFormat.AVRO, addPrefix, "", "", + DistributionType.OBJECT_HASH); + + connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); + + // Define Avro schema + final String schemaJson = "{\n" + " \"type\": \"record\",\n" + " \"name\": \"TestRecord\",\n" + + " \"fields\": [\n" + " {\"name\": \"message\", \"type\": \"string\"},\n" + + " {\"name\": \"id\", \"type\": \"int\"}\n" + " ]\n" + "}"; + final Schema.Parser parser = new Schema.Parser(); + final Schema schema = parser.parse(schemaJson); + + final int numOfRecsFactor = 5000; + + final byte[] outputStream1 = IntegrationBase.generateNextAvroMessagesStartingFromId(1, numOfRecsFactor, schema); + final byte[] outputStream2 = IntegrationBase.generateNextAvroMessagesStartingFromId(numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream3 = IntegrationBase.generateNextAvroMessagesStartingFromId(2 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream4 = IntegrationBase.generateNextAvroMessagesStartingFromId(3 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + final byte[] outputStream5 = IntegrationBase.generateNextAvroMessagesStartingFromId(4 * numOfRecsFactor + 1, + numOfRecsFactor, schema); + + final Set offsetKeys = new HashSet<>(); + + final String s3Prefix = ""; + + offsetKeys.add(writeToS3(topicName, outputStream1, "1", s3Prefix, "-")); + offsetKeys.add(writeToS3(topicName, outputStream2, "1", s3Prefix, "-")); + + offsetKeys.add(writeToS3(topicName, outputStream3, "2", s3Prefix, "-")); + offsetKeys.add(writeToS3(topicName, outputStream4, "2", s3Prefix, "-")); + offsetKeys.add(writeToS3(topicName, outputStream5, "2", s3Prefix, "-")); + + assertThat(testBucketAccessor.listObjects()).hasSize(5); + + // Poll Avro messages from the Kafka topic and deserialize them + // Waiting for 25k kafka records in this test so a longer Duration is added. 
+ final List records = IntegrationBase.consumeAvroMessages(topicName, numOfRecsFactor * 5, + Duration.ofMinutes(3), connectRunner.getBootstrapServers(), schemaRegistry.getSchemaRegistryUrl()); + // Ensure this method deserializes Avro + + // Verify that the correct data is read from the S3 bucket and pushed to Kafka + assertThat(records).map(record -> entry(record.get("id"), String.valueOf(record.get("message")))) + .contains(entry(1, "Hello, Kafka Connect S3 Source! object 1"), + entry(2, "Hello, Kafka Connect S3 Source! object 2"), + entry(numOfRecsFactor, "Hello, Kafka Connect S3 Source! object " + numOfRecsFactor), + entry(2 * numOfRecsFactor, "Hello, Kafka Connect S3 Source! object " + (2 * numOfRecsFactor)), + entry(3 * numOfRecsFactor, "Hello, Kafka Connect S3 Source! object " + (3 * numOfRecsFactor)), + entry(4 * numOfRecsFactor, "Hello, Kafka Connect S3 Source! object " + (4 * numOfRecsFactor)), + entry(5 * numOfRecsFactor, "Hello, Kafka Connect S3 Source! object " + (5 * numOfRecsFactor))); + + verifyOffsetPositions(offsetKeys.stream().collect(Collectors.toMap(Function.identity(), s -> numOfRecsFactor)), + connectRunner.getBootstrapServers()); + } + + @ParameterizedTest + @ValueSource(booleans = { true, false }) + void parquetTest(final boolean addPrefix) throws IOException { + final var topicName = IntegrationBase.topicName(testInfo); + + final String partition = "0"; + final DistributionType distributionType; + final String prefixPattern = "bucket/topics/{{topic}}/partition/{{partition}}/"; + String s3Prefix = ""; + distributionType = DistributionType.PARTITION; + if (addPrefix) { + s3Prefix = "bucket/topics/" + topicName + "/partition/" + partition + "/"; + } + + final String fileName = (StringUtils.isNotBlank(s3Prefix) ? s3Prefix : "") + topicName + "-" + partition + "-" + + System.currentTimeMillis() + ".txt"; + final String name = "testuser"; + + final Map connectorConfig = getAvroConfig(topicName, InputFormat.PARQUET, addPrefix, s3Prefix, + prefixPattern, distributionType); + connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); + final Path path = ContentUtils.getTmpFilePath(name); + + try { + s3Client.putObject(PutObjectRequest.builder().bucket(TEST_BUCKET_NAME).key(fileName).build(), path); + } catch (final Exception e) { // NOPMD broad exception caught + LOGGER.error("Error in reading file {}", e.getMessage(), e); + } finally { + Files.delete(path); + } + + // Waiting for a small number of messages so using a smaller Duration of a minute + final List records = IntegrationBase.consumeAvroMessages(topicName, 100, Duration.ofSeconds(60), + connectRunner.getBootstrapServers(), schemaRegistry.getSchemaRegistryUrl()); + final List expectedRecordNames = IntStream.range(0, 100) + .mapToObj(i -> name + i) + .collect(Collectors.toList()); + assertThat(records).extracting(record -> record.get("name").toString()) + .containsExactlyInAnyOrderElementsOf(expectedRecordNames); + } + + private Map getAvroConfig(final String topicName, final InputFormat inputFormat, + final boolean addPrefix, final String s3Prefix, final String prefixPattern, + final DistributionType distributionType) { + final Map connectorConfig = getConfig(CONNECTOR_NAME, topicName, 4, distributionType, addPrefix, + s3Prefix, prefixPattern, "-"); + connectorConfig.put(INPUT_FORMAT_KEY, inputFormat.getValue()); + connectorConfig.put(SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); + connectorConfig.put(VALUE_CONVERTER_KEY, "io.confluent.connect.avro.AvroConverter"); + 
connectorConfig.put(VALUE_CONVERTER_SCHEMA_REGISTRY_URL, schemaRegistry.getSchemaRegistryUrl()); + connectorConfig.put(AVRO_VALUE_SERIALIZER, "io.confluent.kafka.serializers.KafkaAvroSerializer"); + return connectorConfig; + } + + @Test + void jsonTest(final TestInfo testInfo) { + final var topicName = IntegrationBase.topicName(testInfo); + final Map connectorConfig = getConfig(CONNECTOR_NAME, topicName, 1, DistributionType.PARTITION, + false, "", "", "-"); + connectorConfig.put(INPUT_FORMAT_KEY, InputFormat.JSONL.getValue()); + connectorConfig.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.json.JsonConverter"); + + connectRunner.configureConnector(CONNECTOR_NAME, connectorConfig); + final String testMessage = "This is a test "; + final StringBuilder jsonBuilder = new StringBuilder(); + for (int i = 0; i < 500; i++) { + final String jsonContent = "{\"message\": \"" + testMessage + "\", \"id\":\"" + i + "\"}"; + jsonBuilder.append(jsonContent).append("\n"); // NOPMD + } + final byte[] jsonBytes = jsonBuilder.toString().getBytes(StandardCharsets.UTF_8); + + final String offsetKey = writeToS3(topicName, jsonBytes, "1", "", "-"); + + // Poll Json messages from the Kafka topic and deserialize them + final List records = IntegrationBase.consumeJsonMessages(topicName, 500, + connectRunner.getBootstrapServers()); + + assertThat(records).map(jsonNode -> jsonNode.get("payload")).anySatisfy(jsonNode -> { + assertThat(jsonNode.get("message").asText()).contains(testMessage); + assertThat(jsonNode.get("id").asText()).contains("1"); + }); + + // Verify offset positions + verifyOffsetPositions(Map.of(offsetKey, 500), connectRunner.getBootstrapServers()); + } + + private Map getConfig(final String connectorName, final String topics, final int maxTasks, + final DistributionType taskDistributionConfig, final boolean addPrefix, final String s3Prefix, + final String prefixPattern, final String fileNameSeparator) { + final Map config = new HashMap<>(basicS3ConnectorConfig(addPrefix, s3Prefix)); + config.put("name", connectorName); + config.put(TARGET_TOPICS, topics); + config.put("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + config.put(VALUE_CONVERTER_KEY, "org.apache.kafka.connect.converters.ByteArrayConverter"); + config.put(MAX_TASKS, String.valueOf(maxTasks)); + config.put(DISTRIBUTION_TYPE, taskDistributionConfig.value()); + config.put(FILE_NAME_TEMPLATE_CONFIG, + "{{topic}}" + fileNameSeparator + "{{partition}}" + fileNameSeparator + "{{start_offset}}"); + if (addPrefix) { + config.put(FILE_PATH_PREFIX_TEMPLATE_CONFIG, prefixPattern); + } + return config; + } + + private static Map basicS3ConnectorConfig(final boolean addPrefix, final String s3Prefix) { + final Map config = new HashMap<>(); + config.put("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); + config.put(AWS_ACCESS_KEY_ID_CONFIG, S3_ACCESS_KEY_ID); + config.put(AWS_SECRET_ACCESS_KEY_CONFIG, S3_SECRET_ACCESS_KEY); + config.put(AWS_S3_ENDPOINT_CONFIG, s3Endpoint); + config.put(AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET_NAME); + if (addPrefix) { + config.put(AWS_S3_PREFIX_CONFIG, s3Prefix); + } + config.put(TARGET_TOPIC_PARTITIONS, "0,1"); + + return config; + } + + static void verifyOffsetPositions(final Map expectedRecords, final String bootstrapServers) { + final Properties consumerProperties = IntegrationBase.getConsumerProperties(bootstrapServers, + ByteArrayDeserializer.class, ByteArrayDeserializer.class); + + final Map offsetRecs = new HashMap<>(); + try (KafkaConsumer consumer = new 
KafkaConsumer<>(consumerProperties)) { + consumer.subscribe(Collections.singletonList("connect-offset-topic-" + CONNECTOR_NAME)); + await().atMost(Duration.ofMinutes(1)).pollInterval(Duration.ofSeconds(1)).untilAsserted(() -> { + offsetRecs.putAll(IntegrationBase.consumeOffsetMessages(consumer)); + assertThat(offsetRecs).containsExactlyInAnyOrderEntriesOf(expectedRecords); + }); + } + } + + String writeToS3(final String topicName, final byte[] testDataBytes, final String partitionId, + final String s3Prefix, final String separator) { + final String objectKey = (StringUtils.isNotBlank(s3Prefix) ? s3Prefix : "") + topicName + separator + + partitionId + separator + System.currentTimeMillis() + ".txt"; + writeToS3WithKey(objectKey, testDataBytes); + return OBJECT_KEY + SEPARATOR + objectKey; + } +} diff --git a/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java new file mode 100644 index 000000000..5e2e1201b --- /dev/null +++ b/s3-source-connector/src/integration-test/java/io/aiven/kafka/connect/s3/source/SchemaRegistryContainer.java @@ -0,0 +1,65 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source; + +import java.time.Duration; + +import com.github.dockerjava.api.model.Ulimit; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.wait.strategy.Wait; + +public final class SchemaRegistryContainer extends GenericContainer { + public static final int SCHEMA_REGISTRY_PORT = 8081; + + public SchemaRegistryContainer(final String bootstrapServer) { + this("4.1.0", bootstrapServer); + } + + public SchemaRegistryContainer(final String karapaceVersion, final String bootstrapServer) { + super("ghcr.io/aiven-open/karapace:" + karapaceVersion); + withAccessToHost(true); + withEnv("KARAPACE_ADVERTISED_HOSTNAME", "karapace-registry"); + withEnv("KARAPACE_BOOTSTRAP_URI", bootstrapServer); + withEnv("KARAPACE_PORT", String.valueOf(SCHEMA_REGISTRY_PORT)); + withEnv("KARAPACE_HOST", "0.0.0.0"); + withEnv("KARAPACE_CLIENT_ID", "karapace"); + withEnv("KARAPACE_GROUP_ID", "karapace-registry"); + withEnv("KARAPACE_MASTER_ELIGIBILITY", "true"); + withEnv("KARAPACE_TOPIC_NAME", "_schemas"); + withEnv("KARAPACE_LOG_LEVEL", "WARNING");// This can be set to DEBUG for more verbose logging + withEnv("KARAPACE_COMPATIBILITY", "FULL"); + withEnv("KARAPACE_KAFKA_SCHEMA_READER_STRICT_MODE", "false"); + withEnv("KARAPACE_KAFKA_RETRIABLE_ERRORS_SILENCED", "true"); + withExposedPorts(SCHEMA_REGISTRY_PORT); + withCommand("/bin/bash", "/opt/karapace/start.sh", "registry"); + + // When started, check any API to see if the service is ready, which also indicates that it is connected to the + // Kafka bootstrap server. 
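+ // Karapace's /_health endpoint reports "schema_registry_ready":true only once it has connected to the Kafka bootstrap server, which is what the response predicate below waits for.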
+ waitingFor(Wait.forHttp("/_health") + .forPort(8081) + .withReadTimeout(Duration.ofMinutes(1)) + .forResponsePredicate(response -> response.contains("\"schema_registry_ready\":true"))); + + withCreateContainerCmdModifier( + cmd -> cmd.getHostConfig().withUlimits(new Ulimit[] { new Ulimit("nofile", 30_000L, 30_000L) })); + } + + public String getSchemaRegistryUrl() { + return String.format("http://%s:%s", getHost(), getMappedPort(SCHEMA_REGISTRY_PORT)); + + } +} diff --git a/s3-source-connector/src/integration-test/resources/logback-test.xml b/s3-source-connector/src/integration-test/resources/logback-test.xml new file mode 100644 index 000000000..fd146afc4 --- /dev/null +++ b/s3-source-connector/src/integration-test/resources/logback-test.xml @@ -0,0 +1,17 @@ + + + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + + + \ No newline at end of file diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java new file mode 100644 index 000000000..18d0f0adb --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/AivenKafkaConnectS3SourceConnector.java @@ -0,0 +1,84 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source; + +import static io.aiven.kafka.connect.common.config.CommonConfig.TASK_ID; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.connect.connector.Task; +import org.apache.kafka.connect.source.SourceConnector; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.Version; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * AivenKafkaConnectS3SourceConnector is a Kafka Connect Connector implementation that watches a S3 bucket and generates + * tasks to ingest contents. 
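+ * One task configuration is generated per requested task, each carrying its own {@code task.id} property.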
+ */ +public class AivenKafkaConnectS3SourceConnector extends SourceConnector { + + private static final Logger LOGGER = LoggerFactory.getLogger(AivenKafkaConnectS3SourceConnector.class); + + private Map configProperties; + + @Override + public ConfigDef config() { + return S3SourceConfig.configDef(); + } + + @Override + public String version() { + return Version.VERSION; + } + + @Override + public Class taskClass() { + return S3SourceTask.class; + } + + @Override + public List> taskConfigs(final int maxTasks) { + final var taskProps = new ArrayList>(); + for (int i = 0; i < maxTasks; i++) { + final var props = new HashMap<>(configProperties); // NOPMD + props.put(TASK_ID, String.valueOf(i)); + taskProps.add(props); + } + return taskProps; + } + + @Override + public void start(final Map properties) { + Objects.requireNonNull(properties, "properties haven't been set"); + configProperties = Map.copyOf(properties); + LOGGER.info("Start S3 Source connector"); + } + + @Override + public void stop() { + LOGGER.info("Stop S3 Source connector"); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java new file mode 100644 index 000000000..5466435af --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/S3SourceTask.java @@ -0,0 +1,173 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source; + +import java.util.Iterator; +import java.util.Map; +import java.util.Objects; + +import org.apache.kafka.connect.source.SourceRecord; + +import io.aiven.kafka.connect.common.config.SourceCommonConfig; +import io.aiven.kafka.connect.common.source.AbstractSourceTask; +import io.aiven.kafka.connect.common.source.input.Transformer; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; +import io.aiven.kafka.connect.s3.source.utils.RecordProcessor; +import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; +import io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator; +import io.aiven.kafka.connect.s3.source.utils.Version; + +import org.apache.commons.collections4.IteratorUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import software.amazon.awssdk.core.exception.SdkException; + +/** + * S3SourceTask is a Kafka Connect SourceTask implementation that reads from source-s3 buckets and generates Kafka + * Connect records. 
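+ * Retryable {@code SdkException}s encountered while iterating are retried with a backoff; non-retryable exceptions are rethrown.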
+ */ +public class S3SourceTask extends AbstractSourceTask { + /** The logger to write to */ + private static final Logger LOGGER = LoggerFactory.getLogger(S3SourceTask.class); + + public static final String BUCKET = "bucket"; + public static final String TOPIC = "topic"; + + public static final String OBJECT_KEY = "object_key"; + public static final String PARTITION = "topicPartition"; + + /** An iterator of S3SourceRecords */ + private Iterator s3SourceRecordIterator; + /** + * The transformer that we are using TODO move this to AbstractSourceTask + */ + private Transformer transformer; + /** The AWS Source client */ + + private AWSV2SourceClient awsv2SourceClient; + /** The offset manager this task uses */ + private OffsetManager offsetManager; + private S3SourceConfig s3SourceConfig; + + public S3SourceTask() { + super(LOGGER); + } + + @Override + public String version() { + return Version.VERSION; + } + + @Override + protected Iterator getIterator(BackoffConfig config) { // NOPMD cognitive complexity + final Iterator inner = new Iterator<>() { + /** + * The backoff for Amazon retryable exceptions + */ + final Backoff backoff = new Backoff(config); + + @Override + public boolean hasNext() { + while (stillPolling()) { + try { + return s3SourceRecordIterator.hasNext(); + } catch (SdkException exception) { + if (exception.retryable()) { + LOGGER.warn("Retryable error encountered during polling. Waiting before retrying...", + exception); + try { + backoff.delay(); + } catch (InterruptedException e) { + LOGGER.warn("Backoff delay was interrupted. Throwing original exception: {}", + exception.getMessage()); + throw exception; + } + } else { + // TODO validate that the iterator does not lose an S3Object. Add test to + // S3ObjectIterator. + throw exception; + } + } + } + return false; + } + + @Override + public SourceRecord next() { + final S3SourceRecord s3SourceRecord = s3SourceRecordIterator.next(); + offsetManager.updateAndReturnCurrentOffsets(s3SourceRecord.getPartitionMap(), + s3SourceRecord.getObjectKey(), s3SourceRecord.getRecordNumber()); + return RecordProcessor.createSourceRecord(s3SourceRecord, s3SourceConfig, awsv2SourceClient, + offsetManager); + } + }; + return IteratorUtils.filteredIterator(inner, Objects::nonNull); + } + + @Override + protected SourceCommonConfig configure(final Map props) { + LOGGER.info("S3 Source task started."); + this.s3SourceConfig = new S3SourceConfig(props); + this.transformer = s3SourceConfig.getTransformer(); + offsetManager = new OffsetManager(context, s3SourceConfig); + awsv2SourceClient = new AWSV2SourceClient(s3SourceConfig); + setS3SourceRecordIterator( + new SourceRecordIterator(s3SourceConfig, offsetManager, this.transformer, awsv2SourceClient)); + return s3SourceConfig; + } + + @Override + public void commit() { + LOGGER.info("Committed all records through last poll()"); + } + + @Override + public void commitRecord(final SourceRecord record) { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("Committed individual record {}", (Map) record.sourceOffset()); + } + } + + /** + * Set the S3 source record iterator that this task is using. Protected to be overridden in testing implementation. + * + * @param iterator + * The S3SourceRecord iterator to use. + */ + protected void setS3SourceRecordIterator(final Iterator iterator) { + s3SourceRecordIterator = iterator; + } + + @Override + protected void closeResources() { + awsv2SourceClient.shutdown(); + } + + // below for visibility in tests + + /** + * Get the transformer that we are using.
+ * + * @return the transformer that we are using. + */ + public Transformer getTransformer() { + return transformer; + } + +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java new file mode 100644 index 000000000..13ff4d690 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3ClientFactory.java @@ -0,0 +1,66 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.config; + +import java.net.URI; +import java.time.Duration; +import java.util.Objects; +import java.util.Random; + +import io.aiven.kafka.connect.iam.AwsCredentialProviderFactory; + +import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration; +import software.amazon.awssdk.core.retry.RetryMode; +import software.amazon.awssdk.retries.api.internal.backoff.ExponentialDelayWithJitter; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.S3Configuration; + +public class S3ClientFactory { + + private final AwsCredentialProviderFactory credentialFactory = new AwsCredentialProviderFactory(); + + public S3Client createAmazonS3Client(final S3SourceConfig config) { + + final ExponentialDelayWithJitter backoffStrategy = new ExponentialDelayWithJitter(Random::new, + Duration.ofMillis(Math.toIntExact(config.getS3RetryBackoffDelayMs())), + Duration.ofMillis(Math.toIntExact(config.getS3RetryBackoffMaxDelayMs()))); + + final ClientOverrideConfiguration clientOverrideConfiguration = ClientOverrideConfiguration.builder() + .retryStrategy(RetryMode.STANDARD) + .build(); + if (Objects.isNull(config.getAwsS3EndPoint())) { + return S3Client.builder() + .overrideConfiguration(clientOverrideConfiguration) + .overrideConfiguration(o -> o.retryStrategy( + r -> r.backoffStrategy(backoffStrategy).maxAttempts(config.getS3RetryBackoffMaxRetries()))) + .region(config.getAwsS3Region()) + .credentialsProvider(credentialFactory.getAwsV2Provider(config.getS3ConfigFragment())) + .build(); + } else { + // TODO This is definitely used for testing but not sure if customers use it. 
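+ // An explicit endpoint override (e.g. LocalStack in the integration tests) is paired with path-style access below, since bucket-as-hostname addressing is generally not available on such endpoints.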
+ return S3Client.builder() + .overrideConfiguration(clientOverrideConfiguration) + .region(config.getAwsS3Region()) + .credentialsProvider(credentialFactory.getAwsV2Provider(config.getS3ConfigFragment())) + .endpointOverride(URI.create(config.getAwsS3EndPoint())) + .serviceConfiguration(S3Configuration.builder().pathStyleAccessEnabled(true).build()) + .build(); + } + + } + +} diff --git a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3SourceBaseConfig.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java similarity index 55% rename from s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3SourceBaseConfig.java rename to s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java index f1db8eddc..ebcffdba5 100644 --- a/s3-commons/src/main/java/io/aiven/kafka/connect/config/s3/S3SourceBaseConfig.java +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfig.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package io.aiven.kafka.connect.config.s3; +package io.aiven.kafka.connect.s3.source.config; import static io.aiven.kafka.connect.config.s3.S3CommonConfig.handleDeprecatedYyyyUppercase; @@ -22,27 +22,52 @@ import org.apache.kafka.common.config.ConfigDef; +import io.aiven.kafka.connect.common.config.FileNameFragment; +import io.aiven.kafka.connect.common.config.OutputFieldType; +import io.aiven.kafka.connect.common.config.OutputFormatFragment; +import io.aiven.kafka.connect.common.config.SchemaRegistryFragment; import io.aiven.kafka.connect.common.config.SourceCommonConfig; +import io.aiven.kafka.connect.common.config.SourceConfigFragment; +import io.aiven.kafka.connect.config.s3.S3ConfigFragment; import io.aiven.kafka.connect.iam.AwsStsEndpointConfig; import io.aiven.kafka.connect.iam.AwsStsRole; -import com.amazonaws.auth.AWSCredentialsProvider; -import com.amazonaws.auth.BasicAWSCredentials; -import com.amazonaws.client.builder.AwsClientBuilder; -import com.amazonaws.regions.Region; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -@SuppressWarnings({ "PMD.ExcessiveImports", "PMD.TooManyStaticImports" }) -public class S3SourceBaseConfig extends SourceCommonConfig { - public static final Logger LOGGER = LoggerFactory.getLogger(S3SourceBaseConfig.class); +import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; +import software.amazon.awssdk.regions.Region; + +final public class S3SourceConfig extends SourceCommonConfig { + + public static final Logger LOGGER = LoggerFactory.getLogger(S3SourceConfig.class); + private final S3ConfigFragment s3ConfigFragment; - protected S3SourceBaseConfig(ConfigDef definition, Map originals) { // NOPMD UnusedAssignment - super(definition, handleDeprecatedYyyyUppercase(originals)); + private final FileNameFragment s3FileNameFragment; + public S3SourceConfig(final Map properties) { + super(configDef(), handleDeprecatedYyyyUppercase(properties)); s3ConfigFragment = new S3ConfigFragment(this); - validate(); + s3FileNameFragment = new FileNameFragment(this); + validate(); // NOPMD ConstructorCallsOverridableMethod getStsRole is called + } + + public static ConfigDef configDef() { + + final var configDef = new S3SourceConfigDef(); + S3ConfigFragment.update(configDef); + SourceConfigFragment.update(configDef); + FileNameFragment.update(configDef); + SchemaRegistryFragment.update(configDef); + OutputFormatFragment.update(configDef, OutputFieldType.VALUE); + + return configDef; } private void 
validate() { + + // s3ConfigFragment is validated in this method as it is created here. + // Other Fragments created in the ConfigDef are validated in the parent classes their instances are created in. + // e.g. SourceConfigFragment, FileNameFragment, SchemaRegistryFragment and OutputFormatFragment are all + // validated in SourceCommonConfig. s3ConfigFragment.validate(); } @@ -62,12 +87,8 @@ public AwsStsEndpointConfig getStsEndpointConfig() { return s3ConfigFragment.getStsEndpointConfig(); } - public AwsClientBuilder.EndpointConfiguration getAwsEndpointConfiguration() { - return s3ConfigFragment.getAwsEndpointConfiguration(); - } - - public BasicAWSCredentials getAwsCredentials() { - return s3ConfigFragment.getAwsCredentials(); + public AwsBasicCredentials getAwsCredentials() { + return s3ConfigFragment.getAwsCredentialsV2(); } public String getAwsS3EndPoint() { @@ -75,7 +96,7 @@ public String getAwsS3EndPoint() { } public Region getAwsS3Region() { - return s3ConfigFragment.getAwsS3Region(); + return s3ConfigFragment.getAwsS3RegionV2(); } public String getAwsS3BucketName() { @@ -106,8 +127,12 @@ public int getS3RetryBackoffMaxRetries() { return s3ConfigFragment.getS3RetryBackoffMaxRetries(); } - public AWSCredentialsProvider getCustomCredentialsProvider() { - return s3ConfigFragment.getCustomCredentialsProvider(); + public S3ConfigFragment getS3ConfigFragment() { + return s3ConfigFragment; + } + + public FileNameFragment getS3FileNameFragment() { + return s3FileNameFragment; } } diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java new file mode 100644 index 000000000..e823f94a9 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigDef.java @@ -0,0 +1,32 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.config; + +import static io.aiven.kafka.connect.config.s3.S3CommonConfig.handleDeprecatedYyyyUppercase; + +import java.util.List; +import java.util.Map; + +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.common.config.ConfigValue; + +public class S3SourceConfigDef extends ConfigDef { + @Override + public List validate(final Map props) { + return super.validate(handleDeprecatedYyyyUppercase(props)); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java new file mode 100644 index 000000000..d9dbc0d45 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java @@ -0,0 +1,174 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.utils; + +import java.io.InputStream; +import java.util.Iterator; +import java.util.Objects; +import java.util.function.Predicate; +import java.util.stream.Stream; + +import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import org.apache.commons.io.function.IOSupplier; +import org.apache.commons.lang3.StringUtils; +import software.amazon.awssdk.core.ResponseBytes; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.GetObjectRequest; +import software.amazon.awssdk.services.s3.model.GetObjectResponse; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; +import software.amazon.awssdk.services.s3.model.S3Object; + +/** + * Called AWSV2SourceClient as this source client implements the V2 version of the aws client library. Handles all calls + * and authentication to AWS and returns useable objects to the SourceRecordIterator. + */ +public class AWSV2SourceClient { + + public static final int PAGE_SIZE_FACTOR = 2; + private final S3SourceConfig s3SourceConfig; + private final S3Client s3Client; + private final String bucketName; + + private Predicate filterPredicate = s3Object -> s3Object.size() > 0; + + /** + * @param s3SourceConfig + * configuration for Source connector + */ + public AWSV2SourceClient(final S3SourceConfig s3SourceConfig) { + this(new S3ClientFactory().createAmazonS3Client(s3SourceConfig), s3SourceConfig); + } + + /** + * Valid for testing + * + * @param s3Client + * amazonS3Client + * @param s3SourceConfig + * configuration for Source connector + */ + AWSV2SourceClient(final S3Client s3Client, final S3SourceConfig s3SourceConfig) { + this.s3SourceConfig = s3SourceConfig; + this.s3Client = s3Client; + this.bucketName = s3SourceConfig.getAwsS3BucketName(); + } + + /** + * Creates a stream from which we will create an iterator. + * + * @param startToken + * the beginning key, or {@code null} to start at the beginning. + * @return a Stream of S3Objects for the current state of the S3 storage. + */ + private Stream getS3ObjectStream(final String startToken) { + final ListObjectsV2Request request = ListObjectsV2Request.builder() + .bucket(bucketName) + .maxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR) + .prefix(StringUtils.defaultIfBlank(s3SourceConfig.getAwsS3Prefix(), null)) + .startAfter(StringUtils.defaultIfBlank(startToken, null)) + .build(); + + return Stream.iterate(s3Client.listObjectsV2(request), Objects::nonNull, response -> { + // This is called every time next() is called on the iterator. 
+ if (response.isTruncated()) { + return s3Client.listObjectsV2(ListObjectsV2Request.builder() + .maxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR) + .continuationToken(response.nextContinuationToken()) + .build()); + } else { + return null; + } + + }).flatMap(response -> response.contents().stream().filter(filterPredicate)); + } + + /** + * Creates an S3Object iterator that will return the objects from the current objects in S3 storage and then try to + * refresh on every {@code hasNext()} that returns false. This should pick up new files as they are dropped on the + * file system. + * + * @param startToken + * the beginning key, or {@code null} to start at the beginning. + * @return an Iterator on the S3Objects. + */ + public Iterator getS3ObjectIterator(final String startToken) { + return new S3ObjectIterator(startToken); + } + + /** + * Gets an iterator of keys from the current S3 storage. + * + * @param startToken + * the beginning key, or {@code null} to start at the beginning. + * @return an Iterator on the keys of the current S3Objects. + */ + public Iterator getListOfObjectKeys(final String startToken) { + return getS3ObjectStream(startToken).map(S3Object::key).iterator(); + } + + public IOSupplier getObject(final String objectKey) { + final GetObjectRequest getObjectRequest = GetObjectRequest.builder().bucket(bucketName).key(objectKey).build(); + final ResponseBytes s3ObjectResponse = s3Client.getObjectAsBytes(getObjectRequest); + return s3ObjectResponse::asInputStream; + } + + public void shutdown() { + s3Client.close(); + } + + public void addPredicate(final Predicate objectPredicate) { + this.filterPredicate = this.filterPredicate.and(objectPredicate); + } + + /** + * An iterator that reads from + */ + public class S3ObjectIterator implements Iterator { + + /** The current iterator. */ + private Iterator inner; + /** The last object key that was seen. */ + private String lastSeenObjectKey; + + private S3ObjectIterator(final String initialKey) { + lastSeenObjectKey = initialKey; + inner = getS3ObjectStream(lastSeenObjectKey).iterator(); + } + @Override + public boolean hasNext() { + if (!inner.hasNext()) { + inner = getS3ObjectStream(lastSeenObjectKey).iterator(); + } + return inner.hasNext(); + } + + @Override + public S3Object next() { + final S3Object result = inner.next(); + lastSeenObjectKey = result.key(); + return result; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + } + +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java new file mode 100644 index 000000000..6c60bb8ed --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/ConnectUtils.java @@ -0,0 +1,39 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
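// Illustrative sketch (not part of this change set): using the public surface of AWSV2SourceClient
// just added above. The ".bin" suffix filter and the single-arg constructor are placeholders; added
// predicates are AND-ed with the built-in "size > 0" filter, and a null start token means "start at
// the beginning of the bucket".
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;

import io.aiven.kafka.connect.s3.source.config.S3SourceConfig;
import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient;

public class SourceClientSketch {
    public static void listAndRead(final S3SourceConfig config) throws IOException {
        final AWSV2SourceClient sourceClient = new AWSV2SourceClient(config);
        sourceClient.addPredicate(s3Object -> s3Object.key().endsWith(".bin"));

        final Iterator<String> keys = sourceClient.getListOfObjectKeys(null);
        while (keys.hasNext()) {
            final String key = keys.next();
            // getObject returns an IOSupplier over the object's bytes.
            try (InputStream in = sourceClient.getObject(key).get()) {
                System.out.println(key + ": " + in.readAllBytes().length + " bytes");
            }
        }
        sourceClient.shutdown();
    }
}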
+ */ + +package io.aiven.kafka.connect.s3.source.utils; + +import static io.aiven.kafka.connect.s3.source.S3SourceTask.BUCKET; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.PARTITION; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.TOPIC; + +import java.util.HashMap; +import java.util.Map; + +final public class ConnectUtils { + + private ConnectUtils() { + // hidden + } + public static Map getPartitionMap(final String topicName, final Integer defaultPartitionId, + final String bucketName) { + final Map partitionMap = new HashMap<>(); + partitionMap.put(BUCKET, bucketName); + partitionMap.put(TOPIC, topicName); + partitionMap.put(PARTITION, defaultPartitionId); + return partitionMap; + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java new file mode 100644 index 000000000..95bc4053d --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/OffsetManager.java @@ -0,0 +1,124 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.utils; + +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; +import static java.util.stream.Collectors.toMap; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Hashtable; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.kafka.connect.source.SourceTaskContext; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class OffsetManager { + + private static final Logger LOGGER = LoggerFactory.getLogger(OffsetManager.class); + public static final String SEPARATOR = "_"; + private final Map, Map> offsets; + + public OffsetManager(final SourceTaskContext context, final S3SourceConfig s3SourceConfig) { + final String s3Bucket = s3SourceConfig.getAwsS3BucketName(); + final Set partitions = parsePartitions(s3SourceConfig); + final Set topics = parseTopics(s3SourceConfig); + + // Build the partition keys and fetch offsets from offset storage + final List> partitionKeys = buildPartitionKeys(s3Bucket, partitions, topics); + final Map, Map> offsetMap = context.offsetStorageReader() + .offsets(partitionKeys); + + LOGGER.info(" ********** offsetMap ***** {}", offsetMap); + this.offsets = offsetMap.entrySet() + .stream() + .filter(e -> e.getValue() != null) + .collect(toMap(entry -> new HashMap<>(entry.getKey()), entry -> new HashMap<>(entry.getValue()))); + LOGGER.info(" ********** offsets ***** {}", offsets); + } + + public Map, Map> getOffsets() { + return Collections.unmodifiableMap(offsets); + } + + public long incrementAndUpdateOffsetMap(final Map partitionMap, final String currentObjectKey, + final long startOffset) { + 
if (offsets.containsKey(partitionMap)) { + final Map offsetValue = new HashMap<>(offsets.get(partitionMap)); + if (offsetValue.containsKey(getObjectMapKey(currentObjectKey))) { + final long newOffsetVal = (long) offsetValue.get(getObjectMapKey(currentObjectKey)) + 1L; + offsetValue.put(getObjectMapKey(currentObjectKey), newOffsetVal); + offsets.put(partitionMap, offsetValue); + return newOffsetVal; + } else { + offsetValue.put(getObjectMapKey(currentObjectKey), startOffset); + offsets.put(partitionMap, offsetValue); + return startOffset; + } + } + return startOffset; + } + + public Map updateAndReturnCurrentOffsets(final Map partitionMap, + final String currentObjectKey, final long offset) { + final Map offsetMap = offsets.compute(partitionMap, (k, v) -> { + final Map map = v == null ? new Hashtable<>() : v; + map.put(getObjectMapKey(currentObjectKey), offset); + return map; + }); + return new HashMap<>(offsetMap); + } + + public static String getObjectMapKey(final String currentObjectKey) { + return OBJECT_KEY + SEPARATOR + currentObjectKey; + } + + public long recordsProcessedForObjectKey(final Map partitionMap, final String currentObjectKey) { + if (offsets.containsKey(partitionMap)) { + return (long) offsets.get(partitionMap).getOrDefault(getObjectMapKey(currentObjectKey), 0L); + } + return 0L; + } + + private static Set parsePartitions(final S3SourceConfig s3SourceConfig) { + final String partitionString = s3SourceConfig.getTargetTopicPartitions(); + return Arrays.stream(partitionString.split(",")).map(Integer::parseInt).collect(Collectors.toSet()); + } + + private static Set parseTopics(final S3SourceConfig s3SourceConfig) { + final String topicString = s3SourceConfig.getTargetTopics(); + return Arrays.stream(topicString.split(",")).collect(Collectors.toSet()); + } + + private static List> buildPartitionKeys(final String bucket, final Set partitions, + final Set topics) { + final List> partitionKeys = new ArrayList<>(); + partitions.forEach(partition -> topics.forEach(topic -> { + partitionKeys.add(ConnectUtils.getPartitionMap(topic, partition, bucket)); + })); + return partitionKeys; + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java new file mode 100644 index 000000000..cab511693 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessor.java @@ -0,0 +1,52 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
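// Illustrative sketch (not part of this change set): the two key conventions OffsetManager relies
// on. The source partition is the (bucket, topic, partition) map built by ConnectUtils, and each
// S3 object key is tracked inside that partition under an "object key" entry. The topic, partition
// and object key values below are placeholders.
import java.util.Map;

import io.aiven.kafka.connect.s3.source.utils.ConnectUtils;
import io.aiven.kafka.connect.s3.source.utils.OffsetManager;

public class OffsetKeySketch {
    public static void main(final String[] args) {
        // One source partition per (bucket, topic, partition) combination.
        final Map<String, Object> sourcePartition = ConnectUtils.getPartitionMap("testtopic", 0, "test-bucket");
        // One offset entry per S3 object key within that partition.
        final String offsetKey = OffsetManager.getObjectMapKey("topics/testtopic/partition=0/file0.bin");
        System.out.println(sourcePartition + " -> " + offsetKey);
    }
}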
+ */ + +package io.aiven.kafka.connect.s3.source.utils; + +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.errors.DataException; +import org.apache.kafka.connect.source.SourceRecord; + +import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public final class RecordProcessor { + + private static final Logger LOGGER = LoggerFactory.getLogger(RecordProcessor.class); + + private RecordProcessor() { + } + + public static SourceRecord createSourceRecord(final S3SourceRecord s3SourceRecord, + final S3SourceConfig s3SourceConfig, final AWSV2SourceClient sourceClient, + final OffsetManager offsetManager) { + try { + return s3SourceRecord.getSourceRecord(offsetManager); + } catch (DataException e) { + if (ErrorsTolerance.NONE.equals(s3SourceConfig.getErrorsTolerance())) { + throw new ConnectException("Data Exception caught during S3 record to source record transformation", e); + } else { + LOGGER.warn( + "Data Exception caught during S3 record to source record transformation {} . errors.tolerance set to 'all', logging warning and continuing to process.", + e.getMessage(), e); + return null; + } + } + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java new file mode 100644 index 000000000..05ca02ba4 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/S3SourceRecord.java @@ -0,0 +1,83 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
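// Illustrative sketch (not part of this change set): one way a task-level poll loop could consume
// S3SourceRecords through RecordProcessor.createSourceRecord and honour errors.tolerance. The real
// caller is the source task, which is not shown in this hunk; the method and parameter names here
// other than createSourceRecord are placeholders.
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.kafka.connect.source.SourceRecord;

import io.aiven.kafka.connect.s3.source.config.S3SourceConfig;
import io.aiven.kafka.connect.s3.source.utils.AWSV2SourceClient;
import io.aiven.kafka.connect.s3.source.utils.OffsetManager;
import io.aiven.kafka.connect.s3.source.utils.RecordProcessor;
import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord;

public class PollLoopSketch {
    static List<SourceRecord> drain(final Iterator<S3SourceRecord> iterator, final S3SourceConfig config,
            final AWSV2SourceClient sourceClient, final OffsetManager offsetManager, final int maxRecords) {
        final List<SourceRecord> results = new ArrayList<>();
        while (iterator.hasNext() && results.size() < maxRecords) {
            final SourceRecord record = RecordProcessor.createSourceRecord(iterator.next(), config, sourceClient,
                    offsetManager);
            if (record != null) { // null means the record was skipped under errors.tolerance=all
                results.add(record);
            }
        }
        return results;
    }
}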
+ */ + +package io.aiven.kafka.connect.s3.source.utils; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.source.SourceRecord; + +public class S3SourceRecord { + private final Map partitionMap; + private final long recordNumber; + private final String topic; + private final Integer topicPartition; + private final SchemaAndValue keyData; + + private final SchemaAndValue valueData; + + private final String objectKey; + + public S3SourceRecord(final Map partitionMap, final long recordNumber, final String topic, + final Integer topicPartition, final String objectKey, final SchemaAndValue keyData, + final SchemaAndValue valueData) { + this.partitionMap = new HashMap<>(partitionMap); + this.recordNumber = recordNumber; + this.topic = topic; + this.topicPartition = topicPartition; + this.keyData = keyData; + this.valueData = valueData; + this.objectKey = objectKey; + } + + public Map getPartitionMap() { + return Collections.unmodifiableMap(partitionMap); + } + + public long getRecordNumber() { + return recordNumber; + } + + public String getTopic() { + return topic; + } + + public Integer partition() { + return topicPartition; + } + + public String getObjectKey() { + return objectKey; + } + + public SchemaAndValue getKey() { + return new SchemaAndValue(keyData.schema(), keyData.value()); + } + + public SchemaAndValue getValue() { + return new SchemaAndValue(valueData.schema(), valueData.value()); + } + + public SourceRecord getSourceRecord(final OffsetManager offsetManager) { + final Map offsetMap = offsetManager.updateAndReturnCurrentOffsets(getPartitionMap(), + getObjectKey(), getRecordNumber()); + return new SourceRecord(getPartitionMap(), offsetMap, topic, partition(), keyData.schema(), keyData.value(), + valueData.schema(), valueData.value()); + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java new file mode 100644 index 000000000..2eb31fff2 --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIterator.java @@ -0,0 +1,205 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
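// Illustrative sketch (not part of this change set): building an S3SourceRecord by hand, mirroring
// what the unit tests later in this patch do. Topic, partition, bucket and object key are
// placeholder values; getSourceRecord(offsetManager) would then turn this into a Kafka Connect
// SourceRecord with the partition map and offset map shown above.
import java.nio.charset.StandardCharsets;

import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaAndValue;

import io.aiven.kafka.connect.s3.source.utils.ConnectUtils;
import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord;

public class S3SourceRecordSketch {
    public static S3SourceRecord sample() {
        return new S3SourceRecord(ConnectUtils.getPartitionMap("testtopic", 0, "test-bucket"), 1L, "testtopic", 0,
                "topics/testtopic/partition=0/file0.bin",
                new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, "key".getBytes(StandardCharsets.UTF_8)),
                new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, "value".getBytes(StandardCharsets.UTF_8)));
    }
}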
+ */ + +package io.aiven.kafka.connect.s3.source.utils; + +import java.util.Collections; +import java.util.Iterator; +import java.util.Map; +import java.util.Optional; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.stream.Stream; + +import org.apache.kafka.connect.data.SchemaAndValue; + +import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; +import io.aiven.kafka.connect.common.source.input.Transformer; +import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; +import io.aiven.kafka.connect.common.source.task.Context; +import io.aiven.kafka.connect.common.source.task.DistributionStrategy; +import io.aiven.kafka.connect.common.source.task.DistributionType; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import software.amazon.awssdk.services.s3.model.S3Object; + +/** + * Iterator that processes S3 files and creates Kafka source records. Supports different output formats (Avro, JSON, + * Parquet). + */ +public final class SourceRecordIterator implements Iterator { + public static final long BYTES_TRANSFORMATION_NUM_OF_RECS = 1L; + private static final Logger LOGGER = LoggerFactory.getLogger(SourceRecordIterator.class); + + private final OffsetManager offsetManager; + + private final S3SourceConfig s3SourceConfig; + private final String bucketName; + + private final Transformer transformer; + // Once we decouple the S3Object from the Source Iterator we can change this to be the SourceApiClient + // At which point it will work for al our integrations. + private final AWSV2SourceClient sourceClient; + + private Context context; + + private final DistributionStrategy distributionStrategy; + private int taskId; + + private final Iterator inner; + + private Iterator outer; + private FilePatternUtils filePattern; + private final Optional targetTopics; + + public SourceRecordIterator(final S3SourceConfig s3SourceConfig, final OffsetManager offsetManager, + final Transformer transformer, final AWSV2SourceClient sourceClient) { + super(); + this.s3SourceConfig = s3SourceConfig; + this.offsetManager = offsetManager; + + this.bucketName = s3SourceConfig.getAwsS3BucketName(); + this.transformer = transformer; + this.sourceClient = sourceClient; + this.targetTopics = Optional.ofNullable(s3SourceConfig.getTargetTopics()); + this.distributionStrategy = initializeDistributionStrategy(); + + // Initialize predicates + sourceClient.addPredicate(this::isFileMatchingPattern); + sourceClient.addPredicate(obj -> isFileAssignedToTask(context, taskId)); + + // call filters out bad file names and extracts topic/partition + inner = sourceClient.getS3ObjectIterator(null); + outer = Collections.emptyIterator(); + } + + public boolean isFileMatchingPattern(final S3Object s3Object) { + final Optional> optionalCtx = filePattern.process(s3Object.key()); + if (optionalCtx.isPresent()) { + context = optionalCtx.get(); + return true; + } + return false; + } + + public boolean isFileAssignedToTask(final Context ctx, final int taskId) { + return taskId == distributionStrategy.getTaskFor(ctx); + } + + @Override + public boolean hasNext() { + while (!outer.hasNext() && inner.hasNext()) { + outer = convert(inner.next()).iterator(); + } + return outer.hasNext(); + } + + @Override + public S3SourceRecord next() { + return outer.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("This iterator is unmodifiable"); + } + + /** + * Converts 
the S3Object into stream of S3SourceRecords. + * + * @param s3Object + * the S3Object to read data from. + * @return a stream of S3SourceRecords created from the input stream of the S3Object. + */ + private Stream convert(final S3Object s3Object) { + // Set the target topic in the context if it has been set from configuration. + if (targetTopics.isPresent()) { + overrideContextTopic(); + } + final Map partitionMap = ConnectUtils.getPartitionMap(context.getTopic().get(), + context.getPartition().get(), bucketName); + final long recordCount = offsetManager.recordsProcessedForObjectKey(partitionMap, s3Object.key()); + + // Optimizing without reading stream again. + if (transformer instanceof ByteArrayTransformer && recordCount > 0) { + return Stream.empty(); + } + + final SchemaAndValue keyData = transformer.getKeyData(s3Object.key(), context.getTopic().get(), s3SourceConfig); + + return transformer + .getRecords(sourceClient.getObject(s3Object.key()), context.getTopic().get(), + context.getPartition().get(), s3SourceConfig, recordCount) + .map(new Mapper(partitionMap, recordCount, keyData, s3Object.key())); + } + + private Consumer overrideContextTopic() { + if (context.getTopic().isPresent()) { + LOGGER.debug( + "Overriding topic '{}' extracted from S3 Object Key with topic '{}' from configuration 'topics'. ", + context.getTopic().get(), targetTopics.get()); + } + return context::setTopic; + } + + private DistributionStrategy initializeDistributionStrategy() { + final DistributionType distributionType = s3SourceConfig.getDistributionType(); + final int maxTasks = s3SourceConfig.getMaxTasks(); + this.taskId = s3SourceConfig.getTaskId() % maxTasks; + this.filePattern = new FilePatternUtils( + s3SourceConfig.getS3FileNameFragment().getFilenameTemplate().toString()); + return distributionType.getDistributionStrategy(maxTasks); + } + + /** + * maps the data from the @{link Transformer} stream to an S3SourceRecord given all the additional data required. + */ + class Mapper implements Function { + /** + * The partition map + */ + private final Map partitionMap; + /** + * The record number for the record being created. + */ + private long recordCount; + /** + * The schema and value for the key + */ + private final SchemaAndValue keyData; + /** + * The object key from S3 + */ + private final String objectKey; + + public Mapper(final Map partitionMap, final long recordCount, final SchemaAndValue keyData, + final String objectKey) { + this.partitionMap = partitionMap; + this.recordCount = recordCount; + this.keyData = keyData; + this.objectKey = objectKey; + } + + @Override + public S3SourceRecord apply(final SchemaAndValue valueData) { + recordCount++; + return new S3SourceRecord(partitionMap, recordCount, context.getTopic().get(), context.getPartition().get(), + objectKey, keyData, valueData); + } + } +} diff --git a/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/Version.java b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/Version.java new file mode 100644 index 000000000..1d4dcb33d --- /dev/null +++ b/s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/Version.java @@ -0,0 +1,43 @@ +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
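// Illustrative sketch (not part of this change set): a self-contained, generic version of the
// outer/inner iterator flattening used by SourceRecordIterator.hasNext() above, where an iterator
// of containers (S3 objects) is lazily flattened into an iterator of items (records).
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

public class FlattenPatternSketch {
    public static void main(final String[] args) {
        final Iterator<List<String>> inner = List.of(List.of("a", "b"), List.<String>of(), List.of("c")).iterator();
        final Iterator<String> flattened = new Iterator<>() {
            private Iterator<String> outer = Collections.emptyIterator();

            @Override
            public boolean hasNext() {
                // Advance until the outer iterator has elements or the inner one is exhausted.
                while (!outer.hasNext() && inner.hasNext()) {
                    outer = inner.next().iterator();
                }
                return outer.hasNext();
            }

            @Override
            public String next() {
                if (!hasNext()) {
                    throw new NoSuchElementException();
                }
                return outer.next();
            }
        };
        flattened.forEachRemaining(System.out::println); // prints a, b, c
    }
}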
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.utils; + +import java.io.InputStream; +import java.util.Properties; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public final class Version { + private static final Logger LOGGER = LoggerFactory.getLogger(Version.class); + + private static final String PROPERTIES_FILENAME = "s3-source-connector-for-apache-kafka-version.properties"; + + public static final String VERSION; // NOPMD AvoidFieldNameMatchingTypeName + + static { + final Properties props = new Properties(); + try (InputStream resourceStream = Thread.currentThread() + .getContextClassLoader() + .getResourceAsStream(PROPERTIES_FILENAME)) { + props.load(resourceStream); + } catch (final Exception e) { // NOPMD AvoidCatchingGenericException + LOGGER.warn("Error while loading {}: {}", PROPERTIES_FILENAME, e.getMessage()); + } + VERSION = props.getProperty("version", "unknown").trim(); + } +} diff --git a/s3-source-connector/src/main/resources/META-INF/services/org.apache.kafka.connect.source.SourceConnector b/s3-source-connector/src/main/resources/META-INF/services/org.apache.kafka.connect.source.SourceConnector new file mode 100644 index 000000000..46c0eaf4f --- /dev/null +++ b/s3-source-connector/src/main/resources/META-INF/services/org.apache.kafka.connect.source.SourceConnector @@ -0,0 +1 @@ +io.aiven.kafka.connect.s3.source.AivenKafkaConnectS3SourceConnector diff --git a/s3-source-connector/src/main/resources/s3-source-connector-for-apache-kafka-version.properties b/s3-source-connector/src/main/resources/s3-source-connector-for-apache-kafka-version.properties new file mode 100644 index 000000000..9c2421c8a --- /dev/null +++ b/s3-source-connector/src/main/resources/s3-source-connector-for-apache-kafka-version.properties @@ -0,0 +1,16 @@ +## +# Copyright 2024 Aiven Oy +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +## +version=${version ?: 'unknown'} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java new file mode 100644 index 000000000..e7b958ab3 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/S3SourceTaskTest.java @@ -0,0 +1,410 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
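// Illustrative sketch (not part of this change set): what the META-INF/services entry below
// enables. Providers listed there are discoverable through the standard Java ServiceLoader
// mechanism (the connector needs a public no-arg constructor), which newer Connect runtimes can
// use for plugin discovery; this snippet simply lists whatever source connectors are on the
// classpath.
import java.util.ServiceLoader;

import org.apache.kafka.connect.source.SourceConnector;

public class ListSourceConnectors {
    public static void main(final String[] args) {
        for (final SourceConnector connector : ServiceLoader.load(SourceConnector.class)) {
            System.out.println(connector.getClass().getName() + " " + connector.version());
        }
    }
}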
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source; + +import static io.aiven.kafka.connect.common.config.CommonConfig.MAX_TASKS; +import static io.aiven.kafka.connect.common.config.CommonConfig.TASK_ID; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; + +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.source.SourceRecord; +import org.apache.kafka.connect.source.SourceTaskContext; +import org.apache.kafka.connect.storage.OffsetStorageReader; + +import io.aiven.kafka.connect.common.config.SourceConfigFragment; +import io.aiven.kafka.connect.common.source.AbstractSourceTask; +import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; +import io.aiven.kafka.connect.common.source.input.InputFormat; +import io.aiven.kafka.connect.config.s3.S3ConfigFragment; +import io.aiven.kafka.connect.iam.AwsCredentialProviderFactory; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; +import io.aiven.kafka.connect.s3.source.utils.ConnectUtils; +import io.aiven.kafka.connect.s3.source.utils.OffsetManager; +import io.aiven.kafka.connect.s3.source.utils.S3SourceRecord; + +import io.findify.s3mock.S3Mock; +import org.apache.commons.lang3.time.StopWatch; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration; +import software.amazon.awssdk.core.retry.RetryMode; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.S3Configuration; + +final class S3SourceTaskTest { + + /** + * The amount of extra time that we will allow for timing errors. + */ + private static final long TIMING_DELTA = 500; + + private static final Random RANDOM = new Random(); + private Map properties; + + private static final String TEST_BUCKET = "test-bucket"; + + private static final String TOPIC = "TOPIC1"; + + private static final int PARTITION = 1; + + private static final String OBJECT_KEY = "object_key"; + + // TODO S3Mock has not been maintained in 4 years + // Adobe have an alternative we can move to. 
+ private static S3Mock s3Api; + private static S3Client s3Client; + + private static Map commonProperties; + + @BeforeAll + public static void setUpClass() throws URISyntaxException { + final int s3Port = RANDOM.nextInt(10_000) + 10_000; + + s3Api = new S3Mock.Builder().withPort(s3Port).withInMemoryBackend().build(); + s3Api.start(); + + commonProperties = Map.of(S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG, "test_key_id", + S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG, "test_secret_key", + S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET, S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG, + "http://localhost:" + s3Port, S3ConfigFragment.AWS_S3_REGION_CONFIG, "us-west-2"); + + final AwsCredentialProviderFactory credentialFactory = new AwsCredentialProviderFactory(); + final S3SourceConfig config = new S3SourceConfig(commonProperties); + final ClientOverrideConfiguration clientOverrideConfiguration = ClientOverrideConfiguration.builder() + .retryStrategy(RetryMode.STANDARD) + .build(); + + s3Client = S3Client.builder() + .overrideConfiguration(clientOverrideConfiguration) + .region(config.getAwsS3Region()) + .endpointOverride(URI.create(config.getAwsS3EndPoint())) + .serviceConfiguration(S3Configuration.builder().pathStyleAccessEnabled(true).build()) + .credentialsProvider(credentialFactory.getAwsV2Provider(config.getS3ConfigFragment())) + .build(); + } + + @AfterAll + public static void tearDownClass() { + s3Api.stop(); + } + + @BeforeEach + public void setUp() { + properties = new HashMap<>(commonProperties); + s3Client.createBucket(create -> create.bucket(TEST_BUCKET).build()); + } + + @AfterEach + public void tearDown() { + s3Client.deleteBucket(delete -> delete.bucket(TEST_BUCKET).build()); + } + + @Test + void testS3SourceTaskInitialization() { + final S3SourceTask s3SourceTask = new S3SourceTask(); + startSourceTask(s3SourceTask); + + assertThat(s3SourceTask.getTransformer()).isInstanceOf(ByteArrayTransformer.class); + + assertThat(s3SourceTask.isRunning()).isTrue(); + } + + @Test + void testStop() { + final S3SourceTask s3SourceTask = new S3SourceTask(); + startSourceTask(s3SourceTask); + s3SourceTask.stop(); + + assertThat(s3SourceTask.isRunning()).isFalse(); + } + + private static S3SourceRecord createS3SourceRecord(final String topicName, final Integer defaultPartitionId, + final String bucketName, final String objectKey, final byte[] key, final byte[] value) { + return new S3SourceRecord(ConnectUtils.getPartitionMap(topicName, defaultPartitionId, bucketName), 0L, + topicName, defaultPartitionId, objectKey, new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, key), + new SchemaAndValue(Schema.OPTIONAL_BYTES_SCHEMA, value)); + } + + private void startSourceTask(final S3SourceTask s3SourceTask) { + final SourceTaskContext mockedSourceTaskContext = mock(SourceTaskContext.class); + final OffsetStorageReader mockedOffsetStorageReader = mock(OffsetStorageReader.class); + when(mockedSourceTaskContext.offsetStorageReader()).thenReturn(mockedOffsetStorageReader); + s3SourceTask.initialize(mockedSourceTaskContext); + + setBasicProperties(); + s3SourceTask.start(properties); + } + + private void setBasicProperties() { + properties.putIfAbsent(INPUT_FORMAT_KEY, InputFormat.BYTES.getValue()); + properties.putIfAbsent("name", "test_source_connector"); + properties.putIfAbsent("key.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + properties.putIfAbsent("value.converter", "org.apache.kafka.connect.converters.ByteArrayConverter"); + properties.putIfAbsent(MAX_TASKS, "1"); + 
properties.put(TASK_ID, "1"); + properties.putIfAbsent("connector.class", AivenKafkaConnectS3SourceConnector.class.getName()); + properties.putIfAbsent(TARGET_TOPIC_PARTITIONS, "0,1"); + properties.putIfAbsent(TARGET_TOPICS, "testtopic"); + + } + + @Test + void testPollWithNoDataReturned() { + final S3SourceConfig s3SourceConfig = mock(S3SourceConfig.class); + when(s3SourceConfig.getMaxPollRecords()).thenReturn(5); + final Iterator sourceRecordIterator = Collections.emptyIterator(); + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + + startSourceTask(s3SourceTask); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + final List results = s3SourceTask.poll(); + stopWatch.stop(); + assertThat(results).isNull(); + assertThat(stopWatch.getTime()).isLessThan(AbstractSourceTask.MAX_POLL_TIME.toMillis() + TIMING_DELTA); + } + + private void assertEquals(final S3SourceRecord s3Record, final SourceRecord sourceRecord) { + assertThat(sourceRecord).isNotNull(); + assertThat(sourceRecord.sourcePartition()).isEqualTo(s3Record.getPartitionMap()); + final Map map = (Map) sourceRecord.sourceOffset(); + + assertThat(map.get(OffsetManager.getObjectMapKey(s3Record.getObjectKey()))) + .isEqualTo(s3Record.getRecordNumber()); + assertThat(sourceRecord.key()).isEqualTo(s3Record.getKey().value()); + assertThat(sourceRecord.value()).isEqualTo(s3Record.getValue().value()); + } + + @Test + void testPollsWithRecords() { + final List lst = createS3SourceRecords(2); + final Iterator sourceRecordIterator = lst.iterator(); + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + + startSourceTask(s3SourceTask); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + final List results = s3SourceTask.poll(); + stopWatch.stop(); + + assertThat(results).hasSize(2); + assertEquals(lst.get(0), results.get(0)); + assertEquals(lst.get(1), results.get(1)); + assertThat(stopWatch.getTime()).isLessThan(AbstractSourceTask.MAX_POLL_TIME.toMillis()); + } + + private List createS3SourceRecords(final int count) { + final List lst = new ArrayList<>(); + if (count > 0) { + lst.add(createS3SourceRecord(TOPIC, PARTITION, TEST_BUCKET, OBJECT_KEY, + "Hello".getBytes(StandardCharsets.UTF_8), "Hello World".getBytes(StandardCharsets.UTF_8))); + for (int i = 1; i < count; i++) { + lst.add(createS3SourceRecord(TOPIC, PARTITION, TEST_BUCKET, OBJECT_KEY + i, + "Goodbye".getBytes(StandardCharsets.UTF_8), + String.format("Goodbye cruel World (%s)", i).getBytes(StandardCharsets.UTF_8))); + } + } + return lst; + } + + @Test + void testPollWithInterruptedIterator() { + final List lst = createS3SourceRecords(3); + + final Iterator inner1 = lst.subList(0, 2).iterator(); + final Iterator inner2 = lst.subList(2, 3).iterator(); + final Iterator sourceRecordIterator = new Iterator<>() { + Iterator inner = inner1; + @Override + public boolean hasNext() { + if (inner == null) { + inner = inner2; + return false; + } + return inner.hasNext(); + } + + @Override + public S3SourceRecord next() { + final S3SourceRecord result = inner.next(); + if (!inner.hasNext()) { + inner = null; // NOPMD null assignment + } + return result; + } + }; + + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + startSourceTask(s3SourceTask); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + List results = s3SourceTask.poll(); + stopWatch.stop(); + + assertThat(results).hasSize(2); + assertEquals(lst.get(0), results.get(0)); + 
assertEquals(lst.get(1), results.get(1)); + + results = s3SourceTask.poll(); + assertThat(results).hasSize(1); + + assertThat(stopWatch.getTime()).isLessThan(AbstractSourceTask.MAX_POLL_TIME.toMillis()); + + } + + @Test + void testPollWithSlowProducer() { + final List lst = createS3SourceRecords(3); + + final Iterator sourceRecordIterator = new Iterator<>() { + final Iterator inner = lst.iterator(); + @Override + public boolean hasNext() { + return inner.hasNext(); + } + + @Override + public S3SourceRecord next() { + try { + Thread.sleep(Duration.ofSeconds(6).toMillis()); + } catch (InterruptedException e) { + // do nothing. + } + return inner.next(); + } + }; + + final List results = new ArrayList<>(); + // since the polling is returning data at or near the time limit the 3 record may be returned as follows + // Record 1 may be returned in Poll1 or Poll2 + // Record 2 may be returned in Poll2 or Poll2 + // Record 3 may be returned in Poll3 or Poll4 + + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + startSourceTask(s3SourceTask); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + // poll 1 + List pollResult = s3SourceTask.poll(); + stopWatch.stop(); + if (pollResult != null) { + results.addAll(pollResult); + } + assertThat(results).hasSizeLessThanOrEqualTo(1); + // poll 2 + stopWatch.reset(); + stopWatch.start(); + pollResult = s3SourceTask.poll(); + stopWatch.stop(); + if (pollResult != null) { + results.addAll(pollResult); + } + assertThat(results).hasSizeLessThanOrEqualTo(2); + // poll 3 + stopWatch.reset(); + stopWatch.start(); + pollResult = s3SourceTask.poll(); + stopWatch.stop(); + if (pollResult != null) { + results.addAll(pollResult); + } + assertThat(results).hasSizeLessThanOrEqualTo(3); + // poll 4 + stopWatch.reset(); + stopWatch.start(); + pollResult = s3SourceTask.poll(); + stopWatch.stop(); + if (results.size() == lst.size()) { + assertThat(pollResult).isNull(); + } else { + results.addAll(pollResult); + } + assertThat(results).hasSize(3); + } + + @Test + void testPollsWithExcessRecords() { + // test that multiple polls to get all records succeeds. 
+ properties.put(SourceConfigFragment.MAX_POLL_RECORDS, "2"); + + final List lst = createS3SourceRecords(3); + + final Iterator sourceRecordIterator = lst.iterator(); + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + + startSourceTask(s3SourceTask); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + List results = s3SourceTask.poll(); + assertThat(results).hasSize(2); + results = s3SourceTask.poll(); + assertThat(results).hasSize(1); + stopWatch.stop(); + assertThat(stopWatch.getTime()).isLessThan(AbstractSourceTask.MAX_POLL_TIME.toMillis() * 2); + } + + @Test + void testPollWhenConnectorStopped() { + final List lst = createS3SourceRecords(3); + final Iterator sourceRecordIterator = lst.iterator(); + final S3SourceTask s3SourceTask = new TestingS3SourceTask(sourceRecordIterator); + + startSourceTask(s3SourceTask); + s3SourceTask.stop(); + final StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + final List results = s3SourceTask.poll(); + stopWatch.stop(); + assertThat(results).isNull(); + assertThat(stopWatch.getTime()).isLessThan(TIMING_DELTA); + + } + + private static class TestingS3SourceTask extends S3SourceTask { // NOPMD not a test class + + TestingS3SourceTask(final Iterator realIterator) { + super(); + super.setS3SourceRecordIterator(realIterator); + } + + @Override + protected void setS3SourceRecordIterator(final Iterator iterator) { + // do nothing. + } + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java new file mode 100644 index 000000000..10939c511 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/config/S3SourceConfigTest.java @@ -0,0 +1,71 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.aiven.kafka.connect.s3.source.config; + +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.INPUT_FORMAT_KEY; +import static io.aiven.kafka.connect.common.config.SchemaRegistryFragment.SCHEMA_REGISTRY_URL; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.HashMap; + +import io.aiven.kafka.connect.common.source.input.InputFormat; +import io.aiven.kafka.connect.config.s3.S3ConfigFragment; + +import org.junit.jupiter.api.Test; +import software.amazon.awssdk.regions.Region; + +final class S3SourceConfigTest { + @Test + void correctFullConfig() { + final var props = new HashMap(); + + // aws props + props.put(S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG, "AWS_ACCESS_KEY_ID"); + props.put(S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG, "AWS_SECRET_ACCESS_KEY"); + props.put(S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG, "the-bucket"); + props.put(S3ConfigFragment.AWS_S3_ENDPOINT_CONFIG, "AWS_S3_ENDPOINT"); + props.put(S3ConfigFragment.AWS_S3_PREFIX_CONFIG, "AWS_S3_PREFIX"); + props.put(S3ConfigFragment.AWS_S3_REGION_CONFIG, Region.US_EAST_1.id()); + + // record, topic specific props + props.put(INPUT_FORMAT_KEY, InputFormat.AVRO.getValue()); + props.put(TARGET_TOPIC_PARTITIONS, "0,1"); + props.put(TARGET_TOPICS, "testtopic"); + props.put(SCHEMA_REGISTRY_URL, "localhost:8081"); + + final var conf = new S3SourceConfig(props); + final var awsCredentials = conf.getAwsCredentials(); + + assertThat(awsCredentials.accessKeyId()).isEqualTo("AWS_ACCESS_KEY_ID"); + assertThat(awsCredentials.secretAccessKey()).isEqualTo("AWS_SECRET_ACCESS_KEY"); + assertThat(conf.getAwsS3BucketName()).isEqualTo("the-bucket"); + assertThat(conf.getAwsS3EndPoint()).isEqualTo("AWS_S3_ENDPOINT"); + assertThat(conf.getAwsS3Region()).isEqualTo(Region.of("us-east-1")); + + assertThat(conf.getInputFormat()).isEqualTo(InputFormat.AVRO); + assertThat(conf.getTargetTopics()).isEqualTo("testtopic"); + assertThat(conf.getTargetTopicPartitions()).isEqualTo("0,1"); + assertThat(conf.getSchemaRegistryUrl()).isEqualTo("localhost:8081"); + + assertThat(conf.getS3RetryBackoffDelayMs()).isEqualTo(S3ConfigFragment.AWS_S3_RETRY_BACKOFF_DELAY_MS_DEFAULT); + assertThat(conf.getS3RetryBackoffMaxDelayMs()) + .isEqualTo(S3ConfigFragment.AWS_S3_RETRY_BACKOFF_MAX_DELAY_MS_DEFAULT); + assertThat(conf.getS3RetryBackoffMaxRetries()).isEqualTo(S3ConfigFragment.S3_RETRY_BACKOFF_MAX_RETRIES_DEFAULT); + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java new file mode 100644 index 000000000..8b34f73d0 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/BucketAccessor.java @@ -0,0 +1,194 @@ +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
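// Illustrative sketch (not part of this change set): a minimal property set for running the source
// connector with the bytes input format, assembled from the keys exercised by the configuration and
// task tests above. All values are placeholders, and depending on the environment an endpoint
// override (as in the tests) and further AWS settings may also be required.
import java.util.HashMap;
import java.util.Map;

import io.aiven.kafka.connect.common.config.SchemaRegistryFragment;
import io.aiven.kafka.connect.common.config.SourceConfigFragment;
import io.aiven.kafka.connect.common.source.input.InputFormat;
import io.aiven.kafka.connect.config.s3.S3ConfigFragment;

public class MinimalSourceProperties {
    public static Map<String, String> build() {
        final Map<String, String> props = new HashMap<>();
        props.put("name", "s3-source");
        props.put("connector.class", "io.aiven.kafka.connect.s3.source.AivenKafkaConnectS3SourceConnector");
        props.put(S3ConfigFragment.AWS_ACCESS_KEY_ID_CONFIG, "my-access-key-id");
        props.put(S3ConfigFragment.AWS_SECRET_ACCESS_KEY_CONFIG, "my-secret-access-key");
        props.put(S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG, "my-bucket");
        props.put(S3ConfigFragment.AWS_S3_REGION_CONFIG, "us-east-1");
        props.put(SchemaRegistryFragment.INPUT_FORMAT_KEY, InputFormat.BYTES.getValue());
        props.put(SourceConfigFragment.TARGET_TOPICS, "testtopic");
        props.put(SourceConfigFragment.TARGET_TOPIC_PARTITIONS, "0,1");
        return props;
    }
}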
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.testutils; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Base64; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.zip.GZIPInputStream; + +import io.aiven.kafka.connect.common.config.CompressionType; + +import com.github.luben.zstd.ZstdInputStream; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xerial.snappy.SnappyInputStream; +import software.amazon.awssdk.core.exception.SdkException; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.Delete; +import software.amazon.awssdk.services.s3.model.DeleteBucketRequest; +import software.amazon.awssdk.services.s3.model.DeleteObjectsRequest; +import software.amazon.awssdk.services.s3.model.HeadObjectRequest; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; +import software.amazon.awssdk.services.s3.model.NoSuchKeyException; +import software.amazon.awssdk.services.s3.model.ObjectIdentifier; +import software.amazon.awssdk.services.s3.model.S3Exception; +import software.amazon.awssdk.services.s3.model.S3Object; + +public class BucketAccessor { + + private final String bucketName; + private final S3Client s3Client; + + private static final Logger LOGGER = LoggerFactory.getLogger(BucketAccessor.class); + + @SuppressFBWarnings(value = "EI_EXPOSE_REP2", justification = "stores mutable s3Client object") + public BucketAccessor(final S3Client s3Client, final String bucketName) { + this.bucketName = bucketName; + this.s3Client = s3Client; + } + + public final void createBucket() { + s3Client.createBucket(builder -> builder.bucket(bucketName).build()); + } + + public final void removeBucket() { + final var deleteIds = s3Client.listObjectsV2(ListObjectsV2Request.builder().bucket(bucketName).build()) + .contents() + .stream() + .map(S3Object::key) + .map(key -> ObjectIdentifier.builder().key(key).build()) + .collect(Collectors.toList()); + + try { + s3Client.deleteObjects(DeleteObjectsRequest.builder() + .bucket(bucketName) + .delete(Delete.builder().objects(deleteIds).build()) + .build()); + } catch (final S3Exception e) { + LOGGER.warn( + String.format("Couldn't delete objects. Reason: [%s] %s", e.awsErrorDetails().errorMessage(), e)); + } catch (final SdkException e) { + + LOGGER.error("Couldn't delete objects: {}, Exception{} ", deleteIds, e.getMessage()); + } + s3Client.deleteBucket(DeleteBucketRequest.builder().bucket(bucketName).build()); + } + + // TODO NOT Currently used + public final Boolean doesObjectExist(final String objectName) { + try { + s3Client.headObject(HeadObjectRequest.builder().bucket(bucketName).key(objectName).build()); + return true; + } catch (NoSuchKeyException e) { + return false; + } + } + + public final List> readAndDecodeLines(final String blobName, final String compression, + final int... 
fieldsToDecode) throws IOException { + Objects.requireNonNull(blobName, "blobName cannot be null"); + Objects.requireNonNull(fieldsToDecode, "fieldsToDecode cannot be null"); + + return readAndDecodeLines0(blobName, compression, fieldsToDecode); + } + + private List> readAndDecodeLines0(final String blobName, final String compression, + final int[] fieldsToDecode) throws IOException { + return readLines(blobName, compression).stream() + .map(l -> l.split(",")) + .map(fields -> decodeRequiredFields(fields, fieldsToDecode)) + .collect(Collectors.toList()); + } + + public final byte[] readBytes(final String blobName, final String compression) throws IOException { + Objects.requireNonNull(blobName, "blobName cannot be null"); + final byte[] blobBytes = s3Client.getObjectAsBytes(builder -> builder.key(blobName).bucket(bucketName).build()) + .asByteArray(); + try (ByteArrayInputStream bais = new ByteArrayInputStream(blobBytes); + InputStream decompressedStream = getDecompressedStream(bais, compression); + ByteArrayOutputStream decompressedBytes = new ByteArrayOutputStream()) { + final byte[] readBuffer = new byte[1024]; + int bytesRead; + while ((bytesRead = decompressedStream.read(readBuffer)) != -1) { // NOPMD AssignmentInOperand + decompressedBytes.write(readBuffer, 0, bytesRead); + } + return decompressedBytes.toByteArray(); + } catch (final IOException e) { + throw new RuntimeException(e); // NOPMD AvoidThrowingRawExceptionTypes + } + } + + public final byte[] readBytes(final String blobName) throws IOException { + return readBytes(blobName, "none"); + } + + public final List readLines(final String blobName, final String compression) throws IOException { + final byte[] blobBytes = readBytes(blobName, compression); + try (ByteArrayInputStream bais = new ByteArrayInputStream(blobBytes); + InputStreamReader reader = new InputStreamReader(bais, StandardCharsets.UTF_8); + BufferedReader bufferedReader = new BufferedReader(reader)) { + return bufferedReader.lines().collect(Collectors.toList()); + } catch (final IOException e) { + throw new RuntimeException(e); // NOPMD AvoidThrowingRawExceptionTypes + } + } + + public final List listObjects() { + + return s3Client.listObjectsV2(ListObjectsV2Request.builder().bucket(bucketName).build()) + .contents() + .stream() + .map(S3Object::key) + .collect(Collectors.toList()); + } + + private InputStream getDecompressedStream(final InputStream inputStream, final String compression) + throws IOException { + Objects.requireNonNull(inputStream, "inputStream cannot be null"); + Objects.requireNonNull(compression, "compression cannot be null"); + + final CompressionType compressionType = CompressionType.forName(compression); + switch (compressionType) { + case ZSTD : + return new ZstdInputStream(inputStream); + case GZIP : + return new GZIPInputStream(inputStream); + case SNAPPY : + return new SnappyInputStream(inputStream); + default : + return inputStream; + } + } + + private List decodeRequiredFields(final String[] originalFields, final int[] fieldsToDecode) { + Objects.requireNonNull(originalFields, "originalFields cannot be null"); + Objects.requireNonNull(fieldsToDecode, "fieldsToDecode cannot be null"); + + final List result = Arrays.asList(originalFields); + for (final int fieldIdx : fieldsToDecode) { + result.set(fieldIdx, b64Decode(result.get(fieldIdx))); + } + return result; + } + + private String b64Decode(final String value) { + Objects.requireNonNull(value, "value cannot be null"); + + return new String(Base64.getDecoder().decode(value), 
StandardCharsets.UTF_8); + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java new file mode 100644 index 000000000..a2b4db378 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/ContentUtils.java @@ -0,0 +1,99 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.testutils; + +import static org.apache.kafka.connect.data.Schema.INT32_SCHEMA; +import static org.apache.kafka.connect.data.Schema.STRING_SCHEMA; + +import java.io.IOException; +import java.net.ConnectException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.kafka.common.record.TimestampType; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.sink.SinkRecord; + +import io.aiven.kafka.connect.common.config.OutputField; +import io.aiven.kafka.connect.common.config.OutputFieldEncodingType; +import io.aiven.kafka.connect.common.config.OutputFieldType; +import io.aiven.kafka.connect.common.output.parquet.ParquetOutputWriter; + +public final class ContentUtils { + private ContentUtils() { + } + public static Path getTmpFilePath(final String name1) throws IOException { + final String tmpFile = "users.parquet"; + final Path parquetFileDir = Files.createTempDirectory("parquet_tests"); + final String parquetFilePath = parquetFileDir.toAbsolutePath() + "/" + tmpFile; + + writeParquetFile(parquetFilePath, name1); + return Paths.get(parquetFilePath); + } + + public static void writeParquetFile(final String tempFilePath, final String name1) throws IOException { + // Define the Avro schema + final Schema schema = SchemaBuilder.struct() + .field("name", STRING_SCHEMA) + .field("age", INT32_SCHEMA) + .field("email", STRING_SCHEMA) + .build(); + // Write the Parquet file + try { + writeParquetFile(tempFilePath, schema, name1, 100); + } catch (IOException e) { + throw new ConnectException("Error writing parquet file"); + } + } + + @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops") + private static void writeParquetFile(final String outputPath, final Schema schema, final String name1, + final int numOfRecords) throws IOException { + + final List allParquetRecords = new ArrayList<>(); + + for (int i = 0; i < numOfRecords; i++) { + allParquetRecords + .add(new Struct(schema).put("name", name1 + i).put("age", 30).put("email", name1 + "@test")); + } + + // Create a Parquet writer + final Path outputFilePath = Paths.get(outputPath); + try (var outputStream = Files.newOutputStream(outputFilePath.toAbsolutePath()); + var parquetWriter = new ParquetOutputWriter( + List.of(new 
OutputField(OutputFieldType.VALUE, OutputFieldEncodingType.NONE)), outputStream, + Collections.emptyMap(), false)) { + int counter = 0; + final var sinkRecords = new ArrayList(); + for (final var r : allParquetRecords) { + final var sinkRecord = new SinkRecord( // NOPMD AvoidInstantiatingObjectsInLoops + "some-topic", 1, STRING_SCHEMA, "some-key-" + counter, schema, r, 100L, 1000L + counter, + TimestampType.CREATE_TIME, null); + sinkRecords.add(sinkRecord); + counter++; + } + parquetWriter.writeRecords(sinkRecords); + } + + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/IndexesToString.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/IndexesToString.java new file mode 100644 index 000000000..d54faa941 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/IndexesToString.java @@ -0,0 +1,22 @@ +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.testutils; + +@FunctionalInterface +public interface IndexesToString { + String generate(int partition, int epoch, int currIdx); +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueGenerator.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueGenerator.java new file mode 100644 index 000000000..b02103cb8 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueGenerator.java @@ -0,0 +1,62 @@ +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
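ContentUtils above produces a throwaway Parquet file on local disk; an integration test still has to push that file into the bucket under test. The sketch below shows one way to do that with the AWS SDK v2 client. It is illustrative only: the helper class name, bucket name and object key are assumptions, not part of this change.

import java.io.IOException;
import java.nio.file.Path;

import software.amazon.awssdk.core.sync.RequestBody;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.PutObjectRequest;

import io.aiven.kafka.connect.s3.source.testutils.ContentUtils;

// Hypothetical helper, not part of the diff: uploads the generated Parquet fixture to S3.
public final class ParquetFixtureUploader {

    private ParquetFixtureUploader() {
    }

    public static void uploadFixture(final S3Client s3Client, final String bucketName, final String objectKey)
            throws IOException {
        // ContentUtils writes 100 records into users.parquet inside a fresh temp directory.
        final Path parquetFile = ContentUtils.getTmpFilePath("testuser");

        // Plain synchronous upload; a test would typically point the client at LocalStack or a test account.
        s3Client.putObject(PutObjectRequest.builder().bucket(bucketName).key(objectKey).build(),
                RequestBody.fromFile(parquetFile));
    }
}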
+ */ + +package io.aiven.kafka.connect.s3.source.testutils; + +import java.util.Iterator; + +public class KeyValueGenerator implements Iterable { + + public final int numPartitions; + public final int numEpochs; + public final IndexesToString keyGenerator; + public final IndexesToString valueGenerator; + + public KeyValueGenerator(final int numPartitions, final int numEpochs, final IndexesToString keyGenerator, + final IndexesToString valueGenerator) { + this.numPartitions = numPartitions; + this.numEpochs = numEpochs; + this.keyGenerator = keyGenerator; + this.valueGenerator = valueGenerator; + } + + @Override + public Iterator iterator() { + return new Iterator<>() { + int partition; + int epoch; + int currIdx; + + @Override + public boolean hasNext() { + return epoch < numEpochs; + } + + @Override + public KeyValueMessage next() { + final KeyValueMessage msg = new KeyValueMessage(keyGenerator.generate(partition, epoch, currIdx), + valueGenerator.generate(partition, epoch, currIdx), partition, currIdx, epoch); + currIdx += 1; + partition += 1; + if (partition >= numPartitions) { + epoch += 1; + partition = 0; + } + return msg; + } + }; + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueMessage.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueMessage.java new file mode 100644 index 000000000..fed5372c8 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/testutils/KeyValueMessage.java @@ -0,0 +1,33 @@ +/* + * Copyright 2020 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.testutils; + +public class KeyValueMessage { + public final String key; + public final String value; + public final int partition; + public final int idx; + public final int epoch; + + public KeyValueMessage(final String key, final String value, final int partition, final int idx, final int epoch) { + this.key = key; + this.value = value; + this.partition = partition; + this.idx = idx; + this.epoch = epoch; + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java new file mode 100644 index 000000000..1a160d780 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClientTest.java @@ -0,0 +1,214 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
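KeyValueGenerator and KeyValueMessage are easiest to understand from a short usage sketch. Assuming the generator iterates over KeyValueMessage instances (as its next() implementation implies), the two IndexesToString lambdas below produce deterministic keys and values; with 4 partitions and 2 epochs the loop yields 8 messages, cycling through every partition before the epoch advances. The class name and lambda bodies are illustrative only.

package io.aiven.kafka.connect.s3.source.testutils;

// Illustrative only: shows how the generator is meant to be driven from a test.
public final class KeyValueGeneratorSketch {

    private KeyValueGeneratorSketch() {
    }

    public static void main(final String[] args) {
        final IndexesToString keyGen = (partition, epoch, idx) -> "key-" + partition + "-" + epoch + "-" + idx;
        final IndexesToString valueGen = (partition, epoch, idx) -> String.format("value-%d-%d-%d", partition, epoch,
                idx);

        // 4 partitions x 2 epochs -> 8 messages; the partition rolls over before the epoch increments.
        for (final KeyValueMessage msg : new KeyValueGenerator(4, 2, keyGen, valueGen)) {
            System.out.printf("partition=%d epoch=%d idx=%d %s=%s%n", msg.partition, msg.epoch, msg.idx, msg.key,
                    msg.value);
        }
    }
}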
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.utils; + +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG; +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_PREFIX_CONFIG; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import org.junit.jupiter.api.Test; +import org.mockito.ArgumentCaptor; +import org.mockito.Captor; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Response; +import software.amazon.awssdk.services.s3.model.S3Object; + +class AWSV2SourceClientTest { + + private S3Client s3Client; + + private AWSV2SourceClient awsv2SourceClient; + + @Captor + ArgumentCaptor requestCaptor; + + private static Map getConfigMap() { + final Map configMap = new HashMap<>(); + + configMap.put(AWS_S3_BUCKET_NAME_CONFIG, "test-bucket"); + return configMap; + } + + @Test + void testFetchObjectSummariesWithNoObjects() { + initializeWithTaskConfigs(); + final ListObjectsV2Response listObjectsV2Response = createListObjectsV2Response(Collections.emptyList(), null); + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Response); + + final Iterator summaries = awsv2SourceClient.getListOfObjectKeys(null); + assertThat(summaries).isExhausted(); + } + + @Test + void testFetchObjectSummariesWithOneObjectWithBasicConfig() { + final String objectKey = "any-key"; + + initializeWithTaskConfigs(); + final Iterator summaries = getS3ObjectKeysIterator(objectKey); + assertThat(summaries).hasNext(); + } + + @Test + void testFetchObjectSummariesWithZeroByteObject() { + initializeWithTaskConfigs(); + final ListObjectsV2Response listObjectsV2Response = getListObjectsV2Response(); + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Response); + + final Iterator summaries = awsv2SourceClient.getListOfObjectKeys(null); + + assertThat(summaries.next()).isNotBlank(); + assertThat(summaries.next()).isNotBlank(); + assertThat(summaries).isExhausted(); + } + + @Test + void testFetchObjectSummariesWithPagination() throws IOException { + initializeWithTaskConfigs(); + final S3Object object1 = createObjectSummary(1, "key1"); + final S3Object object2 = createObjectSummary(2, "key2"); + final List firstBatch = List.of(object1); + final List secondBatch = List.of(object2); + + final ListObjectsV2Response firstResult = createListObjectsV2Response(firstBatch, "nextToken"); + final ListObjectsV2Response secondResult = createListObjectsV2Response(secondBatch, null); + + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstResult).thenReturn(secondResult); + + final Iterator summaries = awsv2SourceClient.getListOfObjectKeys(null); + verify(s3Client, times(1)).listObjectsV2(any(ListObjectsV2Request.class)); + assertThat(summaries.next()).isNotNull(); + assertThat(summaries.next()).isNotNull(); 
+ } + + @Test + void testFetchObjectWithPrefix() { + final Map configMap = getConfigMap(); + configMap.put(AWS_S3_PREFIX_CONFIG, "test/"); + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configMap); + s3Client = mock(S3Client.class); + awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig); + requestCaptor = ArgumentCaptor.forClass(ListObjectsV2Request.class); + final S3Object object1 = createObjectSummary(1, "topics/key1/1/key1.txt"); + final S3Object object2 = createObjectSummary(1, "topics/key2/2/key2.txt"); + + final ListObjectsV2Response firstResult = createListObjectsV2Response(List.of(object1), "nextToken"); + final ListObjectsV2Response secondResult = createListObjectsV2Response(List.of(object2), null); + + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstResult).thenReturn(secondResult); + + final Iterator summaries = awsv2SourceClient.getListOfObjectKeys(null); + verify(s3Client, times(1)).listObjectsV2(any(ListObjectsV2Request.class)); + + assertThat(summaries.next()).isNotNull(); + assertThat(summaries.next()).isNotNull(); + + verify(s3Client, times(2)).listObjectsV2(requestCaptor.capture()); + final List allRequests = requestCaptor.getAllValues(); + assertThat(summaries).isExhausted(); + + assertThat(allRequests.get(0).prefix()).isEqualTo(s3SourceConfig.getAwsS3Prefix()); + // Not required with continuation token + assertThat(allRequests.get(1).prefix()).isNull(); + assertThat(allRequests.get(1).continuationToken()).isEqualTo("nextToken"); + } + + @Test + void testFetchObjectWithInitialStartAfter() { + final Map configMap = getConfigMap(); + final String startAfter = "file-option-1-12000.txt"; + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configMap); + s3Client = mock(S3Client.class); + awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig); + requestCaptor = ArgumentCaptor.forClass(ListObjectsV2Request.class); + final S3Object object1 = createObjectSummary(1, "key1-1-10000"); + final S3Object object2 = createObjectSummary(1, "key2-2-20000"); + + final ListObjectsV2Response firstResult = createListObjectsV2Response(List.of(object1), "nextToken"); + final ListObjectsV2Response secondResult = createListObjectsV2Response(List.of(object2), null); + + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstResult).thenReturn(secondResult); + + final Iterator summaries = awsv2SourceClient.getListOfObjectKeys(startAfter); + verify(s3Client, times(1)).listObjectsV2(any(ListObjectsV2Request.class)); + + assertThat(summaries.next()).isNotNull(); + assertThat(summaries.next()).isNotNull(); + + verify(s3Client, times(2)).listObjectsV2(requestCaptor.capture()); + final List allRequests = requestCaptor.getAllValues(); + assertThat(summaries).isExhausted(); + + assertThat(allRequests.get(0).startAfter()).isEqualTo(startAfter); + // Not required with continuation token + assertThat(allRequests.get(1).startAfter()).isNull(); + assertThat(allRequests.get(1).continuationToken()).isEqualTo("nextToken"); + + } + + private ListObjectsV2Response createListObjectsV2Response(final List summaries, final String nextToken) { + final ListObjectsV2Response result = mock(ListObjectsV2Response.class); + when(result.contents()).thenReturn(summaries); + when(result.nextContinuationToken()).thenReturn(nextToken); + when(result.isTruncated()).thenReturn(nextToken != null); + return result; + } + + private S3Object createObjectSummary(final long sizeOfObject, final String objectKey) { + final S3Object summary = 
mock(S3Object.class); + when(summary.size()).thenReturn(sizeOfObject); + when(summary.key()).thenReturn(objectKey); + return summary; + } + + private Iterator getS3ObjectKeysIterator(final String objectKey) { + final S3Object objectSummary = createObjectSummary(1, objectKey); + final ListObjectsV2Response listObjectsV2Result = createListObjectsV2Response( + Collections.singletonList(objectSummary), null); + when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(listObjectsV2Result); + + return awsv2SourceClient.getListOfObjectKeys(null); + } + + private void initializeWithTaskConfigs() { + final Map configMap = getConfigMap(); + final S3SourceConfig s3SourceConfig = new S3SourceConfig(configMap); + s3Client = mock(S3Client.class); + awsv2SourceClient = new AWSV2SourceClient(s3Client, s3SourceConfig); + } + + private ListObjectsV2Response getListObjectsV2Response() { + final S3Object zeroByteObject = createObjectSummary(0, "key1"); + final S3Object nonZeroByteObject1 = createObjectSummary(1, "key2"); + final S3Object nonZeroByteObject2 = createObjectSummary(1, "key3"); + return createListObjectsV2Response(List.of(zeroByteObject, nonZeroByteObject1, nonZeroByteObject2), null); + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java new file mode 100644 index 000000000..1367d71f0 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/OffsetManagerTest.java @@ -0,0 +1,147 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
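The pagination tests above stub listObjectsV2 so that the first page carries a continuation token and the second does not. The same behaviour can be reproduced with the SDK's real response builders instead of a mocked ListObjectsV2Response, which keeps the stub closer to what the service actually returns; the class name and keys below are illustrative.

import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;
import software.amazon.awssdk.services.s3.model.ListObjectsV2Response;
import software.amazon.awssdk.services.s3.model.S3Object;

// Illustrative stub: a mocked S3Client serving two listing pages chained by a continuation token.
public final class PaginatedListingStub {

    private PaginatedListingStub() {
    }

    public static S3Client twoPageClient() {
        final ListObjectsV2Response firstPage = ListObjectsV2Response.builder()
                .contents(S3Object.builder().key("key1").size(1L).build())
                .nextContinuationToken("nextToken")
                .isTruncated(true)
                .build();
        final ListObjectsV2Response secondPage = ListObjectsV2Response.builder()
                .contents(S3Object.builder().key("key2").size(1L).build())
                .isTruncated(false)
                .build();

        final S3Client s3Client = mock(S3Client.class);
        // The first call returns the truncated page, the follow-up call returns the final page.
        when(s3Client.listObjectsV2(any(ListObjectsV2Request.class))).thenReturn(firstPage).thenReturn(secondPage);
        return s3Client;
    }
}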
+ */ + +package io.aiven.kafka.connect.s3.source.utils; + +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPICS; +import static io.aiven.kafka.connect.common.config.SourceConfigFragment.TARGET_TOPIC_PARTITIONS; +import static io.aiven.kafka.connect.s3.source.S3SourceTask.OBJECT_KEY; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.kafka.connect.source.SourceTaskContext; +import org.apache.kafka.connect.storage.OffsetStorageReader; + +import io.aiven.kafka.connect.config.s3.S3ConfigFragment; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; + +final class OffsetManagerTest { + + private Map<String, String> properties; + private static final String TEST_BUCKET = "test-bucket"; + + @Mock + private SourceTaskContext sourceTaskContext; + + private S3SourceConfig s3SourceConfig; + + private OffsetManager offsetManager; + + @BeforeEach + public void setUp() { + properties = new HashMap<>(); + setBasicProperties(); + s3SourceConfig = new S3SourceConfig(properties); + } + + @Test + void testWithOffsets() { + sourceTaskContext = mock(SourceTaskContext.class); + final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); + when(sourceTaskContext.offsetStorageReader()).thenReturn(offsetStorageReader); + + final Map<String, Object> partitionKey = new HashMap<>(); + partitionKey.put("topic", "topic1"); + partitionKey.put("partition", 0); + partitionKey.put("bucket", TEST_BUCKET); + + final Map<String, Object> offsetValue = new HashMap<>(); + offsetValue.put("object_key_file", 5L); + final Map<Map<String, Object>, Map<String, Object>> offsets = new HashMap<>(); + offsets.put(partitionKey, offsetValue); + + when(offsetStorageReader.offsets(any())).thenReturn(offsets); + + offsetManager = new OffsetManager(sourceTaskContext, s3SourceConfig); + + final Map<Map<String, Object>, Map<String, Object>> retrievedOffsets = offsetManager.getOffsets(); + assertThat(retrievedOffsets.size()).isEqualTo(1); + assertThat(retrievedOffsets.values().iterator().next().get("object_key_file")).isEqualTo(5L); + } + + @Test + void testIncrementAndUpdateOffsetMapExistingOffset() { + sourceTaskContext = mock(SourceTaskContext.class); + final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); + when(sourceTaskContext.offsetStorageReader()).thenReturn(offsetStorageReader); + + // Mock partition and offset values + final String objectKey = "testObject"; + final String offsetObjectKey = OBJECT_KEY + "_" + objectKey; + + final Map<String, Object> partitionKey = new HashMap<>(); + partitionKey.put("topic", "topic1"); + partitionKey.put("partition", 0); + partitionKey.put("bucket", "bucket"); + + final Map<String, Object> offsetValue = new HashMap<>(); + offsetValue.put(offsetObjectKey, 1L); // Existing offset value + final Map<Map<String, Object>, Map<String, Object>> offsets = new HashMap<>(); + offsets.put(partitionKey, offsetValue); + + when(offsetStorageReader.offsets(any())).thenReturn(offsets); // Mock offset retrieval + + // Initialize offset manager + offsetManager = new OffsetManager(sourceTaskContext, s3SourceConfig); + + // Invoke method and assert new offset value + final long newOffset = offsetManager.incrementAndUpdateOffsetMap(partitionKey, objectKey, 2L); + + assertThat(newOffset).isEqualTo(2L); // Expect incremented offset + 
assertThat(offsetManager.getOffsets().get(partitionKey).get(offsetObjectKey)).isEqualTo(2L); // Verify updated + // offset in map + } + + @Test + void testIncrementAndUpdateOffsetMapNonExistingOffset() { + sourceTaskContext = mock(SourceTaskContext.class); + final OffsetStorageReader offsetStorageReader = mock(OffsetStorageReader.class); + when(sourceTaskContext.offsetStorageReader()).thenReturn(offsetStorageReader); + + // Mock partition without any existing offset + final Map partitionKey = new HashMap<>(); + partitionKey.put("topic", "topic1"); + partitionKey.put("partition", 0); + + when(offsetStorageReader.offsets(any())).thenReturn(Collections.emptyMap()); // No existing offset + + // Initialize offset manager + offsetManager = new OffsetManager(sourceTaskContext, s3SourceConfig); + + // Invoke method and assert new offset value + final long startOffset = 5L; + final long newOffset = offsetManager.incrementAndUpdateOffsetMap(partitionKey, "", startOffset); + + // Expect the startOffset to be returned when no existing offset is found + assertThat(newOffset).isEqualTo(startOffset); + } + + private void setBasicProperties() { + properties.put(S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG, TEST_BUCKET); + properties.put(TARGET_TOPIC_PARTITIONS, "0,1"); + properties.put(TARGET_TOPICS, "topic1,topic2"); + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java new file mode 100644 index 000000000..cc9db65cd --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/RecordProcessorTest.java @@ -0,0 +1,133 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
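OffsetManagerTest exercises the class through a mocked OffsetStorageReader; the underlying Kafka Connect API it wraps is small. A minimal sketch of that lookup is below: the partition-key fields (topic, partition, bucket) mirror the maps built in the tests, while the class and method names are illustrative, not project code.

import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.connect.source.SourceTaskContext;
import org.apache.kafka.connect.storage.OffsetStorageReader;

// Illustrative only: the raw Connect offset lookup that OffsetManager builds on.
public final class OffsetLookupSketch {

    private OffsetLookupSketch() {
    }

    public static Map<String, Object> lastCommittedOffset(final SourceTaskContext context, final String topic,
            final int partition, final String bucket) {
        final Map<String, Object> partitionKey = new HashMap<>();
        partitionKey.put("topic", topic);
        partitionKey.put("partition", partition);
        partitionKey.put("bucket", bucket);

        final OffsetStorageReader reader = context.offsetStorageReader();
        // Returns null when Connect has not committed an offset for this partition yet;
        // a populated map would carry entries such as "object_key_file".
        return reader.offset(partitionKey);
    }
}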
+ */ + +package io.aiven.kafka.connect.s3.source.utils; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; +import static org.mockito.internal.verification.VerificationModeFactory.times; + +import java.util.function.Supplier; + +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.errors.DataException; +import org.apache.kafka.connect.source.SourceRecord; +import org.apache.kafka.connect.storage.Converter; + +import io.aiven.kafka.connect.common.config.enums.ErrorsTolerance; +import io.aiven.kafka.connect.common.source.input.Transformer; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +@ExtendWith(MockitoExtension.class) +class RecordProcessorTest { + + @Mock + private S3SourceConfig s3SourceConfig; + @Mock + private Converter valueConverter; + @Mock + private Transformer transformer; + @Mock + private Converter keyConverter; + @Mock + private OffsetManager offsetManager; + + @Mock + private AWSV2SourceClient sourceClient; + + private static final Supplier TRUE = () -> true; + private static final Supplier FALSE = () -> false; + + @Test + void testCreateSourceRecord() { + + final SourceRecord mockSourceRecord = mock(SourceRecord.class); + final S3SourceRecord mockRecord = mock(S3SourceRecord.class); + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenReturn(mockSourceRecord); + + final SourceRecord result = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, + offsetManager); + + verify(mockRecord, times(1)).getSourceRecord(any()); + assertThat(result).isEqualTo(mockSourceRecord); + + } + + @Test + void testCreateSourceRecordWithDataError() { + + final S3SourceRecord mockRecord = mock(S3SourceRecord.class); + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenThrow(new DataException("Testing exception")); + + when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.NONE); + + assertThatExceptionOfType(ConnectException.class).as("Errors tolerance: NONE") + .isThrownBy(() -> RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, + offsetManager)); + + when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.ALL); + final SourceRecord result = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, + offsetManager); + assertThat(result).isNull(); + } + + @Test + void testCreateSourceRecords() { + final S3SourceRecord mockRecord = mock(S3SourceRecord.class); + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenReturn(mock(SourceRecord.class)); + + final SourceRecord sourceRecords = RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, + offsetManager); + + assertThat(sourceRecords).isNotNull(); + } + + @Test + void errorToleranceOnNONE() { + final S3SourceRecord mockRecord = mock(S3SourceRecord.class); + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenThrow(new DataException("generic issue")); + + when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.NONE); + + 
assertThatThrownBy( + () -> RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, offsetManager)) + .isInstanceOf(ConnectException.class) + .hasMessage("Data Exception caught during S3 record to source record transformation"); + + } + + @Test + void errorToleranceOnALL() { + final S3SourceRecord mockRecord = mock(S3SourceRecord.class); + when(mockRecord.getSourceRecord(any(OffsetManager.class))).thenThrow(new DataException("generic issue")); + + when(s3SourceConfig.getErrorsTolerance()).thenReturn(ErrorsTolerance.ALL); + + assertThat(RecordProcessor.createSourceRecord(mockRecord, s3SourceConfig, sourceClient, offsetManager)) + .isNull(); + + } +} diff --git a/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java new file mode 100644 index 000000000..e5e8ad613 --- /dev/null +++ b/s3-source-connector/src/test/java/io/aiven/kafka/connect/s3/source/utils/SourceRecordIteratorTest.java @@ -0,0 +1,347 @@ +/* + * Copyright 2024 Aiven Oy + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.aiven.kafka.connect.s3.source.utils; + +import static io.aiven.kafka.connect.config.s3.S3ConfigFragment.AWS_S3_BUCKET_NAME_CONFIG; +import static io.aiven.kafka.connect.s3.source.utils.SourceRecordIterator.BYTES_TRANSFORMATION_NUM_OF_RECS; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.anyMap; +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.anyInt; +import static org.mockito.Mockito.anyLong; +import static org.mockito.Mockito.anyString; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Queue; +import java.util.function.Consumer; +import java.util.function.Predicate; +import java.util.stream.Stream; + +import org.apache.kafka.connect.data.SchemaAndValue; + +import io.aiven.kafka.connect.common.config.FileNameFragment; +import io.aiven.kafka.connect.common.source.input.AvroTransformer; +import io.aiven.kafka.connect.common.source.input.ByteArrayTransformer; +import io.aiven.kafka.connect.common.source.input.InputFormat; +import io.aiven.kafka.connect.common.source.input.Transformer; +import io.aiven.kafka.connect.common.source.input.TransformerFactory; +import io.aiven.kafka.connect.common.source.input.utils.FilePatternUtils; +import io.aiven.kafka.connect.common.source.task.DistributionType; +import 
io.aiven.kafka.connect.common.templating.Template; +import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; + +import org.apache.commons.lang3.tuple.Pair; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; +import software.amazon.awssdk.core.ResponseBytes; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.GetObjectRequest; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Response; +import software.amazon.awssdk.services.s3.model.S3Object; +@SuppressWarnings("PMD.ExcessiveImports") +final class SourceRecordIteratorTest { + + private S3SourceConfig mockConfig; + private OffsetManager mockOffsetManager; + private Transformer mockTransformer; + private FileNameFragment mockFileNameFrag; + + private AWSV2SourceClient sourceApiClient; + + @BeforeEach + public void setUp() { + mockConfig = mock(S3SourceConfig.class); + mockOffsetManager = mock(OffsetManager.class); + mockTransformer = mock(Transformer.class); + mockFileNameFrag = mock(FileNameFragment.class); + } + + private S3SourceConfig getConfig(final Map data) { + final Map defaults = new HashMap<>(); + defaults.put(AWS_S3_BUCKET_NAME_CONFIG, "bucket-name"); + defaults.putAll(data); + return new S3SourceConfig(defaults); + } + + private void mockSourceConfig(final S3SourceConfig s3SourceConfig, final String filePattern, final int taskId, final int maxTasks,final String targetTopic ){ + when(s3SourceConfig.getDistributionType()).thenReturn(DistributionType.OBJECT_HASH); + when(s3SourceConfig.getTaskId()).thenReturn(taskId); + when(s3SourceConfig.getMaxTasks()).thenReturn(maxTasks); + when(s3SourceConfig.getS3FileNameFragment()).thenReturn(mockFileNameFrag); + when(mockFileNameFrag.getFilenameTemplate()).thenReturn(Template.of(filePattern)); + when(mockConfig.getTargetTopics()).thenReturn(targetTopic); + } + + @Test + void testIteratorProcessesS3Objects() throws Exception { + + final String key = "topic-00001-abc123.txt"; + final String filePattern = "{{topic}}-{{partition}}"; + final S3SourceConfig config = getConfig(Collections.emptyMap()); + final S3ClientBuilder builder = new S3ClientBuilder(); + sourceApiClient = new AWSV2SourceClient(builder.build(), config); + + mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); + + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + + mockSourceConfig(mockConfig, filePattern, 0, 1, null); + + final Iterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, sourceApiClient); + assertThat(iterator).isExhausted(); + + builder.reset().addObject(key, "Hello World").endOfBlock(); + sourceApiClient = new AWSV2SourceClient(builder.build(), config); + final Iterator s3ObjectIterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, sourceApiClient); + + assertThat(s3ObjectIterator).hasNext(); + assertThat(s3ObjectIterator.next()).isNotNull(); + assertThat(s3ObjectIterator).isExhausted(); + + } + + @Test + void testIteratorExpectExceptionWhenGetsContextWithNoTopic() throws Exception { + + final String key = "topic-00001-abc123.txt"; + final String filePattern = "{{partition}}"; + final S3SourceConfig config = getConfig(Collections.emptyMap()); + final S3ClientBuilder builder = new S3ClientBuilder(); + sourceApiClient = new 
AWSV2SourceClient(builder.build(), config); + + mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); + + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + + mockSourceConfig(mockConfig, filePattern, 0, 1, null); + + final Iterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, sourceApiClient); + assertThat(iterator).isExhausted(); + + builder.reset().addObject(key, "Hello World").endOfBlock(); + sourceApiClient = new AWSV2SourceClient(builder.build(), config); + final Iterator s3ObjectIterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, sourceApiClient); + + assertThatThrownBy(s3ObjectIterator::hasNext).isInstanceOf(NoSuchElementException.class) + .hasMessage("No value present"); + + } + + @Test + void testIteratorProcessesS3ObjectsForByteArrayTransformer() throws Exception { + final String key = "topic-00001-abc123.txt"; + final String filePattern = "{{topic}}-{{partition}}"; + + final S3SourceConfig config = getConfig(Collections.emptyMap()); + final S3ClientBuilder builder = new S3ClientBuilder(); + + builder.reset().addObject(key, "Hello World").endOfBlock(); + sourceApiClient = new AWSV2SourceClient(builder.build(), config); + + mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); + + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + + mockSourceConfig(mockConfig, filePattern, 0, 1, null); + + // With ByteArrayTransformer + + mockTransformer = mock(ByteArrayTransformer.class); + when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) + .thenReturn(Stream.of(SchemaAndValue.NULL)); + + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + + when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) + .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); + + // should skip if any records were produced by source record iterator. 
+ final Iterator byteArrayIterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, sourceApiClient); + + assertThat(byteArrayIterator).isExhausted(); + + verify(mockTransformer, never()).getRecords(any(), anyString(), anyInt(), any(), anyLong()); + + // With AvroTransformer + + mockTransformer = mock(AvroTransformer.class); + + when(mockOffsetManager.recordsProcessedForObjectKey(anyMap(), anyString())) + .thenReturn(BYTES_TRANSFORMATION_NUM_OF_RECS); + + when(mockTransformer.getKeyData(anyString(), anyString(), any())).thenReturn(SchemaAndValue.NULL); + when(mockTransformer.getRecords(any(), anyString(), anyInt(), any(), anyLong())) + .thenReturn(Arrays.asList(SchemaAndValue.NULL).stream()); + + final Iterator avroIterator = new SourceRecordIterator(mockConfig, mockOffsetManager, + mockTransformer, sourceApiClient); + assertThat(avroIterator).isExhausted(); + + verify(mockTransformer, times(0)).getRecords(any(), anyString(), anyInt(), any(), anyLong()); + + } + + @ParameterizedTest + @CsvSource({ "4, 2, key1", "4, 3, key2", "4, 0, key3", "4, 1, key4" }) + void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdAssigned(final int maxTasks, final int taskId, + final String objectKey) { + + mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + + final String key = "topic-00001-abc123.txt"; + final String filePattern = "{{partition}}"; + final String topic = "topic"; + final FilePatternUtils filePatternUtils = new FilePatternUtils(filePattern); + final S3SourceConfig config = getConfig(Collections.emptyMap()); + final S3ClientBuilder builder = new S3ClientBuilder(); + mockSourceConfig(mockConfig, filePattern, taskId, maxTasks, topic); + final S3Object obj = S3Object.builder().key(objectKey).build(); + + // Build s3 Client + builder.reset().addObject(key, "Hello World").endOfBlock(); + sourceApiClient = new AWSV2SourceClient(builder.build(), config); + + final SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, mockOffsetManager, mockTransformer, + sourceApiClient); + final Predicate s3ObjectPredicate = s3Object -> iterator.isFileMatchingPattern(s3Object) + && iterator.isFileAssignedToTask(filePatternUtils.process(s3Object.key()).orElseThrow(), taskId); + // Assert + assertThat(s3ObjectPredicate).accepts(obj); + } + + @ParameterizedTest + @CsvSource({ "4, 1, topic1-2-0", "4, 3,key1", "4, 0, key1", "4, 1, key2", "4, 2, key2", "4, 0, key2", "4, 1, key3", + "4, 2, key3", "4, 3, key3", "4, 0, key4", "4, 2, key4", "4, 3, key4" }) + void testFetchObjectSummariesWithOneNonZeroByteObjectWithTaskIdUnassigned(final int maxTasks, final int taskId, + final String objectKey) { + mockTransformer = TransformerFactory.getTransformer(InputFormat.BYTES); + when(mockOffsetManager.getOffsets()).thenReturn(Collections.emptyMap()); + final String filePattern = "{{partition}}"; + final String topic = "topic"; + mockSourceConfig(mockConfig, filePattern, taskId, maxTasks, topic); + final S3ClientBuilder builder = new S3ClientBuilder(); + final S3SourceConfig config = getConfig(Collections.emptyMap()); + final FilePatternUtils filePatternUtils = new FilePatternUtils(filePattern); + + final S3Object obj = S3Object.builder().key(objectKey).build(); + + builder.reset().addObject(objectKey, "Hello World").endOfBlock(); + sourceApiClient = new AWSV2SourceClient(builder.build(), config); + + final SourceRecordIterator iterator = new SourceRecordIterator(mockConfig, 
mockOffsetManager, mockTransformer, + sourceApiClient); + + final Predicate stringPredicate = s3Object -> iterator.isFileMatchingPattern(s3Object) + && iterator.isFileAssignedToTask(filePatternUtils.process(s3Object.key()).orElseThrow(), taskId); + // Assert + assertThat(stringPredicate.test(obj)).as("Predicate should accept the objectKey: " + objectKey).isFalse(); + } + + @Test + void testS3ClientIteratorMock() { + final S3ClientBuilder builder = new S3ClientBuilder(); + builder.addObject("Key", "value"); + final S3Client client = builder.build(); // NOPMD is asking to close client is done so on line 254 + final ListObjectsV2Response response = client.listObjectsV2(ListObjectsV2Request.builder().build()); + client.close(); + assertThat(response.contents()).isNotEmpty(); + + sourceApiClient = new AWSV2SourceClient(builder.build(), getConfig(Collections.emptyMap())); + final Iterator iterator = sourceApiClient.getS3ObjectIterator(null); + assertThat(iterator.hasNext()).isTrue(); + + } + + static class S3ClientBuilder { + Queue, Map>> blocks = new LinkedList<>(); + List objects = new ArrayList<>(); + Map data = new HashMap<>(); + + public S3ClientBuilder addObject(final String key, final byte[] data) { + objects.add(S3Object.builder().key(key).size((long) data.length).build()); + this.data.put(key, data); + return this; + } + + public S3ClientBuilder endOfBlock() { + blocks.add(Pair.of(objects, data)); + return reset(); + } + + public S3ClientBuilder reset() { + objects = new ArrayList<>(); + data = new HashMap<>(); + return this; + } + + public S3ClientBuilder addObject(final String key, final String data) { + return addObject(key, data.getBytes(StandardCharsets.UTF_8)); + } + + private ResponseBytes getResponse(final String key) { + return ResponseBytes.fromByteArray(new byte[0], data.get(key)); + } + + private ListObjectsV2Response dequeueData() { + if (blocks.isEmpty()) { + objects = Collections.emptyList(); + data = Collections.emptyMap(); + } else { + final Pair, Map> pair = blocks.remove(); + objects = pair.getLeft(); + data = pair.getRight(); + } + return ListObjectsV2Response.builder().contents(objects).isTruncated(false).build(); + } + + public S3Client build() { + if (!objects.isEmpty()) { + endOfBlock(); + } + final S3Client result = mock(S3Client.class); + when(result.listObjectsV2(any(ListObjectsV2Request.class))).thenAnswer(env -> dequeueData()); + when(result.listObjectsV2(any(Consumer.class))).thenAnswer(env -> dequeueData()); + when(result.getObjectAsBytes(any(GetObjectRequest.class))) + .thenAnswer(env -> getResponse(env.getArgument(0, GetObjectRequest.class).key())); + return result; + } + } +} diff --git a/s3-source-connector/src/test/resources/blns.txt b/s3-source-connector/src/test/resources/blns.txt new file mode 100644 index 000000000..ef5671914 --- /dev/null +++ b/s3-source-connector/src/test/resources/blns.txt @@ -0,0 +1,739 @@ +# The MIT License (MIT) +# +# Copyright (c) 2015 Max Woolf +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
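S3ClientBuilder above answers both listObjectsV2 and getObjectAsBytes from in-memory state. The core of that trick is the getObjectAsBytes Answer; a stripped-down, self-contained version is sketched below with illustrative names, serving object bodies straight from a map keyed by object key.

import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

import software.amazon.awssdk.core.ResponseBytes;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.GetObjectRequest;
import software.amazon.awssdk.services.s3.model.GetObjectResponse;

// Illustrative only: a mocked S3Client whose object bodies come from an in-memory map.
public final class InMemoryObjectStub {

    private InMemoryObjectStub() {
    }

    public static S3Client clientFor(final Map<String, byte[]> objects) {
        final S3Client s3Client = mock(S3Client.class);
        when(s3Client.getObjectAsBytes(any(GetObjectRequest.class))).thenAnswer(invocation -> {
            final GetObjectRequest request = invocation.getArgument(0, GetObjectRequest.class);
            final byte[] body = objects.getOrDefault(request.key(), new byte[0]);
            return ResponseBytes.fromByteArray(GetObjectResponse.builder().build(), body);
        });
        return s3Client;
    }

    public static void main(final String[] args) {
        final Map<String, byte[]> data = new HashMap<>();
        data.put("topic-00001-abc123.txt", "Hello World".getBytes(StandardCharsets.UTF_8));
        final S3Client client = clientFor(data);
        // Prints "Hello World": the stubbed client resolves the body by key.
        System.out.println(client
                .getObjectAsBytes(GetObjectRequest.builder().bucket("bucket-name").key("topic-00001-abc123.txt").build())
                .asUtf8String());
    }
}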
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE + +# +# The Big List of Naughty Strings +# https://github.com/minimaxir/big-list-of-naughty-strings +# + +# Reserved Strings +# +# Strings which may be used elsewhere in code + +undefined +undef +null +NULL +(null) +nil +NIL +true +false +True +False +TRUE +FALSE +None +hasOwnProperty +then +\ +\\ + +# Numeric Strings +# +# Strings which can be interpreted as numeric + +0 +1 +1.00 +$1.00 +1/2 +1E2 +1E02 +1E+02 +-1 +-1.00 +-$1.00 +-1/2 +-1E2 +-1E02 +-1E+02 +1/0 +0/0 +-2147483648/-1 +-9223372036854775808/-1 +-0 +-0.0 ++0 ++0.0 +0.00 +0..0 +. +0.0.0 +0,00 +0,,0 +, +0,0,0 +0.0/0 +1.0/0.0 +0.0/0.0 +1,0/0,0 +0,0/0,0 +--1 +- +-. +-, +999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999 +NaN +Infinity +-Infinity +INF +1#INF +-1#IND +1#QNAN +1#SNAN +1#IND +0x0 +0xffffffff +0xffffffffffffffff +0xabad1dea +123456789012345678901234567890123456789 +1,000.00 +1 000.00 +1'000.00 +1,000,000.00 +1 000 000.00 +1'000'000.00 +1.000,00 +1 000,00 +1'000,00 +1.000.000,00 +1 000 000,00 +1'000'000,00 +01000 +08 +09 +2.2250738585072011e-308 + +# Special Characters +# +# ASCII punctuation. All of these characters may need to be escaped in some +# contexts. Divided into three groups based on (US-layout) keyboard position. + +,./;'[]\-= +<>?:"{}|_+ +!@#$%^&*()`~ + +# Non-whitespace C0 controls: U+0001 through U+0008, U+000E through U+001F, +# and U+007F (DEL) +# Often forbidden to appear in various text-based file formats (e.g. XML), +# or reused for internal delimiters on the theory that they should never +# appear in input. +# The next line may appear to be blank or mojibake in some viewers. + + +# Non-whitespace C1 controls: U+0080 through U+0084 and U+0086 through U+009F. +# Commonly misinterpreted as additional graphic characters. +# The next line may appear to be blank, mojibake, or dingbats in some viewers. +€‚ƒ„†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ + +# Whitespace: all of the characters with category Zs, Zl, or Zp (in Unicode +# version 8.0.0), plus U+0009 (HT), U+000B (VT), U+000C (FF), U+0085 (NEL), +# and U+200B (ZERO WIDTH SPACE), which are in the C categories but are often +# treated as whitespace in some contexts. +# This file unfortunately cannot express strings containing +# U+0000, U+000A, or U+000D (NUL, LF, CR). +# The next line may appear to be blank or mojibake in some viewers. +# The next line may be flagged for "trailing whitespace" in some viewers. + …             ​

    + +# Unicode additional control characters: all of the characters with +# general category Cf (in Unicode 8.0.0). +# The next line may appear to be blank or mojibake in some viewers. +­؀؁؂؃؄؅؜۝܏᠎​‌‍‎‏‪‫‬‭‮⁠⁡⁢⁣⁤⁦⁧⁨⁩𑂽𛲠𛲡𛲢𛲣𝅳𝅴𝅵𝅶𝅷𝅸𝅹𝅺󠀁󠀠󠀡󠀢󠀣󠀤󠀥󠀦󠀧󠀨󠀩󠀪󠀫󠀬󠀭󠀮󠀯󠀰󠀱󠀲󠀳󠀴󠀵󠀶󠀷󠀸󠀹󠀺󠀻󠀼󠀽󠀾󠀿󠁀󠁁󠁂󠁃󠁄󠁅󠁆󠁇󠁈󠁉󠁊󠁋󠁌󠁍󠁎󠁏󠁐󠁑󠁒󠁓󠁔󠁕󠁖󠁗󠁘󠁙󠁚󠁛󠁜󠁝󠁞󠁟󠁠󠁡󠁢󠁣󠁤󠁥󠁦󠁧󠁨󠁩󠁪󠁫󠁬󠁭󠁮󠁯󠁰󠁱󠁲󠁳󠁴󠁵󠁶󠁷󠁸󠁹󠁺󠁻󠁼󠁽󠁾󠁿 + +# "Byte order marks", U+FEFF and U+FFFE, each on its own line. +# The next two lines may appear to be blank or mojibake in some viewers. + +￾ + +# Unicode Symbols +# +# Strings which contain common unicode symbols (e.g. smart quotes) + +Ω≈ç√∫˜µ≤≥÷ +åß∂ƒ©˙∆˚¬…æ +œ∑´®†¥¨ˆøπ“‘ +¡™£¢∞§¶•ªº–≠ +¸˛Ç◊ı˜Â¯˘¿ +ÅÍÎÏ˝ÓÔÒÚÆ☃ +Œ„´‰ˇÁ¨ˆØ∏”’ +`⁄€‹›fifl‡°·‚—± +⅛⅜⅝⅞ +ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя +٠١٢٣٤٥٦٧٨٩ + +# Unicode Subscript/Superscript/Accents +# +# Strings which contain unicode subscripts/superscripts; can cause rendering issues + +⁰⁴⁵ +₀₁₂ +⁰⁴⁵₀₁₂ +ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ + +# Quotation Marks +# +# Strings which contain misplaced quotation marks; can cause encoding errors + +' +" +'' +"" +'"' +"''''"'" +"'"'"''''" + + + + + +# Two-Byte Characters +# +# Strings which contain two-byte characters: can cause rendering issues or character-length issues + +田中さんにあげて下さい +パーティーへ行かないか +和製漢語 +部落格 +사회과학원 어학연구소 +찦차를 타고 온 펲시맨과 쑛다리 똠방각하 +社會科學院語學研究所 +울란바토르 +𠜎𠜱𠝹𠱓𠱸𠲖𠳏 + +# Special Unicode Characters Union +# +# A super string recommended by VMware Inc. Globalization Team: can effectively cause rendering issues or character-length issues to validate product globalization readiness. 
+# +# 表 CJK_UNIFIED_IDEOGRAPHS (U+8868) +# ポ KATAKANA LETTER PO (U+30DD) +# あ HIRAGANA LETTER A (U+3042) +# A LATIN CAPITAL LETTER A (U+0041) +# 鷗 CJK_UNIFIED_IDEOGRAPHS (U+9DD7) +# Œ LATIN SMALL LIGATURE OE (U+0153) +# é LATIN SMALL LETTER E WITH ACUTE (U+00E9) +# B FULLWIDTH LATIN CAPITAL LETTER B (U+FF22) +# 逍 CJK_UNIFIED_IDEOGRAPHS (U+900D) +# Ü LATIN SMALL LETTER U WITH DIAERESIS (U+00FC) +# ß LATIN SMALL LETTER SHARP S (U+00DF) +# ª FEMININE ORDINAL INDICATOR (U+00AA) +# ą LATIN SMALL LETTER A WITH OGONEK (U+0105) +# ñ LATIN SMALL LETTER N WITH TILDE (U+00F1) +# 丂 CJK_UNIFIED_IDEOGRAPHS (U+4E02) +# 㐀 CJK Ideograph Extension A, First (U+3400) +# 𠀀 CJK Ideograph Extension B, First (U+20000) + +表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀 + +# Changing length when lowercased +# +# Characters which increase in length (2 to 3 bytes) when lowercased +# Credit: https://twitter.com/jifa/status/625776454479970304 + +Ⱥ +Ⱦ + +# Japanese Emoticons +# +# Strings which consists of Japanese-style emoticons which are popular on the web + +ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ +(。◕ ∀ ◕。) +`ィ(´∀`∩ +__ロ(,_,*) +・( ̄∀ ̄)・:*: +゚・✿ヾ╲(。◕‿◕。)╱✿・゚ +,。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’ +(╯°□°)╯︵ ┻━┻) +(ノಥ益ಥ)ノ ┻━┻ +┬─┬ノ( º _ ºノ) +( ͡° ͜ʖ ͡°) +¯\_(ツ)_/¯ + +# Emoji +# +# Strings which contain Emoji; should be the same behavior as two-byte characters, but not always + +😍 +👩🏽 +👾 🙇 💁 🙅 🙆 🙋 🙎 🙍 +🐵 🙈 🙉 🙊 +❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙 +✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿 +🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧 +0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ 🔟 + +# Regional Indicator Symbols +# +# Regional Indicator Symbols can be displayed differently across +# fonts, and have a number of special behaviors + +🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸 +🇺🇸🇷🇺🇸🇦🇫🇦🇲 +🇺🇸🇷🇺🇸🇦 + +# Unicode Numbers +# +# Strings which contain unicode numbers; if the code is localized, it should see the input as numeric + +123 +١٢٣ + +# Right-To-Left Strings +# +# Strings which contain text that should be rendered RTL if possible (e.g. Arabic, Hebrew) + +ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو. +בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ +הָיְתָהtestالصفحات التّحول +﷽ +ﷺ +مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ، + +# Trick Unicode +# +# Strings which contain unicode with unusual properties (e.g. Right-to-left override) (c.f. http://www.unicode.org/charts/PDF/U2000.pdf) + +‪‪test‪ +‫test‫ +
test
 +test⁠test‫ +⁦test⁧ + +# Zalgo Text +# +# Strings which contain "corrupted" text. The corruption will not appear in non-HTML text, however. (via http://www.eeemo.net) + +Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣ +̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰ +̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟ +̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕ +Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮ + +# Unicode Upsidedown +# +# Strings which contain unicode with an "upsidedown" effect (via http://www.upsidedowntext.com) + +˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥ +00˙Ɩ$- + +# Unicode font +# +# Strings which contain bold/italic/etc. versions of normal characters + +The quick brown fox jumps over the lazy dog +𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠 +𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌 +𝑻𝒉𝒆 𝒒𝒖𝒊𝒄𝒌 𝒃𝒓𝒐𝒘𝒏 𝒇𝒐𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐𝒗𝒆𝒓 𝒕𝒉𝒆 𝒍𝒂𝒛𝒚 𝒅𝒐𝒈 +𝓣𝓱𝓮 𝓺𝓾𝓲𝓬𝓴 𝓫𝓻𝓸𝔀𝓷 𝓯𝓸𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸𝓿𝓮𝓻 𝓽𝓱𝓮 𝓵𝓪𝔃𝔂 𝓭𝓸𝓰 +𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 +𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐 +⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢ + +# Script Injection +# +# Strings which attempt to invoke a benign script injection; shows vulnerability to XSS + + +<script>alert('123');</script> + + +"> +'> +> + +< / script >< script >alert(123)< / script > + onfocus=JaVaSCript:alert(123) autofocus +" onfocus=JaVaSCript:alert(123) autofocus +' onfocus=JaVaSCript:alert(123) autofocus +<script>alert(123)</script> +ript>alert(123)ript> +--> +";alert(123);t=" +';alert(123);t=' +JavaSCript:alert(123) +;alert(123); +src=JaVaSCript:prompt(132) +">javascript:alert(1); +javascript:alert(1); +javascript:alert(1); +javascript:alert(1); +javascript:alert(1); +javascript:alert(1); +javascript:alert(1); +'`"><\x3Cscript>javascript:alert(1) +'`"><\x00script>javascript:alert(1) +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +ABC
DEF +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +test +`"'> +`"'> +`"'> +`"'> +`"'> +`"'> +`"'> +`"'> +`"'> +`"'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> +"`'> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +XXX + + + +<a href=http://foo.bar/#x=`y></a><img alt="`><img src=x:x onerror=javascript:alert(1)></a>"> +<!--[if]><script>javascript:alert(1)</script --> +<!--[if<img src=x onerror=javascript:alert(1)//]> --> +<script src="/\%(jscript)s"></script> +<script src="\\%(jscript)s"></script> +<IMG """><SCRIPT>alert("XSS")</SCRIPT>"> +<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))> +<IMG SRC=# onmouseover="alert('xxs')"> +<IMG SRC= onmouseover="alert('xxs')"> +<IMG onmouseover="alert('xxs')"> +<IMG SRC=javascript:alert('XSS')> +<IMG SRC=javascript:alert('XSS')> +<IMG SRC=javascript:alert('XSS')> +<IMG SRC="jav ascript:alert('XSS');"> +<IMG SRC="jav ascript:alert('XSS');"> +<IMG SRC="jav ascript:alert('XSS');"> +<IMG SRC="jav ascript:alert('XSS');"> +perl -e 'print "<IMG SRC=java\0script:alert(\"XSS\")>";' > out +<IMG SRC="  javascript:alert('XSS');"> +<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT> +<BODY onload!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")> +<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT> +<<SCRIPT>alert("XSS");//<</SCRIPT> +<SCRIPT SRC=http://ha.ckers.org/xss.js?< B > +<SCRIPT SRC=//ha.ckers.org/.j> +<IMG SRC="javascript:alert('XSS')" +<iframe src=http://ha.ckers.org/scriptlet.html < +\";alert('XSS');// +<u oncopy=alert()> Copy me</u> +<i onwheel=alert(1)> Scroll over me </i> +<plaintext> +http://a/%%30%30 +</textarea><script>alert(123)</script> + +# SQL Injection +# +# Strings which can cause a SQL injection if inputs are not sanitized + +1;DROP TABLE users +1'; DROP TABLE users-- 1 +' OR 1=1 -- 1 +' OR '1'='1 + +% +_ + +# Server Code Injection +# +# Strings which can cause user to run code on server as a privileged user (c.f. https://news.ycombinator.com/item?id=7665153) + +- +-- +--version +--help +$USER +/dev/null; touch /tmp/blns.fail ; echo +`touch /tmp/blns.fail` +$(touch /tmp/blns.fail) +@{[system "touch /tmp/blns.fail"]} + +# Command Injection (Ruby) +# +# Strings which can call system commands within Ruby/Rails applications + +eval("puts 'hello world'") +System("ls -al /") +`ls -al /` +Kernel.exec("ls -al /") +Kernel.exit(1) +%x('ls -al /') + +# XXE Injection (XML) +# +# String which can reveal system files when parsed by a badly configured XML parser + +<?xml version="1.0" encoding="ISO-8859-1"?><!DOCTYPE foo [ <!ELEMENT foo ANY ><!ENTITY xxe SYSTEM "file:///etc/passwd" >]><foo>&xxe;</foo> + +# Unwanted Interpolation +# +# Strings which can be accidentally expanded into different strings if evaluated in the wrong context, e.g. used as a printf format string or via Perl or shell eval. Might expose sensitive data from the program doing the interpolation, or might just represent the wrong string. 
+ +$HOME +$ENV{'HOME'} +%d +%s%s%s%s%s +{0} +%*.*s +%@ +%n +File:/// + +# File Inclusion +# +# Strings which can cause user to pull in files that should not be a part of a web server + +../../../../../../../../../../../etc/passwd%00 +../../../../../../../../../../../etc/hosts + +# Known CVEs and Vulnerabilities +# +# Strings that test for known vulnerabilities + +() { 0; }; touch /tmp/blns.shellshock1.fail; +() { _; } >_[$($())] { touch /tmp/blns.shellshock2.fail; } +<<< %s(un='%s') = %u ++++ATH0 + +# MSDOS/Windows Special Filenames +# +# Strings which are reserved characters in MSDOS/Windows + +CON +PRN +AUX +CLOCK$ +NUL +A: +ZZ: +COM1 +LPT1 +LPT2 +LPT3 +COM2 +COM3 +COM4 + +# IRC specific strings +# +# Strings that may occur on IRC clients that make security products freak out + +DCC SEND STARTKEYLOGGER 0 0 0 + +# Scunthorpe Problem +# +# Innocuous strings which may be blocked by profanity filters (https://en.wikipedia.org/wiki/Scunthorpe_problem) + +Scunthorpe General Hospital +Penistone Community Church +Lightwater Country Park +Jimmy Clitheroe +Horniman Museum +shitake mushrooms +RomansInSussex.co.uk +http://www.cum.qc.ca/ +Craig Cockburn, Software Specialist +Linda Callahan +Dr. Herman I. Libshitz +magna cum laude +Super Bowl XXX +medieval erection of parapets +evaluate +mocha +expression +Arsenal canal +classic +Tyson Gay +Dick Van Dyke +basement + +# Human injection +# +# Strings which may cause human to reinterpret worldview + +If you're reading this, you've been in a coma for almost 20 years now. We're trying a new technique. We don't know where this message will end up in your dream, but we hope it works. Please wake up, we miss you. + +# Terminal escape codes +# +# Strings which punish the fools who use cat/type on this file + +Roses are red, violets are blue. Hope you enjoy terminal hue +But now...for my greatest trick... +The quick brown fox... [Beeeep] + +# iOS Vulnerabilities +# +# Strings which crashed iMessage in various versions of iOS + +Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗 +🏳0🌈️ +జ్ఞ‌ా diff --git a/s3-source-connector/src/test/resources/logback-test.xml b/s3-source-connector/src/test/resources/logback-test.xml new file mode 100644 index 000000000..f1d0b0cb6 --- /dev/null +++ b/s3-source-connector/src/test/resources/logback-test.xml @@ -0,0 +1,12 @@ +<configuration> + + <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> + <encoder> + <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> + </encoder> + </appender> + + <root level="debug"> + <appender-ref ref="STDOUT"/> + </root> +</configuration> diff --git a/s3-source-connector/src/test/resources/testtopic-0-0001.txt b/s3-source-connector/src/test/resources/testtopic-0-0001.txt new file mode 100644 index 000000000..c857053a9 --- /dev/null +++ b/s3-source-connector/src/test/resources/testtopic-0-0001.txt @@ -0,0 +1,6 @@ +performanceeeqjz fileajbzt reliabilityrtbxg Amazonyomxx S3jsukk S3jicqd multipartudsma Amazonlboqk contentepuod Amazonijbif filepbqji performancepsszv dataouuzw S3rdwof Amazonsyzgo filehhija reliabilitykccrg performanceefrfz Amazonzptcv multipartvpkxv. +contentgsrgr multipartqjaov filezqtro fileaunmb filerfjrx S3xkjoj filevlhez filernzty Amazondkzpv Amazonxdspv filebmwri Amazonxzqxz multipartggoaf fileqrpzo contentoporo filehsvga filehysoz Amazongqhtq multipartcqmwp S3tjaxu. 
+uploadupvpo performancewuoyl multipartbjjuk uploadpuecx dataqsdrc reliabilityusrbn S3wxsqo uploadmjczp Amazonulvpp datawahgl uploadghuib contentxvwoh contentvgtbd contentsttlw performancemnkib S3jdffr datasxzfy filevktta contentuewkr dataakciu. +S3kiqqs S3xmlbh reliabilitynrjhd Amazongbico S3honxh performancekwcyf performancehemxu contentzfktk filemuxvv uploadzcgqj reliabilitysdkwz filemzbbt performancezmfkb datazknlk Amazonkssri performancexklrb S3pfajq filekhldu reliabilitylixgd contenthqooz. +Amazonlljev datalbwgf fileimhqf multipartejavv Amazonsqfyd contentlfytq datapsrpi contentzotzk contentpfauu reliabilitysgqdc dataeiwnu filekyhlx contentcoomf performancetsxwq datacgjjl Amazoncrptx filekpsqv dataujipy performanceqjzow uploaddzryh. +Amazontpxgu reliabilitycapks fileiqyhi reliabilityxlxvs filepejwa contenttgbtb contentknony fileacpga datadqnqt S3erclt performancennoll reliabilityadyxe contentxutca contentjcoec multipartnjaef contentkcowq performancedzidj Amazonrwoaj dataogmoh performancewmtpn. diff --git a/settings.gradle.kts b/settings.gradle.kts index 5bded6986..a4451cb5e 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -6,6 +6,7 @@ val avroConverterVersion by extra("7.2.2") val avroDataVersion by extra("7.2.2") val awaitilityVersion by extra("4.2.1") val commonsTextVersion by extra("1.11.0") +val commonsCollections4Version by extra("4.4") val hadoopVersion by extra("3.4.0") val hamcrestVersion by extra("2.2") val jacksonVersion by extra("2.15.3") @@ -30,6 +31,9 @@ dependencyResolutionManagement { create("apache") { library("avro", "org.apache.avro:avro:$avroVersion") library("commons-text", "org.apache.commons:commons-text:$commonsTextVersion") + library( + "commons-collection4", + "org.apache.commons:commons-collections4:$commonsCollections4Version") library("kafka-connect-api", "org.apache.kafka:connect-api:$kafkaVersion") library("kafka-connect-json", "org.apache.kafka:connect-json:$kafkaVersion") library("kafka-connect-runtime", "org.apache.kafka:connect-runtime:$kafkaVersion") @@ -40,6 +44,7 @@ dependencyResolutionManagement { "org.apache.hadoop:hadoop-mapreduce-client-core:$hadoopVersion") library("parquet-avro", "org.apache.parquet:parquet-avro:$parquetVersion") library("parquet-tools", "org.apache.parquet:parquet-tools:$parquetVersion") + library("parquet-hadoop", "org.apache.parquet:parquet-hadoop:$parquetVersion") } create("compressionlibs") { library("snappy", "org.xerial.snappy:snappy-java:$snappyVersion") @@ -98,3 +103,5 @@ include("gcs-sink-connector") include("s3-sink-connector") include("azure-sink-connector") + +include("s3-source-connector")