From 1bc36df1732275513c8efe271f4f0a5881e4671f Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Tue, 4 Feb 2020 15:59:30 +0100 Subject: [PATCH 01/43] Fix url for jcenter.bintray.com repository: from http to https --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index ef8341738..1b3440ffd 100644 --- a/pom.xml +++ b/pom.xml @@ -163,7 +163,7 @@ jcenter - http://jcenter.bintray.com + https://jcenter.bintray.com false From 919c33780ef84812217f9052418223253f62f2d6 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Wed, 5 Feb 2020 11:17:48 +0100 Subject: [PATCH 02/43] Added databricks option to support databricks run: load config from DBFS, using the DB spark context --- .../runner/StreamProcessingRunner.java | 31 ++++++++++---- .../logisland-utils/pom.xml | 18 ++++++++ .../logisland/config/ConfigReader.java | 42 ++++++++++++++++++- 3 files changed, 83 insertions(+), 8 deletions(-) diff --git a/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/StreamProcessingRunner.java b/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/StreamProcessingRunner.java index 85f2795a3..9d0806de0 100644 --- a/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/StreamProcessingRunner.java +++ b/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/StreamProcessingRunner.java @@ -40,7 +40,7 @@ public class StreamProcessingRunner { */ public static void main(String[] args) { - logger.info("starting StreamProcessingRunner"); + logger.info("Starting StreamProcessingRunner"); ////////////////////////////////////////// // Commande lien management @@ -60,6 +60,13 @@ public static void main(String[] args) { Option conf = OptionBuilder.create("conf"); options.addOption(conf); + OptionBuilder.withArgName("databricks"); + OptionBuilder.withLongOpt("databricks-mode"); + OptionBuilder.isRequired(false); + OptionBuilder.hasArg(false); + OptionBuilder.withDescription("Databricks mode (configuration is read from DBFS)"); + Option databricks = OptionBuilder.create("databricks"); + options.addOption(databricks); Optional engineInstance = Optional.empty(); try { @@ -70,7 +77,16 @@ public static void main(String[] args) { String configFile = line.getOptionValue("conf"); // load the YAML config - LogislandConfiguration sessionConf = ConfigReader.loadConfig(configFile); + LogislandConfiguration sessionConf; + + boolean databricksMode = line.hasOption("databricks"); + + if (databricksMode) { + sessionConf = ConfigReader.loadConfigFromSharedFS(configFile); + } else { + sessionConf = ConfigReader.loadConfig(configFile); + } + logger.info("Configuration loaded"); // instantiate engine and all the processor from the config engineInstance = ComponentFactory.getEngineContext(sessionConf.getEngine()); @@ -80,23 +96,24 @@ public static void main(String[] args) { if (!engineInstance.get().isValid()) { throw new IllegalArgumentException("engineInstance is not valid with input configuration !"); } - logger.info("starting Logisland session version {}", sessionConf.getVersion()); + logger.info("Starting Logisland session version {}", sessionConf.getVersion()); logger.info(sessionConf.getDocumentation()); } catch (Exception e) { - logger.error("unable to launch runner", e); + logger.error("Unable to launch runner", e); System.exit(-1); } String engineName = engineInstance.get().getEngine().getIdentifier(); try { // 
start the engine EngineContext engineContext = engineInstance.get(); - logger.info("start engine {}", engineName); + logger.info("Start engine {}", engineName); engineInstance.get().getEngine().start(engineContext); - logger.info("awaitTermination for engine {}", engineName); + logger.info("Waiting termination of engine {}", engineName); engineContext.getEngine().awaitTermination(engineContext); + logger.info("Engine {} terminated", engineName); System.exit(0); } catch (Exception e) { - logger.error("something went bad while running the job {} : {}", engineName, e); + logger.error("Something went bad while running the job {} : {}", engineName, e); System.exit(-1); } diff --git a/logisland-core/logisland-framework/logisland-utils/pom.xml b/logisland-core/logisland-framework/logisland-utils/pom.xml index e91674356..841033dd6 100644 --- a/logisland-core/logisland-framework/logisland-utils/pom.xml +++ b/logisland-core/logisland-framework/logisland-utils/pom.xml @@ -26,6 +26,12 @@ logisland-utils jar + + + 2.3.0 + 2.11 + + @@ -33,6 +39,18 @@ logisland-api ${project.version} + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + provided + + + com.google.guava + guava + + + org.apache.avro avro diff --git a/logisland-core/logisland-framework/logisland-utils/src/main/java/com/hurence/logisland/config/ConfigReader.java b/logisland-core/logisland-framework/logisland-utils/src/main/java/com/hurence/logisland/config/ConfigReader.java index eaff3a426..6f552b8b8 100644 --- a/logisland-core/logisland-framework/logisland-utils/src/main/java/com/hurence/logisland/config/ConfigReader.java +++ b/logisland-core/logisland-framework/logisland-utils/src/main/java/com/hurence/logisland/config/ConfigReader.java @@ -20,6 +20,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import com.hurence.logisland.util.string.StringUtils; +import org.apache.spark.SparkContext; +import org.apache.spark.rdd.RDD; import java.io.File; import java.io.FileNotFoundException; @@ -27,6 +29,7 @@ import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Paths; +import java.util.Arrays; public class ConfigReader { @@ -40,7 +43,7 @@ static String readFile(String path, Charset encoding) /** - * Loads a YAML config file + * Loads a YAML config file (file located in the local file system) * * @param configFilePath the path of the config file * @return a LogislandSessionConfiguration @@ -66,6 +69,43 @@ public static LogislandConfiguration loadConfig(String configFilePath) throws Ex return logislandConf; } + /** + * Loads a YAML config file using (file located in the shared filesystem) + * + * @param configFilePath the path of the config file + * @return a LogislandSessionConfiguration + * @throws Exception + */ + public static LogislandConfiguration loadConfigFromSharedFS(String configFilePath) throws Exception { + ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); + + /** + * In Databricks, developers should utilize the shared SparkContext instead of creating one using the constructor. + * When running a job, you can access the shared context by calling SparkContext.getOrCreate(). 
+ * + * Also in databricks, a path like /path/to/a/file will be loaded from DBFS so will be interpreted like + * dbfs:/path/to/a/file + */ + + SparkContext sparkContext = SparkContext.getOrCreate(); + + RDD configRdd = sparkContext.textFile(configFilePath, 1); + String[] configStringArray = (String[])configRdd.collect(); + String configString = String.join("\n", Arrays.asList(configStringArray)); + + System.out.println("DBFS Configuration:\n" + configString); + + // replace all host from environment variables + String fileContent = StringUtils.resolveEnvVars(configString, "localhost"); + + System.out.println("Resolved Configuration:\n" + fileContent); + + LogislandConfiguration logislandConf = mapper.readValue(fileContent, LogislandConfiguration.class); + checkLogislandConf(logislandConf); + + return logislandConf; + } + private static void checkLogislandConf(LogislandConfiguration conf) throws IllegalArgumentException { if (conf.getEngine().getComponent() == null || conf.getEngine().getComponent().isEmpty()) { throw new IllegalArgumentException("key 'component' is missing or empty for engine in configuration file"); From 587814edc667ca4a863af8ec0c92e055398effd2 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Wed, 5 Feb 2020 17:03:20 +0100 Subject: [PATCH 03/43] Introducing SparkConfigReader class to be packaged and loaded only in databricks environment (no more useless spark dependency when outside of databricks) --- .../util/spark/SparkConfigReader.java | 59 +++++++++++++++++++ .../runner/StreamProcessingRunner.java | 46 +++++++++++++-- .../logisland-utils/pom.xml | 17 ------ .../logisland/component/ComponentFactory.java | 1 + .../logisland/config/ConfigReader.java | 45 +------------- 5 files changed, 102 insertions(+), 66 deletions(-) create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/java/com/hurence/logisland/util/spark/SparkConfigReader.java diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/java/com/hurence/logisland/util/spark/SparkConfigReader.java b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/java/com/hurence/logisland/util/spark/SparkConfigReader.java new file mode 100644 index 000000000..1f059e7e6 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/java/com/hurence/logisland/util/spark/SparkConfigReader.java @@ -0,0 +1,59 @@ +package com.hurence.logisland.util.spark; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; +import com.hurence.logisland.config.LogislandConfiguration; +import com.hurence.logisland.util.string.StringUtils; +import org.apache.spark.SparkContext; +import org.apache.spark.rdd.RDD; + +import java.util.Arrays; + +import static com.hurence.logisland.config.ConfigReader.checkLogislandConf; + +/** + * This configuration reader depends on spark. We do not want to place methods in this class in the + * com.hurence.logisland.config.ConfigReader class where the loadConfig (from local filesystem) method + * resides, as it would introduce a spark dependency in the logisland-framework module. Only the spark + * engine should have a spark dependency. So this class should be loaded from the StreamProcessingRunner + * and this will succeed only in environments where a spark 2 engine is available and used, otherwise it + * will fail to load. 
This will for instance be successful in the databricks environment, which is by the + * way the first purpose for which this class is being introduced. + */ +public class SparkConfigReader { + + /** + * Loads a YAML config file using (file located in the shared filesystem) + * + * @param configFilePath the path of the config file + * @return a LogislandSessionConfiguration + * @throws Exception + */ + public static LogislandConfiguration loadConfigFromSharedFS(String configFilePath) throws Exception { + ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); + + /** + * In Databricks, developers should utilize the shared SparkContext instead of creating one using the constructor. + * When running a job, you can access the shared context by calling SparkContext.getOrCreate(). + * + * Also in databricks, a path like /path/to/a/file will be loaded from DBFS so will be interpreted like + * dbfs:/path/to/a/file + */ + + SparkContext sparkContext = SparkContext.getOrCreate(); + + RDD configRdd = sparkContext.textFile(configFilePath, 1); + String[] configStringArray = (String[])configRdd.collect(); + String configString = String.join("\n", Arrays.asList(configStringArray)); + + // replace all host from environment variables + String fileContent = StringUtils.resolveEnvVars(configString, "localhost"); + + System.out.println("Configuration:\n" + fileContent); + + LogislandConfiguration logislandConf = mapper.readValue(fileContent, LogislandConfiguration.class); + checkLogislandConf(logislandConf); + + return logislandConf; + } +} diff --git a/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/StreamProcessingRunner.java b/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/StreamProcessingRunner.java index 9d0806de0..ca361882a 100644 --- a/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/StreamProcessingRunner.java +++ b/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/StreamProcessingRunner.java @@ -24,10 +24,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.nio.channels.IllegalChannelGroupException; +import java.lang.reflect.Method; import java.util.Optional; - public class StreamProcessingRunner { private static Logger logger = LoggerFactory.getLogger(StreamProcessingRunner.class); @@ -81,10 +80,11 @@ public static void main(String[] args) { boolean databricksMode = line.hasOption("databricks"); - if (databricksMode) { - sessionConf = ConfigReader.loadConfigFromSharedFS(configFile); - } else { + if (!databricksMode) { sessionConf = ConfigReader.loadConfig(configFile); + } else { + logger.info("Running in databricks mode"); + sessionConf = loadConfigFromSharedFS(configFile); } logger.info("Configuration loaded"); @@ -116,7 +116,43 @@ public static void main(String[] args) { logger.error("Something went bad while running the job {} : {}", engineName, e); System.exit(-1); } + } + /** + * Loads the configuration from the shared filesystem + * @param configFile Configuration path to load. + * With databricks, no need to put the 'dbfs:' scheme: use /foo/logisland.yml instead of + * dbfs:/foo/logisland.yml + * @return The read configuration + */ + private static LogislandConfiguration loadConfigFromSharedFS(String configFile) { + + // Load the spark config reader. 
This is only expected to work if a spark 2+ engine is being + // used and available in the classpath (which should be the case in the azure databricks environment) + Class sparkConfigReaderClass = null; + try { + sparkConfigReaderClass = Class.forName("com.hurence.logisland.util.spark.SparkConfigReader"); + } catch(Exception e) { + logger.error("Could not load the SparkConfigReader class", e); + System.exit(-1); + } + // Prepare to call the loadConfigFromSharedFS method + Method loadConfigFromSharedFSMethod = null; + try { + loadConfigFromSharedFSMethod = sparkConfigReaderClass.getMethod("loadConfigFromSharedFS", String.class); + } catch (Exception e) { + logger.error("Could not find method loadConfigFromSharedFS in SparkConfigReader class", e); + System.exit(-1); + } + // Call the loadConfigFromSharedFS method to read the configuration from the shared filesystem + LogislandConfiguration LogislandConfiguration = null; + try { + LogislandConfiguration = (LogislandConfiguration)loadConfigFromSharedFSMethod.invoke(null, configFile); + } catch(Exception e) { + logger.error("Could not load configuration from shared filesystem", e); + System.exit(-1); + } + return LogislandConfiguration; } } diff --git a/logisland-core/logisland-framework/logisland-utils/pom.xml b/logisland-core/logisland-framework/logisland-utils/pom.xml index 841033dd6..e8708d5c2 100644 --- a/logisland-core/logisland-framework/logisland-utils/pom.xml +++ b/logisland-core/logisland-framework/logisland-utils/pom.xml @@ -27,11 +27,6 @@ logisland-utils jar - - 2.3.0 - 2.11 - - @@ -39,18 +34,6 @@ logisland-api ${project.version} - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - provided - - - com.google.guava - guava - - - org.apache.avro avro diff --git a/logisland-core/logisland-framework/logisland-utils/src/main/java/com/hurence/logisland/component/ComponentFactory.java b/logisland-core/logisland-framework/logisland-utils/src/main/java/com/hurence/logisland/component/ComponentFactory.java index 1bfa1b3e7..bef48f52f 100644 --- a/logisland-core/logisland-framework/logisland-utils/src/main/java/com/hurence/logisland/component/ComponentFactory.java +++ b/logisland-core/logisland-framework/logisland-utils/src/main/java/com/hurence/logisland/component/ComponentFactory.java @@ -139,6 +139,7 @@ public static T loadComponent(String className) throws ClassNotFoundExceptio try { return (T) PluginLoader.loadPlugin(className); } catch (ClassNotFoundException cnfe) { + logger.warn("Class " + className + " not found in plugins: trying to load from current class loader"); return (T) Class.forName(className).newInstance(); } } catch (Exception e) { diff --git a/logisland-core/logisland-framework/logisland-utils/src/main/java/com/hurence/logisland/config/ConfigReader.java b/logisland-core/logisland-framework/logisland-utils/src/main/java/com/hurence/logisland/config/ConfigReader.java index 6f552b8b8..d83e06b27 100644 --- a/logisland-core/logisland-framework/logisland-utils/src/main/java/com/hurence/logisland/config/ConfigReader.java +++ b/logisland-core/logisland-framework/logisland-utils/src/main/java/com/hurence/logisland/config/ConfigReader.java @@ -15,13 +15,10 @@ */ package com.hurence.logisland.config; - import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import com.hurence.logisland.util.string.StringUtils; -import org.apache.spark.SparkContext; -import org.apache.spark.rdd.RDD; import java.io.File; import 
java.io.FileNotFoundException; @@ -29,8 +26,6 @@ import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Paths; -import java.util.Arrays; - public class ConfigReader { @@ -41,7 +36,6 @@ static String readFile(String path, Charset encoding) return new String(encoded, encoding); } - /** * Loads a YAML config file (file located in the local file system) * @@ -69,44 +63,7 @@ public static LogislandConfiguration loadConfig(String configFilePath) throws Ex return logislandConf; } - /** - * Loads a YAML config file using (file located in the shared filesystem) - * - * @param configFilePath the path of the config file - * @return a LogislandSessionConfiguration - * @throws Exception - */ - public static LogislandConfiguration loadConfigFromSharedFS(String configFilePath) throws Exception { - ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); - - /** - * In Databricks, developers should utilize the shared SparkContext instead of creating one using the constructor. - * When running a job, you can access the shared context by calling SparkContext.getOrCreate(). - * - * Also in databricks, a path like /path/to/a/file will be loaded from DBFS so will be interpreted like - * dbfs:/path/to/a/file - */ - - SparkContext sparkContext = SparkContext.getOrCreate(); - - RDD configRdd = sparkContext.textFile(configFilePath, 1); - String[] configStringArray = (String[])configRdd.collect(); - String configString = String.join("\n", Arrays.asList(configStringArray)); - - System.out.println("DBFS Configuration:\n" + configString); - - // replace all host from environment variables - String fileContent = StringUtils.resolveEnvVars(configString, "localhost"); - - System.out.println("Resolved Configuration:\n" + fileContent); - - LogislandConfiguration logislandConf = mapper.readValue(fileContent, LogislandConfiguration.class); - checkLogislandConf(logislandConf); - - return logislandConf; - } - - private static void checkLogislandConf(LogislandConfiguration conf) throws IllegalArgumentException { + public static void checkLogislandConf(LogislandConfiguration conf) throws IllegalArgumentException { if (conf.getEngine().getComponent() == null || conf.getEngine().getComponent().isEmpty()) { throw new IllegalArgumentException("key 'component' is missing or empty for engine in configuration file"); } From 25fddb0354c8ba26481ca6cd74d295bf46577ea1 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Mon, 10 Feb 2020 12:45:09 +0100 Subject: [PATCH 04/43] Introducing logisland-engine-spark_2_4plus_kafka_2_4plus --- .../src/assembly/shared-dependencies.xml | 1 + .../pom.xml | 217 ++++++ .../logisland/util/spark/Spark24Platform.java | 30 + ...hurence.logisland.util.spark.SparkPlatform | 1 + .../pom.xml | 413 +++++++++++ .../AbstractKafkaConnectComponent.java | 223 ++++++ .../com/hurence/logisland/connect/Utils.java | 109 +++ .../converter/LogIslandRecordConverter.java | 232 ++++++ .../connect/sink/KafkaConnectStreamSink.java | 139 ++++ .../logisland/connect/sink/NullSink.java | 96 +++ .../connect/sink/SimpleSinkTaskContext.java | 81 +++ .../source/KafkaConnectStreamSource.java | 286 ++++++++ .../KafkaConnectStreamSourceProvider.java | 73 ++ .../connect/source/SimplePartition.java | 51 ++ .../logisland/connect/source/SimpleRDD.java | 64 ++ .../source/timed/ClockSourceConnector.java | 136 ++++ .../connect/source/timed/ClockSourceTask.java | 250 +++++++ ...PipelineConfigurationBroadcastWrapper.java | 80 +++ .../engine/spark/remote/RemoteApiClient.java | 247 +++++++ 
.../remote/RemoteApiComponentFactory.java | 250 +++++++
 .../engine/spark/remote/model/Component.java | 186 +++++
 .../engine/spark/remote/model/DataFlow.java | 144 ++++
 .../engine/spark/remote/model/Pipeline.java | 108 +++
 .../engine/spark/remote/model/Processor.java | 66 ++
 .../engine/spark/remote/model/Property.java | 147 ++++
 .../engine/spark/remote/model/Service.java | 66 ++
 .../engine/spark/remote/model/Stream.java | 94 +++
 .../engine/spark/remote/model/Versioned.java | 125 ++++
 .../util/spark/ProcessorMetrics.java | 123 ++++
 .../util/spark/ProtoBufRegistrator.java | 30 +
 .../util/spark/SparkConfigReader.java | 59 ++
 .../logisland/util/spark/SparkPlatform.java | 27 +
 .../org.apache.spark.metrics.sink.KafkaSink | 1 +
 .../spark/KafkaStreamProcessingEngine.scala | 659 ++++++++++++++++++
 .../RemoteApiStreamProcessingEngine.scala | 198 ++++++
 .../spark/AbstractKafkaRecordStream.scala | 344 +++++++++
 .../stream/spark/DummyRecordStream.scala | 68 ++
 .../spark/KafkaRecordStreamDebugger.scala | 191 +++++
 .../spark/KafkaRecordStreamHDFSBurner.scala | 229 ++++++
 .../KafkaRecordStreamParallelProcessing.scala | 226 ++++++
 .../KafkaRecordStreamSQLAggregator.scala | 160 +++++
 .../stream/spark/SparkRecordStream.scala | 34 +
 .../logisland/stream/spark/package.scala | 546 +++++++++++++++
 .../KafkaConnectBaseProviderService.scala | 112 +++
 ...ConnectStructuredSinkProviderService.scala | 122 ++++
 ...nnectStructuredSourceProviderService.scala | 83 +++
 .../stream/spark/provider/package.scala | 126 ++++
 .../spark/structured/StructuredStream.scala | 184 +++++
 ...nsoleStructuredStreamProviderService.scala | 183 +++++
 ...KafkaStructuredStreamProviderService.scala | 275 ++++++++
 ...lFileStructuredStreamProviderService.scala | 167 +++++
 .../MQTTStructuredStreamProviderService.scala | 174 +++++
 .../RateStructuredStreamProviderService.scala | 202 ++++++
 .../StructuredStreamProviderService.scala | 391 +++++++++++
 .../logisland/util/kafka/KafkaReporter.scala | 224 ++++++
 .../logisland/util/kafka/KafkaSink.scala | 76 ++
 .../util/mqtt/MQTTStreamSource.scala | 240 +++++++
 .../logisland/util/mqtt/MessageStore.scala | 109 +++
 .../spark/ControllerServiceLookupSink.scala | 52 ++
 .../logisland/util/spark/SparkUtils.scala | 272 ++++++++
 .../apache/spark/metrics/sink/KafkaSink.scala | 91 +++
 .../logisland/connect/KafkaConnectTest.java | 84 +++
 .../LogIslandRecordConverterTest.java | 129 ++++
 .../logisland/connect/fake/FakeConnector.java | 116 +++
 .../logisland/connect/fake/TestSink.java | 57 ++
 .../logisland/connect/fake/TestSinkTask.java | 55 ++
 ...stractStreamProcessingIntegrationTest.java | 246 +++++++
 ...mmaticStreamProcessingIntegrationTest.java | 170 +++++
 .../RecordStreamProcessingDebuggerTest.java | 282 ++++++++
 .../logisland/engine/RemoteApiEngineTest.java | 85 +++
 .../logisland/engine/SparkEngineConfTest.java | 182 +++++
 .../logisland/engine/StreamDebuggerTest.java | 79 +++
 .../spark/remote/RemoteApiClientTest.java | 90 +++
 .../spark/remote/mock/MockProcessor.java | 37 +
 .../remote/mock/MockServiceController.java | 29 +
 .../engine/spark/remote/mock/MockStream.java | 29 +
 .../structured/StructuredStreamTest.java | 83 +++
 ...leStructuredStreamProviderServiceTest.java | 59 ++
 .../ProviderServiceAsReaderRunner.java | 121 ++++
 .../resources/conf/kafka-connect-stream.yml | 138 ++++
 .../src/test/resources/conf/opencv.yml | 62 ++
 .../src/test/resources/conf/remote-engine.yml | 38 +
 .../test/resources/conf/structured-stream.yml | 76 ++
 .../conf/timeseries-structured-stream.yml | 99 +++
 .../src/test/resources/log4j.properties | 65 ++
 .../src/test/resources/logback.xml | 58 ++
 .../pom.xml | 25 +
 .../logisland-engine-spark_2_common/pom.xml | 1 -
 logisland-core/logisland-engines/pom.xml | 1 +
 89 files changed, 12388 insertions(+), 1 deletion(-)
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/pom.xml
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/resources/META-INF/services/com.hurence.logisland.util.spark.SparkPlatform
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/pom.xml
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/AbstractKafkaConnectComponent.java
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/Utils.java
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/converter/LogIslandRecordConverter.java
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/KafkaConnectStreamSink.java
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/NullSink.java
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/SimpleSinkTaskContext.java
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSource.java
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSourceProvider.java
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimplePartition.java
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimpleRDD.java
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceConnector.java
 create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceTask.java
 create mode 100644
logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/PipelineConfigurationBroadcastWrapper.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiClient.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiComponentFactory.java create mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Component.java create mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/DataFlow.java create mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Pipeline.java create mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Processor.java create mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Property.java create mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Service.java create mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Stream.java create mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Versioned.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProcessorMetrics.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProtoBufRegistrator.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkConfigReader.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkPlatform.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/resources/META-INF/services/org.apache.spark.metrics.sink.KafkaSink create mode 100644 
logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/KafkaStreamProcessingEngine.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/RemoteApiStreamProcessingEngine.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/AbstractKafkaRecordStream.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/DummyRecordStream.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamDebugger.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamHDFSBurner.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamParallelProcessing.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamSQLAggregator.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/SparkRecordStream.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectBaseProviderService.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSinkProviderService.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSourceProviderService.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/package.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/ConsoleStructuredStreamProviderService.scala create mode 
100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/KafkaStructuredStreamProviderService.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderService.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/MQTTStructuredStreamProviderService.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/RateStructuredStreamProviderService.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaReporter.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaSink.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MQTTStreamSource.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MessageStore.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/ControllerServiceLookupSink.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/SparkUtils.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/org/apache/spark/metrics/sink/KafkaSink.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/KafkaConnectTest.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/converter/LogIslandRecordConverterTest.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/FakeConnector.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSink.java create mode 100644 
logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSinkTask.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/AbstractStreamProcessingIntegrationTest.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/ProgrammaticStreamProcessingIntegrationTest.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RecordStreamProcessingDebuggerTest.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RemoteApiEngineTest.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/SparkEngineConfTest.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/StreamDebuggerTest.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/RemoteApiClientTest.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockProcessor.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockServiceController.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockStream.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/StructuredStreamTest.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderServiceTest.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/ProviderServiceAsReaderRunner.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/kafka-connect-stream.yml create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/opencv.yml create mode 100644 
logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/remote-engine.yml create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/structured-stream.yml create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/timeseries-structured-stream.yml create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/log4j.properties create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/logback.xml create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/pom.xml diff --git a/logisland-assembly/src/assembly/shared-dependencies.xml b/logisland-assembly/src/assembly/shared-dependencies.xml index cb3f1322d..9c1b5de08 100644 --- a/logisland-assembly/src/assembly/shared-dependencies.xml +++ b/logisland-assembly/src/assembly/shared-dependencies.xml @@ -26,6 +26,7 @@ + com.hurence.logisland:logisland-engine-spark_2_4_kafka_2_4 com.hurence.logisland:logisland-engine-spark_2_1 com.hurence.logisland:logisland-engine-spark_2_3 com.hurence.logisland:logisland-engine-spark_1_6 diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/pom.xml new file mode 100644 index 000000000..f92399f6b --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/pom.xml @@ -0,0 +1,217 @@ + + + 4.0.0 + + com.hurence.logisland + logisland-engine-spark_2_4plus_kafka_2_4plus + 1.2.0 + + logisland-engine-spark_2_4_kafka_2_4 + jar + + + + + 2.12 + 2.4.4 + 2.4.0 + 2.12.10 + + + + + + + + + org.apache.kafka + kafka_${scala.binary.version} + ${kafka.version} + true + runtime + + + + org.apache.kafka + kafka-clients + ${kafka.version} + true + runtime + + + org.apache.bahir + spark-sql-streaming-mqtt_2.11 + 2.2.0 + runtime + true + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + provided + + + com.google.guava + guava + + + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-mllib_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-streaming-kafka + ${spark.version} + runtime + true + + + org.apache.spark + spark-sql-kafka + ${spark.version} + runtime + true + + + org.apache.spark + spark-streaming-kafka-assembly_${scala.binary.version} + ${spark.version} + runtime + true + + + + + + + com.hurence.logisland + logisland-engine-spark_2_common + ${project.version} + true + + + org.apache.spark + spark-sql_${scala.binary.version} + provided + + + + org.scala-lang + scala-library + ${scala.version} + provided + true + + + com.banzaicloud + spark-metrics_2.11 + 2.3-1.1.0 + + + io.prometheus + simpleclient + 0.0.23 + + + io.prometheus + simpleclient_dropwizard + 0.0.23 + + + io.prometheus + simpleclient_pushgateway + 0.0.23 + + + + + + + + + 
org.immutables.tools + maven-shade-plugin + 4 + + + package + + shade + + + + + com.fasterxml.jackson.datatype:jackson-datatype-jsr310 + com.fasterxml.jackson.datatype:jackson-datatype-jdk8 + com.hurence.logisland:logisland-engine-spark_2_common + *:* + + + com.fasterxml.jackson.core:* + com.fasterxml.jackson.databind:* + com.fasterxml.jackson.jaxrs*:* + com.fasterxml.jackson.module:jackson-module-jaxb-annotations + org.scala-lang:* + org.scalatest:* + org.apache.zookeeper:* + com.google.guava:* + org.apache.commons:* + org.slf4j:* + log4j:* + org.yaml:* + org.eclipse.jetty:* + org.glassfish.hk2*:* + org.glassfish.jersey*:* + + + + + *:* + + META-INF/license/** + META-INF/* + META-INF/maven/** + LICENSE + NOTICE + /*.txt + build.properties + + + + + + + + + + + + + + + banzaicloud-github + https://raw.github.com/banzaicloud/spark-metrics/master/maven-repo/releases + + + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java new file mode 100644 index 000000000..4383f2597 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java @@ -0,0 +1,30 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.util.spark; + +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; + +public class Spark24Platform implements SparkPlatform { + @Override + public Dataset createStreamingDataFrame(SQLContext sqlContext, RDD catalystRows, StructType schema) { + return sqlContext.internalCreateDataFrame(catalystRows, schema, true); + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/resources/META-INF/services/com.hurence.logisland.util.spark.SparkPlatform b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/resources/META-INF/services/com.hurence.logisland.util.spark.SparkPlatform new file mode 100644 index 000000000..405b9bf4e --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/resources/META-INF/services/com.hurence.logisland.util.spark.SparkPlatform @@ -0,0 +1 @@ +com.hurence.logisland.util.spark.Spark24Platform \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/pom.xml new file mode 100644 index 000000000..f49996d4d --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/pom.xml @@ -0,0 +1,413 @@ + + + 4.0.0 + + com.hurence.logisland + logisland-engine-spark_2_4plus_kafka_2_4plus + 1.2.0 + + logisland-engine-spark_2_4plus_common + jar + + + + 1.5.16 + 4.12 + 3.1.5 + 3.0.4 + 0.3.5 + 2.12 + 2.4.4 + 2.4.0 + 2.12.10 + 2.6.6 + + + + + + + com.typesafe.scala-logging + scala-logging-slf4j_${scala.binary.version} + 2.1.2 + provided + + + org.scala-lang + scala-compiler + ${scala.version} + + + + + org.apache.kafka + kafka-clients + ${kafka.version} + compile + + + + org.apache.kafka + kafka + ${kafka.version} + compile + + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + provided + + + com.google.guava + guava + + + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-mllib_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-streaming-kafka + ${spark.version} + + + org.apache.spark + spark-sql-kafka + ${spark.version} + + + + + org.apache.kafka + kafka_${scala.binary.version} + test + ${kafka.version} + test + + + org.apache.kafka + kafka-clients + ${kafka.version} + test + test + + + org.scalatest + scalatest_${scala.binary.version} + 3.1.0 + test + + + + + + + + com.hurence.logisland + logisland-api + ${project.version} + provided + + + com.hurence.logisland + logisland-utils + ${project.version} + provided + + + com.hurence.logisland + logisland-processor-timeseries + ${project.version} + provided + + + + com.hurence.logisland + logisland-plugin-support + ${project.version} + + provided + + + + com.groupon.dse + spark-metrics + 2.0.0 + + + + + + org.apache.spark + spark-core_${scala.binary.version} + 
provided + + + + org.apache.spark + spark-sql_${scala.binary.version} + provided + + + + org.apache.spark + spark-streaming_${scala.binary.version} + provided + + + + org.apache.bahir + spark-sql-streaming-mqtt_${scala.binary.version} + 2.4.0 + + + + org.apache.kafka + connect-api + ${kafka.version} + + + + org.apache.kafka + connect-runtime + ${kafka.version} + + + com.fasterxml.jackson.jaxrs + jackson-jaxrs-json-provider + + + + + org.apache.kafka + connect-json + ${kafka.version} + + + + + com.fasterxml.jackson.core + jackson-core + ${jackson.version} + provided + + + + com.fasterxml.jackson.core + jackson-databind + provided + ${jackson.version} + + + + com.fasterxml.jackson.core + jackson-annotations + provided + + + + com.fasterxml.jackson.module + jackson-module-parameter-names + ${jackson.version} + + + + com.fasterxml.jackson.datatype + jackson-datatype-jdk8 + ${jackson.version} + + + + com.fasterxml.jackson.datatype + jackson-datatype-jsr310 + ${jackson.version} + + + + org.apache.commons + commons-csv + + + + org.hibernate + hibernate-validator + 5.1.3.Final + + + + + + org.glassfish + javax.el + 3.0.0 + + + + + + + org.scala-lang + scala-compiler + provided + + + + + + + junit + junit + test + + + + ch.qos.logback + logback-classic + test + + + + + org.apache.kafka + kafka_${scala.binary.version} + ${kafka.version} + + + + + + org.apache.kafka + kafka_${scala.binary.version} + test + test + + + + + org.apache.kafka + kafka-clients + test + test + + + + io.swagger + swagger-core + ${swagger-core-version} + + + + + com.hurence.logisland + logisland-processor-common + ${project.version} + test + + + + + + com.squareup.okhttp3 + okhttp-urlconnection + 3.10.0 + + + + + + com.squareup.okhttp3 + mockwebserver + 3.10.0 + test + + + + + org.springframework + spring-context-support + 5.1.3.RELEASE + + + + + + + + net.alchim31.maven + scala-maven-plugin + 4.3.1 + + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile-first + process-test-resources + + testCompile + + + + + + ${scala.binary.version} + ${scala.version} + incremental + + -unchecked + -deprecation + + + -Xms64m + -Xms1024m + -Xmx1024m + -XX:PermSize=${PermGen} + -XX:MaxPermSize=${MaxPermGen} + + + -source + ${maven.compiler.source} + -target + ${maven.compiler.source} + + + + + + + + + + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/AbstractKafkaConnectComponent.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/AbstractKafkaConnectComponent.java new file mode 100644 index 000000000..f5a25b979 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/AbstractKafkaConnectComponent.java @@ -0,0 +1,223 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.connect; + + +import com.hurence.logisland.classloading.PluginProxy; +import com.hurence.logisland.component.ComponentFactory; +import com.hurence.logisland.connect.source.KafkaConnectStreamSourceProvider; +import org.apache.kafka.connect.connector.Connector; +import org.apache.kafka.connect.connector.ConnectorContext; +import org.apache.kafka.connect.connector.Task; +import org.apache.kafka.connect.errors.DataException; +import org.apache.kafka.connect.json.JsonConverter; +import org.apache.kafka.connect.storage.Converter; +import org.apache.kafka.connect.storage.OffsetBackingStore; +import org.apache.spark.sql.SQLContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.*; +import java.util.concurrent.atomic.AtomicBoolean; + +/** + * Kafka connect to spark sql streaming bridge. + * + * @author amarziali + */ +public abstract class AbstractKafkaConnectComponent { + + + private final static Logger LOGGER = LoggerFactory.getLogger(AbstractKafkaConnectComponent.class); + protected final T connector; + protected final List tasks = new ArrayList<>(); + protected final OffsetBackingStore offsetBackingStore; + protected final AtomicBoolean startWatch = new AtomicBoolean(false); + protected final String connectorName; + private final Map connectorProperties; + + protected final SQLContext sqlContext; + protected final Converter keyConverter; + protected final Converter valueConverter; + protected final int maxTasks; + protected final String streamId; + + + /** + * Base constructor. Should be called by {@link KafkaConnectStreamSourceProvider} + * + * @param sqlContext the spark sql context. + * @param connectorProperties the connector related properties. + * @param keyConverter the converter for the data key + * @param valueConverter the converter for the data body + * @param offsetBackingStore the backing store implementation (can be in-memory, file based, kafka based, etc...) + * @param maxTasks the maximum theoretical number of tasks this source should spawn. + * @param connectorClass the class of kafka connect source connector to wrap. + * @param streamId the Stream id. 
+ * + */ + public AbstractKafkaConnectComponent(SQLContext sqlContext, + Map connectorProperties, + Converter keyConverter, + Converter valueConverter, + OffsetBackingStore offsetBackingStore, + int maxTasks, + String connectorClass, + String streamId) { + try { + this.sqlContext = sqlContext; + this.maxTasks = maxTasks; + //instantiate connector + this.connectorName = connectorClass; + connector = ComponentFactory.loadComponent(connectorClass); + //create converters + this.keyConverter = keyConverter; + this.valueConverter = valueConverter; + this.connectorProperties = connectorProperties; + this.streamId = streamId; + + //Create the connector context + final ConnectorContext connectorContext = new ConnectorContext() { + @Override + public void requestTaskReconfiguration() { + try { + stopAllTasks(); + createAndStartAllTasks(); + } catch (Throwable t) { + LOGGER.error("Unable to reconfigure tasks for connector " + connectorName(), t); + } + } + + @Override + public void raiseError(Exception e) { + LOGGER.error("Connector " + connectorName() + " raised error : " + e.getMessage(), e); + } + }; + + LOGGER.info("Starting connector {}", connectorClass); + connector.initialize(connectorContext); + this.offsetBackingStore = offsetBackingStore; + + + } catch (Exception e) { + throw new DataException("Unable to create connector " + connectorName(), e); + } + + } + + public void start() { + try { + offsetBackingStore.start(); + //create and start tasks + createAndStartAllTasks(); + } catch (Exception e) { + try { + stop(); + } catch (Throwable t) { + LOGGER.error("Unable to properly stop tasks of connector " + connectorName(), t); + } + throw new DataException("Unable to start connector " + connectorName(), e); + } + } + + protected abstract void initialize(U task); + + /** + * Create all the {@link Runnable} workers needed to host the source tasks. + * + * @return + * @throws IllegalAccessException if task instantiation fails. + * @throws InstantiationException if task instantiation fails. + */ + protected void createAndStartAllTasks() throws IllegalAccessException, InstantiationException, ClassNotFoundException { + if (!startWatch.compareAndSet(false, true)) { + throw new IllegalStateException("Connector is already started"); + } + connector.start(connectorProperties); + Class taskClass = (Class) connector.taskClass(); + List> configs = connector.taskConfigs(maxTasks); + tasks.clear(); + LOGGER.info("Creating {} tasks for connector {}", configs.size(), connectorName()); + for (Map conf : configs) { + //create the task + U task = PluginProxy.create(taskClass.newInstance()); + initialize(task); + task.start(conf); + tasks.add(task); + + } + } + + + /** + * Create a converter to be used to translate internal data. + * Child classes can override this method to provide alternative converters. + * + * @return an instance of {@link Converter} + */ + protected Converter createInternalConverter(boolean isKey) { + JsonConverter internalConverter = new JsonConverter(); + internalConverter.configure(Collections.singletonMap("schemas.enable", "false"), isKey); + return internalConverter; + } + + /** + * Gets the connector name used by this stream source. + * + * @return + */ + protected String connectorName() { + return connectorName; + } + + + /** + * Stops every tasks running and serving for this connector. 
+ */ + protected void stopAllTasks() { + LOGGER.info("Stopping every tasks for connector {}", connectorName()); + while (!tasks.isEmpty()) { + try { + tasks.remove(0).stop(); + } catch (Throwable t) { + LOGGER.warn("Error occurring while stopping a task of connector " + connectorName(), t); + } + } + } + + protected void stop() { + if (!startWatch.compareAndSet(true, false)) { + throw new IllegalStateException("Connector is not started"); + } + LOGGER.info("Stopping connector {}", connectorName()); + stopAllTasks(); + offsetBackingStore.stop(); + connector.stop(); + } + + + /** + * Check the stream source state. + * + * @return + */ + public boolean isRunning() { + return startWatch.get(); + } + + +} + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/Utils.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/Utils.java new file mode 100644 index 000000000..cc95a24cf --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/Utils.java @@ -0,0 +1,109 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.connect; + +import com.hurence.logisland.stream.spark.provider.StreamOptions; +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.connect.runtime.WorkerConfig; +import org.apache.kafka.connect.runtime.distributed.DistributedConfig; +import org.apache.kafka.connect.runtime.standalone.StandaloneConfig; +import org.apache.kafka.connect.storage.*; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Map; +import java.util.Properties; +import java.util.stream.Collectors; + +public class Utils { + + + /** + * Configuration definition for {@link MemoryOffsetBackingStore} + */ + private static class MemoryConfig extends WorkerConfig { + public MemoryConfig(Map props) { + super(new ConfigDef(), props); + } + } + + /** + * Configuration definition for {@link FileOffsetBackingStore} + */ + private static class FileConfig extends WorkerConfig { + public FileConfig(Map props) { + super(new ConfigDef() + .define(StandaloneConfig.OFFSET_STORAGE_FILE_FILENAME_CONFIG, + ConfigDef.Type.STRING, + ConfigDef.Importance.HIGH, + "file to store offset data in") + , props); + } + } + + /** + * Configuration definition for {@link KafkaOffsetBackingStore} + */ + private static class KafkaConfig extends WorkerConfig { + public KafkaConfig(Map props) { + super(new ConfigDef() + .define(BOOTSTRAP_SERVERS_CONFIG, + ConfigDef.Type.LIST, + BOOTSTRAP_SERVERS_DEFAULT, + ConfigDef.Importance.HIGH, + BOOTSTRAP_SERVERS_DOC) + .define(DistributedConfig.OFFSET_STORAGE_TOPIC_CONFIG, + ConfigDef.Type.STRING, + ConfigDef.Importance.HIGH, + "kafka topic to store connector offsets in") + , props); + } + } + + public static Converter createConverter(String converterClassName, String propertiesAsString, boolean isKey) + throws ClassNotFoundException, IllegalAccessException, InstantiationException, IOException { + Converter ret = (Converter) Class.forName(converterClassName).newInstance(); + ret.configure(propertiesToMap(propertiesAsString), isKey); + return ret; + } + + public static Map propertiesToMap(String propertiesAsString) throws IOException { + Properties props = new Properties(); + props.load(new StringReader(propertiesAsString)); + return props.entrySet().stream().collect(Collectors.toMap(e -> e.getKey().toString(), e -> e.getValue().toString())); + } + + public static OffsetBackingStore createOffsetBackingStore(String type, Map properties) { + WorkerConfig workerConfig = null; + OffsetBackingStore offsetBackingStore; + if (StreamOptions.FILE_BACKING_STORE().getValue().equals(type)) { + offsetBackingStore = new FileOffsetBackingStore(); + workerConfig = new FileConfig(properties); + } else if (StreamOptions.MEMORY_BACKING_STORE().getValue().equals(type)) { + offsetBackingStore = new MemoryOffsetBackingStore(); + workerConfig = new MemoryConfig(properties); + } else if (StreamOptions.KAFKA_BACKING_STORE().getValue().equals(type)) { + offsetBackingStore = new KafkaOffsetBackingStore(); + workerConfig = new KafkaConfig(properties); + } else { + throw new IllegalArgumentException(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE().getName() + + " must be set!"); + } + offsetBackingStore.configure(workerConfig); + return offsetBackingStore; + } + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/converter/LogIslandRecordConverter.java 
b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/converter/LogIslandRecordConverter.java new file mode 100644 index 000000000..4ad94cdd9 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/converter/LogIslandRecordConverter.java @@ -0,0 +1,232 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.connect.converter; + +import com.hurence.logisland.record.Field; +import com.hurence.logisland.record.*; +import com.hurence.logisland.serializer.RecordSerializer; +import com.hurence.logisland.serializer.SerializerProvider; +import com.hurence.logisland.stream.StreamProperties; +import org.apache.kafka.connect.data.*; +import org.apache.kafka.connect.errors.DataException; +import org.apache.kafka.connect.storage.Converter; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.lang.reflect.Array; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.util.Collection; +import java.util.Date; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.stream.Collectors; + +public class LogIslandRecordConverter implements Converter { + + /** + * Record serializer class (instance of {@link RecordSerializer}) + */ + public static final String PROPERTY_RECORD_SERIALIZER = "record.serializer"; + /** + * Avro schema to use (only apply to {@link com.hurence.logisland.serializer.AvroSerializer}) + */ + public static final String PROPERTY_AVRO_SCHEMA = "avro.schema"; + + /** + * The record type to use. If not provided {@link LogIslandRecordConverter#PROPERTY_RECORD_TYPE} will be used. + */ + public static final String PROPERTY_RECORD_TYPE = StreamProperties.RECORD_TYPE().getName(); + + /** + * The default type for logisland {@link Record} created by this converter. 
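+     * <p>
+     * Illustrative converter configuration (property names as declared above; the serializer class is the
+     * Avro serializer already referenced in this file, and the schema value is only a placeholder needed
+     * when that serializer is used):
+     * <pre>
+     *   record.serializer=com.hurence.logisland.serializer.AvroSerializer
+     *   avro.schema={...the Avro schema of the serialized records...}
+     * </pre>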
+ */ + private static final String DEFAULT_RECORD_TYPE = "kafka_connect"; + + private RecordSerializer recordSerializer; + private String recordType; + private boolean isKey; + + + @Override + public void configure(Map configs, boolean isKey) { + recordSerializer = SerializerProvider.getSerializer((String) configs.get(PROPERTY_RECORD_SERIALIZER), (String) configs.get(PROPERTY_AVRO_SCHEMA)); + recordType = ((Map) configs).getOrDefault(PROPERTY_RECORD_TYPE, DEFAULT_RECORD_TYPE).toString(); + this.isKey = isKey; + } + + @Override + public byte[] fromConnectData(String topic, Schema schema, Object value) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + recordSerializer.serialize(baos, + new StandardRecord(recordType).setField(toFieldRecursive(FieldDictionary.RECORD_VALUE, schema, value, isKey))); + return baos.toByteArray(); + } catch (IOException ioe) { + throw new DataException("Unexpected IO Exception occurred while serializing data [topic " + topic + "]", ioe); + } + + } + + @Override + public SchemaAndValue toConnectData(String topic, byte[] value) { + try (ByteArrayInputStream bais = new ByteArrayInputStream(value)) { + Record r = recordSerializer.deserialize(bais); + Schema schema = toSchemaRecursive(r); + return new SchemaAndValue(schema, toObjectRecursive(r, schema)); + } catch (IOException ioe) { + throw new DataException("Unexpected IO Exception occurred while serializing data [topic " + topic + "]", ioe); + } + } + + + public Object toObjectRecursive(Object o, Schema schema) { + if (o instanceof Collection) { + return ((Collection) o).stream().map(elem -> toObjectRecursive(elem, schema.schema())); + } else if (o instanceof Map) { + Struct ret = new Struct(schema); + ((Map) o).forEach((k, v) -> ret.put(k.toString(), toObjectRecursive(o, schema.field(k.toString()).schema()))); + return ret; + } else if (o instanceof Record) { + Struct ret = new Struct(schema); + ((Record) o).getAllFieldsSorted().forEach(field -> ret.put(field.getName(), toObjectRecursive(field.getRawValue(), schema.field(field.getName()).schema()))); + return ret; + } + return o; + } + + private SchemaBuilder toSchemaRecursive(Object o) { + if (o instanceof Byte) { + return SchemaBuilder.bytes().optional(); + } else if (o instanceof Short) { + return SchemaBuilder.int16().optional(); + } else if (o instanceof Integer) { + return SchemaBuilder.int32().optional(); + + } else if (o instanceof Long) { + return SchemaBuilder.int64().optional(); + + } else if (o instanceof Float) { + return SchemaBuilder.float32().optional(); + + } else if (o instanceof Double) { + return SchemaBuilder.float64().optional(); + + } else if (o instanceof Boolean) { + return SchemaBuilder.bool().optional(); + } else if (o instanceof byte[]) { + return SchemaBuilder.bytes().optional(); + } else if (o instanceof Collection) { + return SchemaBuilder.array(toSchemaRecursive((Array.getLength(o) > 0 ? 
Array.get(o, 0) : null))).optional(); + } else if (o instanceof Map) { + SchemaBuilder sb = SchemaBuilder.struct(); + ((Map) o).forEach((k, v) -> sb.field(k.toString(), toSchemaRecursive(v))); + return sb.optional(); + } else if (o instanceof Record) { + SchemaBuilder sb = SchemaBuilder.struct(); + ((Record) o).getAllFieldsSorted().forEach(field -> sb.field(field.getName(), toSchemaRecursive(field.getRawValue()))); + return sb.optional(); + } + return SchemaBuilder.string().optional(); + } + + + private Field toFieldRecursive(String name, Schema schema, Object value, boolean isKey) { + try { + if (value == null) { + return new Field(name, FieldType.NULL, null); + } + final Schema.Type schemaType; + if (schema == null) { + schemaType = ConnectSchema.schemaType(value.getClass()); + if (schemaType == null) + throw new DataException("Java class " + value.getClass() + " does not have corresponding schema type."); + } else { + schemaType = schema.type(); + } + switch (schemaType) { + case INT8: + case INT16: + case INT32: + return new Field(name, FieldType.INT, value); + case INT64: + Object toSet = value; + if (value instanceof Date) { + toSet = ((Date) value).getTime(); + } else if (value instanceof Instant) { + toSet = ((Instant) value).toEpochMilli(); + } + return new Field(name, FieldType.LONG, toSet); + case FLOAT32: + return new Field(name, FieldType.FLOAT, value); + case FLOAT64: + return new Field(name, FieldType.DOUBLE, value); + case BOOLEAN: + return new Field(name, FieldType.BOOLEAN, value); + case STRING: + return new Field(name, FieldType.STRING, value); + case BYTES: + byte[] bytes = null; + if (value instanceof byte[]) { + bytes = (byte[]) value; + } else if (value instanceof ByteBuffer) { + bytes = ((ByteBuffer) value).array(); + } else { + //throw new DataException("Invalid type for bytes type: " + value.getClass()); + //AM: fix to handle special cases (see oracle jdbc issues) + return new Field(name, FieldType.STRING, value != null ? value.toString() : value); + } + return new Field(name, FieldType.BYTES, bytes); + case ARRAY: { + return new Field(name, FieldType.ARRAY, + ((Collection) value).stream().map(item -> { + Schema valueSchema = schema == null ? 
null : schema.valueSchema(); + return toFieldRecursive(FieldDictionary.RECORD_VALUE, valueSchema, item, true); + }) + .map(Field::getRawValue) + .collect(Collectors.toList())); + } + case MAP: { + return new Field(name, FieldType.MAP, new LinkedHashMap<>((Map) value)); + } + case STRUCT: { + Struct struct = (Struct) value; + + if (struct.schema() != schema) { + throw new DataException("Mismatching schema."); + } + if (isKey) { + Map ret = new HashMap<>(); + struct.schema().fields().stream().filter(field -> !(field.schema().isOptional() && struct.get(field) == null)) + .forEach(field -> ret.put(field.name(), toFieldRecursive(field.name(), field.schema(), struct.get(field), true).getRawValue())); + return new Field(name, FieldType.MAP, ret); + } else { + Record ret = new StandardRecord(); + struct.schema().fields().stream() + .filter(field -> !(field.schema().isOptional() && struct.get(field) == null)) + .forEach(field -> ret.setField(toFieldRecursive(field.name(), field.schema(), struct.get(field), true))); + return new Field(name, FieldType.RECORD, ret); + } + + } + } + throw new DataException("Couldn't convert " + value + " to a logisland Record."); + } catch (ClassCastException e) { + throw new DataException("Invalid type for " + schema.type() + ": " + value.getClass()); + } + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/KafkaConnectStreamSink.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/KafkaConnectStreamSink.java new file mode 100644 index 000000000..d84170198 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/KafkaConnectStreamSink.java @@ -0,0 +1,139 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.connect.sink; + +import com.google.common.collect.ListMultimap; +import com.google.common.collect.Multimaps; +import com.hurence.logisland.connect.AbstractKafkaConnectComponent; +import com.hurence.logisland.connect.source.KafkaConnectStreamSourceProvider; +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.sink.SinkConnector; +import org.apache.kafka.connect.sink.SinkRecord; +import org.apache.kafka.connect.sink.SinkTask; +import org.apache.kafka.connect.storage.Converter; +import org.apache.kafka.connect.storage.OffsetBackingStore; +import org.apache.spark.sql.SQLContext; +import scala.Tuple2; + +import java.util.*; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Kafka {@link SinkConnector} to logisland adapter. 
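+ * <p>
+ * Minimal usage sketch (illustrative only; {@code keyBytes} and {@code valueBytes} stand for already
+ * serialized data and {@code sink} is assumed to have been built with the constructor below):
+ * <pre>
+ *   sink.start();                                      // starts the offset store and the sink tasks
+ *   sink.openPartition(0);                             // binds partition 0 to one of the tasks
+ *   sink.enqueueOnPartition(0, keyBytes, valueBytes);  // converts and buffers a SinkRecord
+ *   sink.flushPartition(0);                            // put() + flush() on the underlying SinkTask
+ * </pre>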
+ * + * @author amarziali + */ +public class KafkaConnectStreamSink extends AbstractKafkaConnectComponent { + + + private final ListMultimap bufferedRecords = Multimaps.synchronizedListMultimap( + Multimaps.newListMultimap(new HashMap<>(), ArrayList::new)); + private final Map contexts = new IdentityHashMap<>(); + + private final Map> partitions = Collections.synchronizedMap(new HashMap<>()); + + + private final String topic; + private final AtomicLong counter = new AtomicLong(); + + /** + * Base constructor. + * + * @param sqlContext the spark sql context. + * @param connectorProperties the connector related properties. + * @param keyConverter the converter for the data key + * @param valueConverter the converter for the data body + * @param offsetBackingStore the backing store implementation (can be in-memory, file based, kafka based, etc...) + * @param maxTasks the maximum theoretical number of tasks this source should spawn. + * @param connectorClass the class of kafka connect source connector to wrap. + * @param streamId the id of the underlying stream + */ + public KafkaConnectStreamSink(SQLContext sqlContext, + Map connectorProperties, + Converter keyConverter, + Converter valueConverter, + OffsetBackingStore offsetBackingStore, + int maxTasks, + String topic, + String connectorClass, + String streamId) { + super(sqlContext, connectorProperties, keyConverter, valueConverter, offsetBackingStore, maxTasks, connectorClass, streamId); + this.topic = topic; + } + + + @Override + protected void initialize(SinkTask task) { + SimpleSinkTaskContext sstc = new SimpleSinkTaskContext(topic); + task.initialize(sstc); + contexts.put(task, sstc); + } + + public boolean openPartition(int partition) { + Tuple2 ret = partitions.computeIfAbsent(partition, + part -> { + SinkTask task = tasks.get(partition % tasks.size()); + TopicPartition tp = new TopicPartition(topic, part); + task.open(Collections.singleton(tp)); + SimpleSinkTaskContext tk = contexts.get(task); + return Tuple2.apply(task, tk); + }); + + return ret._2().assignThenState(partition); + } + + public void enqueueOnPartition(int partition, byte[] key, byte[] value) { + SchemaAndValue keySV = keyConverter.toConnectData(topic, key); + SchemaAndValue valueSV = valueConverter.toConnectData(topic, value); + + bufferedRecords.put(partition, + new SinkRecord(topic, + partition, + keySV.schema(), + keySV.value(), + valueSV.schema(), + valueSV.value(), + counter.incrementAndGet())); + } + + public void flushPartition(int partition) { + List records = bufferedRecords.get(partition); + if (!records.isEmpty()) { + partitions.get(partition)._1().put(records); + partitions.get(partition)._1().flush(Collections.singletonMap( + new TopicPartition(topic, partition), + new OffsetAndMetadata(records.get(records.size() - 1).kafkaOffset())) + ); + bufferedRecords.removeAll(partition); + + } + } + + @Override + protected void stopAllTasks() { + try { + super.stopAllTasks(); + } finally { + counter.set(0); + contexts.clear(); + bufferedRecords.clear(); + partitions.clear(); + } + + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/NullSink.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/NullSink.java new file mode 100644 index 000000000..34b2bbcc1 --- /dev/null +++ 
b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/NullSink.java @@ -0,0 +1,96 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.connect.sink; + +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.connect.connector.Task; +import org.apache.kafka.connect.sink.SinkConnector; +import org.apache.kafka.connect.sink.SinkRecord; +import org.apache.kafka.connect.sink.SinkTask; + +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * A busybox {@link SinkConnector} + * + * @author amarziali + */ +public class NullSink extends SinkConnector { + + public static class NullSinkTask extends SinkTask { + @Override + public void start(Map props) { + + } + + @Override + public void put(Collection records) { + + } + + @Override + public void flush(Map offsets) { + + } + + @Override + public void stop() { + + } + + @Override + public String version() { + return ""; + } + } + + @Override + public String version() { + return ""; + } + + @Override + public void start(Map props) { + + } + + @Override + public Class taskClass() { + return NullSinkTask.class; + } + + @Override + public List> taskConfigs(int maxTasks) { + return IntStream.range(0, maxTasks).mapToObj(i -> Collections.emptyMap()).collect(Collectors.toList()); + } + + @Override + public void stop() { + + } + + @Override + public ConfigDef config() { + return new ConfigDef(); + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/SimpleSinkTaskContext.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/SimpleSinkTaskContext.java new file mode 100644 index 000000000..05c662318 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/SimpleSinkTaskContext.java @@ -0,0 +1,81 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.connect.sink; + +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.connect.sink.SinkTaskContext; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * A simple version of {@link SinkTaskContext} + * + * @author amarziali + */ +public class SimpleSinkTaskContext implements SinkTaskContext { + + private final Map state = Collections.synchronizedMap(new HashMap<>()); + private final String topic; + + public SimpleSinkTaskContext(String topic) { + this.topic = topic; + } + + @Override + public void offset(Map offsets) { + //not implemented + } + + @Override + public void offset(TopicPartition tp, long offset) { + //not implemented + } + + @Override + public void timeout(long timeoutMs) { + //not implemented + } + + @Override + public Set assignment() { + return state.entrySet().stream().filter(Map.Entry::getValue) + .map(entry -> new TopicPartition(topic, entry.getKey())) + .collect(Collectors.toSet()); + } + + @Override + public void pause(TopicPartition... partitions) { + Arrays.stream(partitions).map(TopicPartition::partition).forEach(p -> state.put(p, false)); + + } + + @Override + public void resume(TopicPartition... partitions) { + Arrays.stream(partitions).map(TopicPartition::partition).forEach(p -> state.put(p, true)); + + } + + @Override + public void requestCommit() { + + } + + public boolean assignThenState(int partition) { + return state.computeIfAbsent(partition, p -> true); + } + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSource.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSource.java new file mode 100644 index 000000000..dabf03390 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSource.java @@ -0,0 +1,286 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.connect.source; + + +import com.hurence.logisland.connect.AbstractKafkaConnectComponent; +import com.hurence.logisland.stream.spark.provider.StreamOptions; +import com.hurence.logisland.util.spark.SparkPlatform; +import org.apache.commons.lang3.StringUtils; +import org.apache.kafka.connect.runtime.WorkerSourceTaskContext; +import org.apache.kafka.connect.source.SourceConnector; +import org.apache.kafka.connect.source.SourceRecord; +import org.apache.kafka.connect.source.SourceTask; +import org.apache.kafka.connect.storage.Converter; +import org.apache.kafka.connect.storage.OffsetBackingStore; +import org.apache.kafka.connect.storage.OffsetStorageReaderImpl; +import org.apache.kafka.connect.storage.OffsetStorageWriter; +import org.apache.kafka.connect.util.ConnectorTaskId; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.execution.streaming.Offset; +import org.apache.spark.sql.execution.streaming.SerializedOffset; +import org.apache.spark.sql.execution.streaming.Source; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Option; +import scala.Tuple2; +import scala.collection.JavaConversions; + +import java.util.*; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +/** + * Kafka connect to spark sql streaming bridge. + * + * @author amarziali + */ +public class KafkaConnectStreamSource extends AbstractKafkaConnectComponent implements Source { + + + /** + * The Schema used for this source. + */ + public final static StructType SCHEMA = new StructType(new StructField[]{ + new StructField(StreamOptions.KAFKA_CONNECT_CONNECTOR_PROPERTIES().getName(), + DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType), false, Metadata.empty()), + new StructField(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER().getName(), + DataTypes.StringType, false, Metadata.empty()), + new StructField(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER_PROPERTIES().getName(), + DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType), false, Metadata.empty()), + new StructField(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER().getName(), + DataTypes.StringType, false, Metadata.empty()), + new StructField(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER_PROPERTIES().getName(), + DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType), false, Metadata.empty()), + new StructField(StreamOptions.KAFKA_CONNECT_MAX_TASKS().getName(), + DataTypes.createMapType(DataTypes.IntegerType, DataTypes.StringType), false, Metadata.empty()) + }); + /** + * The schema used to represent the outgoing dataframe. 
+ */ + public final static StructType DATA_SCHEMA = new StructType(new StructField[]{ + new StructField("topic", DataTypes.StringType, false, Metadata.empty()), + new StructField("sourcePartition", DataTypes.StringType, false, Metadata.empty()), + new StructField("sourceOffset", DataTypes.StringType, false, Metadata.empty()), + new StructField("key", DataTypes.BinaryType, true, Metadata.empty()), + new StructField("value", DataTypes.BinaryType, false, Metadata.empty()) + + }); + private final static Logger LOGGER = LoggerFactory.getLogger(KafkaConnectStreamSource.class); + + private final AtomicLong counter = new AtomicLong(); + private final AtomicInteger taskCounter = new AtomicInteger(); + + private final Map offsetWriterMap = new IdentityHashMap<>(); + private final SortedMap>> bufferedRecords = + Collections.synchronizedSortedMap(new TreeMap<>()); + private final SortedMap>> uncommittedRecords = + Collections.synchronizedSortedMap(new TreeMap<>()); + private final Map busyTasks = Collections.synchronizedMap(new IdentityHashMap<>()); + + private final SparkPlatform sparkPlatform = StreamSupport.stream( + Spliterators.spliteratorUnknownSize(ServiceLoader.load(SparkPlatform.class).iterator(), Spliterator.ORDERED), + false).findFirst().orElseThrow(() -> new IllegalStateException("SparkPlatform service spi not defined. " + + "Unable to continue")); + + + /** + * Base constructor. Should be called by {@link KafkaConnectStreamSourceProvider} + * + * @param sqlContext the spark sql context. + * @param connectorProperties the connector related properties. + * @param keyConverter the converter for the data key + * @param valueConverter the converter for the data body + * @param offsetBackingStore the backing store implementation (can be in-memory, file based, kafka based, etc...) + * @param maxTasks the maximum theoretical number of tasks this source should spawn. + * @param connectorClass the class of kafka connect source connector to wrap. 
+ * @param streamId the id of the underlying stream + */ + public KafkaConnectStreamSource(SQLContext sqlContext, + Map connectorProperties, + Converter keyConverter, + Converter valueConverter, + OffsetBackingStore offsetBackingStore, + int maxTasks, + String connectorClass, + String streamId) { + super(sqlContext, connectorProperties, keyConverter, valueConverter, offsetBackingStore, maxTasks, connectorClass, streamId); + + } + + + @Override + protected void initialize(SourceTask task) { + int taskId = taskCounter.incrementAndGet(); + ConnectorTaskId connectorTaskId = new ConnectorTaskId(StringUtils.join(new String[]{streamId, connectorName}, '#'), taskId); + task.initialize(new WorkerSourceTaskContext(new OffsetStorageReaderImpl(offsetBackingStore, connectorTaskId.toString(), + createInternalConverter(true), createInternalConverter(false)))); + offsetWriterMap.put(task, new OffsetStorageWriter(offsetBackingStore, connectorTaskId.toString(), + createInternalConverter(true), createInternalConverter(false))); + + } + + + @Override + public StructType schema() { + return SCHEMA; + } + + @Override + protected void createAndStartAllTasks() throws IllegalAccessException, InstantiationException, ClassNotFoundException { + counter.set(0); + taskCounter.set(0); + busyTasks.clear(); + bufferedRecords.clear(); + offsetWriterMap.clear(); + super.createAndStartAllTasks(); + } + + @Override + public synchronized Option getOffset() { + if (!uncommittedRecords.isEmpty()) { + return Option.apply(SerializedOffset.apply(Long.toString(counter.incrementAndGet()))); + } + if (bufferedRecords.isEmpty()) { + tasks.forEach(t -> busyTasks.computeIfAbsent(t, sourceTask -> { + Thread thread = new Thread(() -> { + try { + List> tmp = sourceTask.poll().stream() + .map(sourceRecord -> Tuple2.apply(sourceTask, sourceRecord)) + .collect(Collectors.toList()); + if (!tmp.isEmpty()) { + bufferedRecords.put(counter.incrementAndGet(), tmp); + } + } catch (InterruptedException ie) { + LOGGER.warn("Task {} interrupted while waiting.", sourceTask.getClass().getCanonicalName()); + } finally { + busyTasks.remove(t); + } + }); + thread.start(); + return thread; + })); + } else { + return Option.apply(SerializedOffset.apply(bufferedRecords.lastKey().toString())); + + } + return Option.empty(); + } + + + @Override + public Dataset getBatch(Option start, Offset end) { + Long startOff = start.isDefined() ? Long.parseLong(start.get().json()) : + !bufferedRecords.isEmpty() ? 
bufferedRecords.firstKey() : 0L; + + Map> current = + new LinkedHashMap<>(bufferedRecords.subMap(startOff, Long.parseLong(end.json()) + 1)) + .keySet().stream() + .flatMap(offset -> { + List> srl = bufferedRecords.remove(offset); + if (srl != null) { + uncommittedRecords.put(offset, srl); + return srl.stream(); + } + return Stream.empty(); + }) + .map(Tuple2::_2) + .map(sourceRecord -> InternalRow.fromSeq(JavaConversions.asScalaBuffer(Arrays.asList( + toUTFString(sourceRecord.topic()), + toUTFString(sourceRecord.sourcePartition()), + toUTFString(sourceRecord.sourceOffset()), + keyConverter.fromConnectData(sourceRecord.topic(), sourceRecord.keySchema(), sourceRecord.key()), + valueConverter.fromConnectData(sourceRecord.topic(), sourceRecord.valueSchema(), sourceRecord.value()) + )).toSeq())) + .collect(Collectors.groupingBy(row -> Objects.hashCode((row.getString(1))))); + return sparkPlatform.createStreamingDataFrame(sqlContext, new SimpleRDD(sqlContext.sparkContext(), current), DATA_SCHEMA); + + + } + + private UTF8String toUTFString(Object o) { + if (o != null) { + return UTF8String.fromString(o.toString()); + } + return UTF8String.EMPTY_UTF8; + } + + @Override + public void commit(Offset end) { + if (uncommittedRecords.isEmpty()) { + return; + } + //first commit all offsets already given + List> recordsToCommit = + new LinkedHashMap<>(uncommittedRecords.subMap(uncommittedRecords.firstKey(), Long.parseLong(end.json()) + 1)).keySet().stream() + .flatMap(key -> uncommittedRecords.remove(key).stream()) + .collect(Collectors.toList()); + + recordsToCommit.forEach(tuple -> { + try { + offsetWriterMap.get(tuple._1()).offset(tuple._2().sourcePartition(), tuple._2().sourceOffset()); + tuple._1().commitRecord(tuple._2()); + } catch (Exception e) { + LOGGER.warn("Unable to commit record " + tuple._2(), e); + } + }); + recordsToCommit.stream().map(Tuple2::_1).distinct().forEach(sourceTask -> { + try { + sourceTask.commit(); + } catch (Exception e) { + LOGGER.warn("Unable to bulk commit offset for connector " + connectorName, e); + } + }); + //now flush offset writer + offsetWriterMap.values().forEach(offsetStorageWriter -> { + try { + if (offsetStorageWriter.beginFlush()) { + offsetStorageWriter.doFlush((error, result) -> { + if (error == null) { + LOGGER.debug("Flushing till offset {} with result {}", end, result); + } else { + LOGGER.error("Unable to commit records till source offset " + end, error); + + } + }).get(30, TimeUnit.SECONDS); + } + } catch (Exception e) { + LOGGER.error("Unable to commit records till source offset " + end, e); + } + }); + } + + + @Override + public void stop() { + super.stop(); + } + +} + + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSourceProvider.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSourceProvider.java new file mode 100644 index 000000000..8d97a277c --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSourceProvider.java @@ -0,0 +1,73 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance 
with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.connect.source; + +import com.hurence.logisland.connect.Utils; +import com.hurence.logisland.stream.spark.provider.StreamOptions; +import org.apache.kafka.connect.source.SourceConnector; +import org.apache.kafka.connect.storage.Converter; +import org.apache.kafka.connect.storage.OffsetBackingStore; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.execution.streaming.Source; +import org.apache.spark.sql.sources.StreamSourceProvider; +import org.apache.spark.sql.types.StructType; +import scala.Option; +import scala.Tuple2; +import scala.collection.immutable.Map; + +/** + * A {@link StreamSourceProvider} capable of creating spark {@link com.hurence.logisland.stream.spark.structured.StructuredStream} + * enabled kafka sources. + * + * @author amarziali + */ +public class KafkaConnectStreamSourceProvider implements StreamSourceProvider { + + @Override + public Source createSource(SQLContext sqlContext, String metadataPath, Option schema, String providerName, Map parameters) { + try { + Converter keyConverter = Utils.createConverter(parameters.get(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER().getName()).get(), + parameters.get(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER_PROPERTIES().getName()).get(), true); + Converter valueConverter = Utils.createConverter(parameters.get(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER().getName()).get(), + parameters.get(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER_PROPERTIES().getName()).get(), false); + //create the right backing store + String bs = parameters.get(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE().getName()).get(); + java.util.Map offsetBackingStoreProperties = + Utils.propertiesToMap(parameters.get(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE_PROPERTIES().getName()).get()); + OffsetBackingStore offsetBackingStore = Utils.createOffsetBackingStore(bs, offsetBackingStoreProperties); + + KafkaConnectStreamSource ret = new KafkaConnectStreamSource(sqlContext, + Utils.propertiesToMap(parameters.get(StreamOptions.KAFKA_CONNECT_CONNECTOR_PROPERTIES().getName()).get()), + keyConverter, + valueConverter, + offsetBackingStore, + Integer.parseInt(parameters.get(StreamOptions.KAFKA_CONNECT_MAX_TASKS().getName()).get()), + parameters.get(StreamOptions.KAFKA_CONNECT_CONNECTOR_CLASS().getName()).get(), + parameters.get("path").get()); + ret.start(); + return ret; + } catch (Exception e) { + throw new IllegalArgumentException("Unable to create kafka connect stream source: " + e.getMessage(), e); + } + + + } + + @Override + public Tuple2 sourceSchema(SQLContext sqlContext, Option schema, String providerName, Map parameters) { + return Tuple2.apply(providerName, KafkaConnectStreamSource.DATA_SCHEMA); + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimplePartition.java 
b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimplePartition.java new file mode 100644 index 000000000..171e74bd4 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimplePartition.java @@ -0,0 +1,51 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.connect.source; + +import org.apache.spark.Partition; + +/** + * Simple partition. + * + * @author amarziali + */ +public class SimplePartition implements Partition { + + private final int index; + private final int hash; + + public SimplePartition(int index, int hash) { + this.index = index; + this.hash = hash; + } + + @Override + public int index() { + return index; + } + + public int getHash() { + return hash; + } + + @Override + public String toString() { + return "SimplePartition{" + + "index=" + index + + ", hash=" + hash + + '}'; + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimpleRDD.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimpleRDD.java new file mode 100644 index 000000000..9768dcf30 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimpleRDD.java @@ -0,0 +1,64 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.connect.source; + +import org.apache.spark.Dependency; +import org.apache.spark.Partition; +import org.apache.spark.SparkContext; +import org.apache.spark.TaskContext; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.catalyst.InternalRow; +import scala.collection.Iterator; +import scala.collection.JavaConversions; +import scala.reflect.ClassTag$; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +/** + * Simple kafka connect source partitioned RDD. 
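+ * <p>
+ * Each entry of the backing map becomes one {@link SimplePartition}: the map key is carried as the
+ * partition hash and {@code compute()} simply returns the rows stored under that hash.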
+ * + * @author amarziali + */ +public class SimpleRDD extends RDD { + + final Map> data; + + public SimpleRDD(SparkContext _sc, Map> data) { + super(_sc, JavaConversions.collectionAsScalaIterable(Collections.>emptyList()).toSeq(), + ClassTag$.MODULE$.apply(InternalRow.class)); + this.data = data; + } + + @Override + public Iterator compute(Partition split, TaskContext context) { + return JavaConversions.collectionAsScalaIterable(data.get(((SimplePartition)split).getHash())).iterator(); + } + + @Override + public Partition[] getPartitions() { + Partition[] ret = new SimplePartition[data.size()]; + int j = 0; + for (Integer i : data.keySet()) { + ret[j] = new SimplePartition(j, i); + j++; + } + return ret; + + } + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceConnector.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceConnector.java new file mode 100644 index 000000000..65a29c89e --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceConnector.java @@ -0,0 +1,136 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.connect.source.timed; + +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.connect.connector.Task; +import org.apache.kafka.connect.source.SourceConnector; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * A connector that emits an empty record at fixed rate waking up the processing pipeline. 
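+ * <p>
+ * Illustrative configuration sketch (values are examples only; the property keys are the ones defined
+ * below, and when both are provided the cron expression takes precedence over the rate):
+ * <pre>
+ *   rate=60000                        (emit once per minute)
+ *   poll.cron=0 0/5 * * * *           (or: emit every 5 minutes, Spring cron syntax)
+ *   snapshot.field=snapshot
+ *   date.field=snapshot_date
+ *   date.format=yyyy-MM-dd HH:mm:ss z
+ *   date.timezone=CET
+ * </pre>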
+ * + * @author amarziali + */ +public class ClockSourceConnector extends SourceConnector { + + public static final String RATE = "rate"; + public static final String POLL_CRON_SCHEDULER_CONFIG = "poll.cron"; + public static final String SNAPSHOT_FIELD_CONFIG = "snapshot.field"; + public static final String TSID_FIELD_CONFIG = "tsid.field"; + public static final String DATE_FIELD_CONFIG = "date.field"; + public static final String DATE_FORMAT_CONFIG = "date.format"; + public static final String DATE_TIMEZONE_CONFIG = "date.timezone"; + public static final String RECORD_ID_FIELD_CONFIG = "record.id.field"; + public static final String HAS_ONGOING_RECORD_CONFIG = "has.ongoing.record"; + public static final String HAS_PREVIOUS_RECORD_CONFIG = "has.previous.record"; + public static final String CURRENT_RECORD_ID_VALUE_CONFIG = "current.record.id.value"; + + + public static final long RATE_DEFAULT = 60000; + public static final String POLL_CRON_SCHEDULER_DEFAULT = null; + public static final String SNAPSHOT_FIELD_DEFAULT = null; + public static final String TSID_FIELD_DEFAULT = null; + public static final String DATE_FIELD_DEFAULT = null; + public static final String DATE_FORMAT_DEFAULT = "yyyy-MM-dd HH:mm:ss z"; + public static final String DATE_TIMEZONE_DEFAULT = "CET"; + public static final String RECORD_ID_FIELD_DEFAULT = "id"; + public static final boolean HAS_ONGOING_RECORD_DEFAULT = false; + public static final boolean HAS_PREVIOUS_RECORD_DEFAULT = false; + public static final String CURRENT_RECORD_ID_VALUE_DEFAULT = "clockRecord"; + + + private static final ConfigDef CONFIG = new ConfigDef() + .define(RATE, ConfigDef.Type.LONG, RATE_DEFAULT, ConfigDef.Importance.HIGH, "The clock rate in milliseconds") + .define(POLL_CRON_SCHEDULER_CONFIG, ConfigDef.Type.STRING, POLL_CRON_SCHEDULER_DEFAULT, ConfigDef.Importance.HIGH, "The cron expression") + .define(SNAPSHOT_FIELD_CONFIG, ConfigDef.Type.STRING, SNAPSHOT_FIELD_DEFAULT, ConfigDef.Importance.HIGH, "Name of the field containing the snapshot id") + .define(TSID_FIELD_CONFIG, ConfigDef.Type.STRING, TSID_FIELD_DEFAULT, ConfigDef.Importance.HIGH, "Name of the field containing the ordering column") + .define(DATE_FIELD_CONFIG, ConfigDef.Type.STRING, DATE_FIELD_DEFAULT, ConfigDef.Importance.HIGH, "Name of the field containing the date in human readable format") + .define(DATE_FORMAT_CONFIG, ConfigDef.Type.STRING, DATE_FORMAT_DEFAULT, ConfigDef.Importance.HIGH, "Format to use to display date in human readable-format") + .define(DATE_TIMEZONE_CONFIG, ConfigDef.Type.STRING, DATE_TIMEZONE_DEFAULT, ConfigDef.Importance.HIGH, "Timezone to use to display date in human readable-format") + .define(RECORD_ID_FIELD_CONFIG, ConfigDef.Type.STRING, RECORD_ID_FIELD_DEFAULT, ConfigDef.Importance.HIGH, "Name of the field containing the id of the record") + .define(HAS_ONGOING_RECORD_CONFIG, ConfigDef.Type.BOOLEAN, HAS_ONGOING_RECORD_DEFAULT, ConfigDef.Importance.HIGH, "If set to true, it will produce an additional record with ongoing snapshot details") + .define(HAS_PREVIOUS_RECORD_CONFIG, ConfigDef.Type.BOOLEAN, HAS_PREVIOUS_RECORD_DEFAULT, ConfigDef.Importance.HIGH, "If set to true, it will produce an additional record with previous snapshot details") + .define(CURRENT_RECORD_ID_VALUE_CONFIG, ConfigDef.Type.STRING, CURRENT_RECORD_ID_VALUE_DEFAULT, ConfigDef.Importance.HIGH, "Specifies the id value of the record"); + + private long rate; + private String recordIdField; + private String currentRecordIdValue; + private String recordSnapshotField; + private String 
cronExprValue; + private String tsidField; + private String dateField; + private boolean hasOngoingRecordDefault; + private boolean hasPreviousRecordDefault; + private String formatDateValue; + private String timezoneDateValue; + + @Override + public String version() { + return "1.0"; + } + + @Override + public void start(Map props) { + rate = (Long) CONFIG.parse(props).get(RATE); + recordIdField = (String) CONFIG.parse(props).get(RECORD_ID_FIELD_CONFIG); + recordSnapshotField = (String) CONFIG.parse(props).get(SNAPSHOT_FIELD_CONFIG); + cronExprValue = (String) CONFIG.parse(props).get(POLL_CRON_SCHEDULER_CONFIG); + tsidField = (String) CONFIG.parse(props).get(TSID_FIELD_CONFIG); + dateField = (String) CONFIG.parse(props).get(DATE_FIELD_CONFIG); + hasOngoingRecordDefault = (boolean) CONFIG.parse(props).get(HAS_ONGOING_RECORD_CONFIG); + hasPreviousRecordDefault = (boolean) CONFIG.parse(props).get(HAS_PREVIOUS_RECORD_CONFIG); + currentRecordIdValue = (String) CONFIG.parse(props).get(CURRENT_RECORD_ID_VALUE_CONFIG); + formatDateValue = (String) CONFIG.parse(props).get(DATE_FORMAT_CONFIG); + timezoneDateValue = (String) CONFIG.parse(props).get(DATE_TIMEZONE_CONFIG); + } + + @Override + public Class taskClass() { + return ClockSourceTask.class; + } + + @Override + public List> taskConfigs(int maxTasks) { + Map mapConfig = new HashMap<>(); + mapConfig.put(RATE, Long.toString(rate)); + mapConfig.put(RECORD_ID_FIELD_CONFIG, recordIdField); + mapConfig.put(CURRENT_RECORD_ID_VALUE_CONFIG, currentRecordIdValue); + mapConfig.put(SNAPSHOT_FIELD_CONFIG, recordSnapshotField); + mapConfig.put(POLL_CRON_SCHEDULER_CONFIG, cronExprValue); + mapConfig.put(TSID_FIELD_CONFIG, tsidField); + mapConfig.put(DATE_FIELD_CONFIG, dateField); + mapConfig.put(HAS_ONGOING_RECORD_CONFIG, Boolean.toString(hasOngoingRecordDefault)); + mapConfig.put(HAS_PREVIOUS_RECORD_CONFIG, Boolean.toString(hasPreviousRecordDefault)); + mapConfig.put(DATE_FORMAT_CONFIG, formatDateValue); + mapConfig.put(DATE_TIMEZONE_CONFIG, timezoneDateValue); + return Collections.singletonList(mapConfig); + } + + @Override + public void stop() { + } + + @Override + public ConfigDef config() { + + return CONFIG; + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceTask.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceTask.java new file mode 100644 index 000000000..9e33b5278 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceTask.java @@ -0,0 +1,250 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.connect.source.timed; + +import org.apache.kafka.common.utils.SystemTime; +import org.apache.kafka.common.utils.Time; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.source.SourceRecord; +import org.apache.kafka.connect.source.SourceTask; + +import java.text.SimpleDateFormat; +import java.util.*; + +import org.springframework.scheduling.support.CronSequenceGenerator; + + +/** + * {@link SourceTask} for {@link ClockSourceConnector} + * + * @author amarziali, jarnou + * + * The ClockSourceTask is a kafka connect service controller designed to + * generate a/(a set of) record(s) on a regular basis. + * It can be either on a rate (in milliseconds) or on a cron basis (cron expression). + * + * Note: If both rate and cron are specified in the configuration of the controller, + * the cron takes precedence over the rate. + * + * It is possible to add a field in the produced record containing a value + * (number of seconds since 1970) corresponding to the time at which the record has been + * produced and another field containing the date in a human-readable format in a + * specified timezone (CET being the default) + * + * By the way: + * It is also possible in addition to the standard record produced to generate 2 additional records. + * One is containing the data (snaphot, date) for the ongoing 'clock', and another one for the + * previous one. + * The use-case for these 2 additional records is the update of a table of snapshots for instance. + * The idea is to have a list of all snapshots, as well as the ongoing and the previous one. + * + */ +public class ClockSourceTask extends SourceTask { + + private Time time; + private long rate; + private String cronExpr; + private String recordIdField; + private String recordIdValue; + private String snapshotField; + private String tsidField; + private String dateField; + private CronSequenceGenerator cronSeqGen = null; + private boolean useCron = false; + private boolean useSnapshot = false; + private boolean useTSID = false; + private boolean useDate = false; + private boolean hasOngoingRecord = false; + private boolean hasPreviousRecord = false; + private long recordSnapshot = -1; // Uniquely identifies a poll/retrieval of the data from a src + private Schema finalSchema = null; + private long previousRecordSnapshot = -1; + private static long TSID_DEFAULT = -1; + private String dateFormat; + private String dateTimezone; + + + + @Override + public void start(Map props) { + this.time = new SystemTime(); + rate = Long.parseLong(props.get(ClockSourceConnector.RATE)); + cronExpr = props.get(ClockSourceConnector.POLL_CRON_SCHEDULER_CONFIG); + recordIdField = props.get(ClockSourceConnector.RECORD_ID_FIELD_CONFIG); + snapshotField = props.get(ClockSourceConnector.SNAPSHOT_FIELD_CONFIG); + tsidField = props.get(ClockSourceConnector.TSID_FIELD_CONFIG); + dateField = props.get(ClockSourceConnector.DATE_FIELD_CONFIG); + recordIdValue = props.get(ClockSourceConnector.CURRENT_RECORD_ID_VALUE_CONFIG); + dateFormat = props.get(ClockSourceConnector.DATE_FORMAT_CONFIG); + dateTimezone = props.get(ClockSourceConnector.DATE_TIMEZONE_CONFIG); + + // Check if cron should be used && Generate a cron object once for further use + if ((cronExpr != null) && (cronExpr.isEmpty() != true)) { + useCron = CronSequenceGenerator.isValidExpression(cronExpr); + } + + if (useCron) { + cronSeqGen = new 
CronSequenceGenerator(cronExpr); + } + useSnapshot = (snapshotField != null) ? true : false; + useTSID = (tsidField != null) ? true : false; + useDate = (dateField != null) ? true : false; + hasOngoingRecord = new Boolean(props.get(ClockSourceConnector.HAS_ONGOING_RECORD_CONFIG)); + hasPreviousRecord = new Boolean(props.get(ClockSourceConnector.HAS_PREVIOUS_RECORD_CONFIG)); + + // Build the schema if not created yet + if (finalSchema == null) { + SchemaBuilder newSchema = SchemaBuilder.struct(); + newSchema.field(recordIdField, Schema.STRING_SCHEMA); + if (useSnapshot) { + newSchema.field(snapshotField, Schema.INT64_SCHEMA); + } + if (useTSID) { + newSchema.field(tsidField, Schema.INT64_SCHEMA); + } + if (useDate) { + newSchema.field(dateField, Schema.STRING_SCHEMA); + } + finalSchema = newSchema.build(); + } + } + + @Override + public List poll() throws InterruptedException { + final long untilNext; + if (useCron) { + Date nextTriggerDate = cronSeqGen.next(new Date(time.milliseconds())); + long nextTriggerDateInMs = nextTriggerDate.getTime(); + untilNext = nextTriggerDateInMs - time.milliseconds(); + if (useSnapshot) { + recordSnapshot = nextTriggerDateInMs ; + } + time.sleep(untilNext); + } + else { + if (useSnapshot){ + recordSnapshot = (time.milliseconds()+rate) ; + } + Thread.sleep(rate); + } + + Struct recordVal = new Struct(finalSchema); + recordVal.put(recordIdField, recordIdValue); + if (useSnapshot) { + recordVal.put(snapshotField, recordSnapshot); + if (useDate){ + String jdate = secToString(recordSnapshot, dateFormat, dateTimezone); + recordVal.put(dateField, jdate); + } + } + if (useTSID) { + recordVal.put(tsidField, recordSnapshot); + } + + SourceRecord sr = new SourceRecord( + null, + null, + "", + finalSchema, + recordVal); + + if ( ! hasOngoingRecord && ! hasPreviousRecord ) { + return Collections.singletonList(sr); + } + else { + List listRecords = new LinkedList<>(); + listRecords.add(sr); + + if (useSnapshot) { + // Build ongoing record (if requested) + if (hasOngoingRecord){ + Struct orVal = new Struct(finalSchema); + orVal.put(recordIdField, "ongoing"); + if (useSnapshot) { + orVal.put(snapshotField, recordSnapshot); + if (useDate){ + String jdate = secToString(recordSnapshot, dateFormat, dateTimezone); + orVal.put(dateField, jdate); + } + } + if (useTSID) { + orVal.put(tsidField, TSID_DEFAULT); + } + + SourceRecord or = new SourceRecord( + null, + null, + "", + finalSchema, + orVal); + listRecords.add(or); + } + + // Build previous record (if requested) + if (hasPreviousRecord && previousRecordSnapshot > 0) { + Struct prVal = new Struct(finalSchema); + prVal.put(recordIdField, "previous"); + if (useSnapshot) { + prVal.put(snapshotField, previousRecordSnapshot); + if (useDate){ + String jdate = secToString(previousRecordSnapshot, dateFormat, dateTimezone); + prVal.put(dateField, jdate); + } + } + if (useTSID) { + prVal.put(tsidField, TSID_DEFAULT); + } + + SourceRecord pr = new SourceRecord(null, + null, + "", + finalSchema, + prVal); + listRecords.add(pr); + } + previousRecordSnapshot = recordSnapshot; + } + + return listRecords; + } + } + + @Override + public void stop() { + + } + + @Override + public String version() { + return "1.0"; + } + + /* + * Return the timeInSec in a Human Readable + * format (dateFormat) using the timezone given in parameter. 
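+     * The value is expected to be an epoch timestamp in milliseconds (it comes from Time.milliseconds());
+     * for instance secToString(1580810400000L, "yyyy-MM-dd HH:mm:ss", "CET") yields "2020-02-04 11:00:00".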
+     */
+    private String secToString(long timeInSec, String dateFormat, String timezone){
+        // build a Date from the given value; despite the parameter name, the value is an epoch
+        // timestamp expressed in milliseconds (as produced by Time.milliseconds())
+        Date date = new Date(timeInSec);
+        // format of the date
+        SimpleDateFormat jdf = new SimpleDateFormat(dateFormat);
+        jdf.setTimeZone(TimeZone.getTimeZone(timezone));
+        String jdate = jdf.format(date);
+        return jdate;
+    }
+}
diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/PipelineConfigurationBroadcastWrapper.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/PipelineConfigurationBroadcastWrapper.java
new file mode 100644
index 000000000..da7c39477
--- /dev/null
+++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/PipelineConfigurationBroadcastWrapper.java
@@ -0,0 +1,80 @@
+/**
+ * Copyright (C) 2016 Hurence (support@hurence.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.hurence.logisland.engine.spark.remote;
+
+import com.hurence.logisland.engine.EngineContext;
+import com.hurence.logisland.processor.ProcessContext;
+import com.hurence.logisland.stream.StreamContext;
+import org.apache.spark.SparkContext;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.broadcast.Broadcast;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Collection;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * A {@link Broadcast} wrapper for a Stream pipeline configuration.
+ * This class allows data modified on the Spark driver to be transparently synchronized to every executor.
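+ * <p>
+ * A possible interaction, assuming the caller holds the current {@code EngineContext} and
+ * {@code SparkContext} (illustrative sketch only):
+ * <pre>{@code
+ * // on the driver, after a dataflow update:
+ * PipelineConfigurationBroadcastWrapper.getInstance().refresh(engineContext, sparkContext);
+ * // on an executor, resolve the processor contexts of the stream being processed:
+ * Collection<ProcessContext> processors =
+ *         PipelineConfigurationBroadcastWrapper.getInstance().get(streamContext.getIdentifier());
+ * }</pre>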
+ * + * @author amarziali + */ +public class PipelineConfigurationBroadcastWrapper { + private static final Logger logger = LoggerFactory.getLogger(PipelineConfigurationBroadcastWrapper.class); + + private Broadcast>> broadcastedPipelineMap; + + private static PipelineConfigurationBroadcastWrapper obj = new PipelineConfigurationBroadcastWrapper(); + + private PipelineConfigurationBroadcastWrapper() { + } + + public static PipelineConfigurationBroadcastWrapper getInstance() { + return obj; + } + + public JavaSparkContext getSparkContext(SparkContext sc) { + JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc); + return jsc; + } + + public void refresh(Map> pipelineMap, SparkContext sparkContext) { + logger.info("Refreshing dataflow pipelines!"); + + if (broadcastedPipelineMap != null) { + broadcastedPipelineMap.unpersist(); + } + broadcastedPipelineMap = getSparkContext(sparkContext).broadcast(pipelineMap); + } + + public void refresh(EngineContext engineContext, SparkContext sparkContext) { + logger.info("Refreshing dataflow pipelines!"); + + if (broadcastedPipelineMap != null) { + broadcastedPipelineMap.unpersist(); + } + broadcastedPipelineMap = getSparkContext(sparkContext).broadcast(engineContext.getStreamContexts().stream() + .collect(Collectors.toMap(StreamContext::getIdentifier, s -> s.getProcessContexts().stream().collect(Collectors.toList())))); + + } + + + public Collection get(String streamName) { + return broadcastedPipelineMap.getValue().get(streamName); + } +} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiClient.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiClient.java new file mode 100644 index 000000000..49e3ca5e1 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiClient.java @@ -0,0 +1,247 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.engine.spark.remote; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; +import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; +import com.hurence.logisland.engine.spark.remote.model.DataFlow; +import okhttp3.*; +import okhttp3.internal.http.HttpDate; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.validation.ConstraintViolation; +import javax.validation.ConstraintViolationException; +import javax.validation.Validation; +import javax.validation.Validator; +import javax.ws.rs.core.HttpHeaders; +import javax.ws.rs.core.MediaType; +import java.time.Duration; +import java.time.Instant; +import java.util.Date; +import java.util.Iterator; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +/** + * Rest client wrapper for logisland remote APIs. + * + * @author amarziali + */ +public class RemoteApiClient { + + /** + * Conversation state. + */ + public static class State { + public Instant lastModified; + } + + /** + * Connection settings. + */ + public static class ConnectionSettings { + + private final String baseUrl; + private final Duration socketTimeout; + private final Duration connectTimeout; + private final String username; + private final String password; + + /** + * Constructs a new instance. + * If username and password are provided, the client will be configured to supply a basic authentication. + * + * @param baseUrl the base url + * @param socketTimeout the read/write socket timeout + * @param connectTimeout the connection socket timeout + * @param username the username if a basic authentication is needed. + * @param password the password if a basic authentication is needed. + */ + public ConnectionSettings(String baseUrl, Duration socketTimeout, Duration connectTimeout, String username, String password) { + this.baseUrl = baseUrl; + this.socketTimeout = socketTimeout; + this.connectTimeout = connectTimeout; + this.username = username; + this.password = password; + } + } + + private static final Logger logger = LoggerFactory.getLogger(RemoteApiClient.class); + + private static final String DATAFLOW_RESOURCE_URI = "dataflows"; + private static final String STREAM_RESOURCE_URI = "streams"; + + + private static final Validator validator = Validation.buildDefaultValidatorFactory().getValidator(); + + private final OkHttpClient client; + private final HttpUrl baseUrl; + private final ObjectMapper mapper; + + + /** + * Construct a new instance with provided connection settings. + * + * @param connectionSettings the {@link ConnectionSettings} + */ + public RemoteApiClient(ConnectionSettings connectionSettings) { + this.baseUrl = HttpUrl.parse(connectionSettings.baseUrl); + this.mapper = new ObjectMapper(); + mapper.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES); + mapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); + mapper.registerModule(new JavaTimeModule()) + .findAndRegisterModules(); + + OkHttpClient.Builder builder = new OkHttpClient() + .newBuilder() + .readTimeout(connectionSettings.socketTimeout.toMillis(), TimeUnit.MILLISECONDS) + .writeTimeout(connectionSettings.socketTimeout.toMillis(), TimeUnit.MILLISECONDS) + .connectTimeout(connectionSettings.connectTimeout.toMillis(), TimeUnit.MILLISECONDS) + .followRedirects(true) + .followSslRedirects(true); + //add basic auth if needed. 
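+        // (the interceptor below attaches a precomputed basic Authorization header to every outgoing request)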
+ if (connectionSettings.username != null && connectionSettings.password != null) { + builder.addInterceptor(chain -> { + Request originalRequest = chain.request(); + Request requestWithBasicAuth = originalRequest + .newBuilder() + .header(HttpHeaders.AUTHORIZATION, Credentials.basic(connectionSettings.username, connectionSettings.password)) + .build(); + return chain.proceed(requestWithBasicAuth); + }); + } + this.client = builder.build(); + } + + + /** + * Generic method to fetch and validate a HTTP resource. + * + * @param url the resource Url. + * @param state the conversation state. + * @param resourceClass the bean model class. + * @param the type of the model data to return. + * @return an {@link Optional} bean containing requested validated data. + */ + private Optional doFetch(HttpUrl url, State state, Class resourceClass) { + Request.Builder request = new Request.Builder() + .url(url).addHeader(HttpHeaders.ACCEPT, MediaType.APPLICATION_JSON); + + if (state.lastModified != null) { + request.addHeader(HttpHeaders.IF_MODIFIED_SINCE, HttpDate.format(new Date((state.lastModified.toEpochMilli())))); + } + + try (Response response = client.newCall(request.build()).execute()) { + if (response.code() != javax.ws.rs.core.Response.Status.NOT_MODIFIED.getStatusCode()) { + + if (!response.isSuccessful()) { + logger.error("Error refreshing {} from remote server. Got code {}", resourceClass.getCanonicalName(), response.code()); + } else { + String lm = response.header(HttpHeaders.LAST_MODIFIED); + if (lm != null) { + try { + Date tmp = HttpDate.parse(lm); + if (tmp != null) { + state.lastModified = tmp.toInstant(); + } + } catch (Exception e) { + logger.warn("Unable to correctly parse Last-Modified Header"); + } + } + T ret = mapper.readValue(response.body().byteStream(), resourceClass); + //validate against javax.validation annotations. + doValidate(ret); + return Optional.of(ret); + } + } + } catch (Exception e) { + logger.error("Unable to refresh dataflow from remote server", e); + } + + return Optional.empty(); + } + + /** + * Perform validation of the given bean. + * + * @param bean the instance to validate + * @see javax.validation.Validator#validate + */ + private void doValidate(Object bean) { + Set> result = validator.validate(bean); + if (!result.isEmpty()) { + StringBuilder sb = new StringBuilder("Bean validation failed: "); + for (Iterator> it = result.iterator(); it.hasNext(); ) { + ConstraintViolation violation = it.next(); + sb.append(violation.getPropertyPath()).append(" - ").append(violation.getMessage()); + if (it.hasNext()) { + sb.append("; "); + } + } + throw new ConstraintViolationException(sb.toString(), result); + } + } + + /** + * Fetches dataflow from a remote server. + * + * @param dataflowName the name of the dataflow to fetch. + * @param state the conversation state (never null) + * @return a optional {@link DataFlow} (never null). Empty in case of error or no results. + */ + public Optional fetchDataflow(String dataflowName, State state) { + return doFetch(baseUrl.newBuilder().addPathSegment(DATAFLOW_RESOURCE_URI).addPathSegment(dataflowName).build(), + state, DataFlow.class); + } + + /** + * Push a dataflow configuration to a remote server. + * We do not care about http result code since the call is fire and forget. + * + * @param dataflowName the name of the dataflow to push + * @param dataFlow the item to push. 
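+     *                     When {@code dataFlow} is {@code null}, an empty request body is posted
+     *                     (e.g. {@code client.pushDataFlow("my-dataflow", null)}).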
+ */ + public void pushDataFlow(String dataflowName, DataFlow dataFlow) { + try { + Request request = new Request.Builder() + .url(baseUrl.newBuilder() + .addPathSegment(DATAFLOW_RESOURCE_URI).addPathSegment(dataflowName) + .build()) + .addHeader(HttpHeaders.ACCEPT, MediaType.APPLICATION_JSON) + .post(dataFlow != null ? + RequestBody.create(okhttp3.MediaType.parse(MediaType.APPLICATION_JSON), + + mapper.writeValueAsString(dataFlow)) : + RequestBody.create(null, new byte[0])) + .build(); + try (Response response = client.newCall(request).execute()) { + if (!response.isSuccessful()) { + logger.warn("Expected application to answer with 200 OK. Got {}", response.code()); + } + } + + + } catch (Exception e) { + logger.warn("Unexpected exception trying to push latest dataflow configuration", e); + } + } + + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiComponentFactory.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiComponentFactory.java new file mode 100644 index 000000000..a9748cffc --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiComponentFactory.java @@ -0,0 +1,250 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.engine.spark.remote; + +import com.hurence.logisland.component.ComponentFactory; +import com.hurence.logisland.component.ConfigurableComponent; +import com.hurence.logisland.component.PropertyDescriptor; +import com.hurence.logisland.config.ControllerServiceConfiguration; +import com.hurence.logisland.controller.ControllerService; +import com.hurence.logisland.controller.ControllerServiceInitializationContext; +import com.hurence.logisland.controller.StandardControllerServiceContext; +import com.hurence.logisland.engine.EngineContext; +import com.hurence.logisland.engine.spark.remote.model.*; +import com.hurence.logisland.processor.ProcessContext; +import com.hurence.logisland.processor.StandardProcessContext; +import com.hurence.logisland.stream.RecordStream; +import com.hurence.logisland.stream.StandardStreamContext; +import com.hurence.logisland.stream.StreamContext; +import org.apache.spark.SparkContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; + +/** + * ] + * Component factory resolving logisland components from remote api model. 
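+ * <p>
+ * A possible driver-side polling loop, assuming the caller already holds a {@code RemoteApiClient}
+ * ({@code client} with its {@code state}), the current {@code sparkContext} and {@code engineContext},
+ * and the previously applied {@code lastDataflow} (illustrative sketch only):
+ * <pre>{@code
+ * Optional<DataFlow> fetched = client.fetchDataflow("my-dataflow", state);
+ * if (fetched.isPresent()) {
+ *     RemoteApiComponentFactory factory = new RemoteApiComponentFactory();
+ *     if (factory.updateEngineContext(sparkContext, engineContext, fetched.get(), lastDataflow)) {
+ *         lastDataflow = fetched.get();
+ *     }
+ * }
+ * }</pre>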
+ * + * @author amarziali + */ +public class RemoteApiComponentFactory { + + private static final Logger logger = LoggerFactory.getLogger(RemoteApiComponentFactory.class); + + + /** + * Instantiates a stream from of configuration + * + * @param stream + * @return + */ + public StreamContext getStreamContext(Stream stream) { + try { + final RecordStream recordStream = ComponentFactory.loadComponent(stream.getComponent()); + final StreamContext instance = + new StandardStreamContext(recordStream, stream.getName()); + + // instantiate each related processor + stream.getPipeline().getProcessors().stream() + .map(this::getProcessContext) + .forEach(instance::addProcessContext); + + + // set the config properties + configureComponent(recordStream, stream.getConfig()) + .forEach((k, s) -> instance.setProperty(k, s)); + if (!instance.isValid()) { + throw new IllegalArgumentException("Stream is not valid"); + } + + logger.info("created stream {}", stream.getName()); + return instance; + + } catch (ClassNotFoundException e) { + throw new RuntimeException("unable to instantiate stream " + stream.getName(), e); + } + } + + /** + * Constructs processors. + * + * @param processor the processor bean. + * @return optionally the constructed processor context or nothing in case of error. + */ + public ProcessContext getProcessContext(Processor processor) { + try { + final com.hurence.logisland.processor.Processor processorInstance = ComponentFactory.loadComponent(processor.getComponent()); + final ProcessContext processContext = + new StandardProcessContext(processorInstance, processor.getName()); + + // set all properties + configureComponent(processorInstance, processor.getConfig()) + .forEach((k, s) -> processContext.setProperty(k, s)); + ; + + if (!processContext.isValid()) { + throw new IllegalArgumentException("Processor is not valid"); + } + + + logger.info("created processor {}", processor); + return processContext; + } catch (ClassNotFoundException e) { + throw new RuntimeException("unable to instantiate processor " + processor.getName(), e); + } + + } + + + /** + * Constructs controller services. + * + * @param service the service bean. + * @return optionally the constructed service configuration or nothing in case of error. + */ + public ControllerServiceConfiguration getControllerServiceConfiguration(Service service) { + try { + ControllerService cs = ComponentFactory.loadComponent(service.getComponent()); + ControllerServiceConfiguration configuration = new ControllerServiceConfiguration(); + configuration.setControllerService(service.getName()); + configuration.setComponent(service.getComponent()); + configuration.setDocumentation(service.getDocumentation()); + configuration.setType("service"); + configuration.setConfiguration(configureComponent(cs, service.getConfig())); + ControllerServiceInitializationContext ic = new StandardControllerServiceContext(cs, service.getName()); + configuration.getConfiguration().forEach((k, s) -> ic.setProperty(k, s)); + if (!ic.isValid()) { + throw new IllegalArgumentException("Service is not valid"); + } + logger.info("created service {}", service.getName()); + return configuration; + } catch (Exception e) { + throw new RuntimeException("unable to instantiate service " + service.getName(), e); + } + + + } + + /** + * Updates the state of the engine if needed. + * + * @param sparkContext the spark context + * @param engineContext the engineContext + * @param dataflow the new dataflow (new state) + * @param oldDataflow latest dataflow dataflow. 
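+     * @return true if the new dataflow triggered a change (engine reset or pipeline update), false otherwise.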
+ */ + public boolean updateEngineContext(SparkContext sparkContext, EngineContext engineContext, DataFlow dataflow, DataFlow oldDataflow) { + boolean changed = false; + if (oldDataflow == null || oldDataflow.getLastModified().isBefore(dataflow.getLastModified())) { + logger.info("We have a new configuration. Resetting current engine"); + logger.info("Configuring dataflow. Last change at {} is {}", dataflow.getLastModified(), dataflow.getModificationReason()); + + + List css = dataflow.getServices().stream() + .map(this::getControllerServiceConfiguration) + .collect(Collectors.toList()); + + List sc = dataflow.getStreams().stream() + .map(this::getStreamContext) + .collect(Collectors.toList()); + + sc.forEach(streamContext -> { + if (!streamContext.isValid()) { + throw new IllegalArgumentException("Unable to validate steam " + streamContext.getIdentifier()); + } + }); + + logger.info("Restarting engine"); + engineContext.getEngine().reset(engineContext); + css.forEach(engineContext::addControllerServiceConfiguration); + sc.forEach(engineContext::addStreamContext); + + PipelineConfigurationBroadcastWrapper.getInstance().refresh( + engineContext.getStreamContexts().stream() + .collect(Collectors.toMap(StreamContext::getIdentifier, StreamContext::getProcessContexts)) + , sparkContext); + updatePipelines(sparkContext, engineContext, dataflow.getStreams()); + engineContext.getEngine().start(engineContext); + changed = true; + + } else { + //need to update pipelines? + + Map streamMap = dataflow.getStreams().stream().collect(Collectors.toMap(Stream::getName, Function.identity())); + + List mergedStreamList = new ArrayList<>(); + for (Stream oldStream : oldDataflow.getStreams()) { + Stream newStream = streamMap.get(oldStream.getName()); + if (newStream != null && oldStream.getPipeline().getLastModified().isBefore(newStream.getPipeline().getLastModified())) { + changed = true; + logger.info("Detected change for pipeline {}", newStream.getName()); + mergedStreamList.add(newStream); + } else { + mergedStreamList.add(oldStream); + } + } + if (changed) { + updatePipelines(sparkContext, engineContext, mergedStreamList); + } + + } + return changed; + } + + + /** + * Update pipelines. + * + * @param sparkContext the spark context + * @param engineContext the engine context. 
+ * @param streams the list of streams + */ + public void updatePipelines(SparkContext sparkContext, EngineContext engineContext, Collection streams) { + Map> pipelineMap = streams.stream() + .collect(Collectors.toMap(Stream::getName, + s -> s.getPipeline().getProcessors().stream().map(this::getProcessContext) + .collect(Collectors.toList()))); + engineContext.getStreamContexts().forEach(streamContext -> { + streamContext.getProcessContexts().clear(); + streamContext.getProcessContexts().addAll(pipelineMap.get(streamContext.getIdentifier())); + }); + + PipelineConfigurationBroadcastWrapper.getInstance().refresh(pipelineMap, sparkContext); + } + + private Map configureComponent(ConfigurableComponent component, Collection properties) { + final Map propertyMap = properties.stream().collect(Collectors.toMap(Property::getKey, Function.identity())); + return propertyMap.keySet().stream().map(component::getPropertyDescriptor) + .filter(propertyDescriptor -> propertyDescriptor != null) + .filter(propertyDescriptor -> propertyMap.containsKey(propertyDescriptor.getName()) || + (propertyDescriptor.getDefaultValue() != null && propertyDescriptor.isRequired())) + .collect(Collectors.toMap(PropertyDescriptor::getName, propertyDescriptor -> { + String value = propertyDescriptor.getDefaultValue(); + if (propertyMap.containsKey(propertyDescriptor.getName())) { + value = propertyMap.get(propertyDescriptor.getName()).getValue(); + } + return value; + })); + } + + +} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Component.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Component.java new file mode 100755 index 000000000..50ac7ad5a --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Component.java @@ -0,0 +1,186 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.engine.spark.remote.model; + +import com.fasterxml.jackson.annotation.JsonProperty; +import io.swagger.annotations.ApiModelProperty; + +import javax.validation.Valid; +import javax.validation.constraints.NotNull; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +/** + * Component + */ +@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") + +public class Component { + @JsonProperty("name") + private String name = null; + + @JsonProperty("component") + private String component = null; + + @JsonProperty("documentation") + private String documentation = null; + + @JsonProperty("config") + @Valid + private List config = new ArrayList<>(); + + public Component name(String name) { + this.name = name; + return this; + } + + /** + * Get name + * + * @return name + **/ + @ApiModelProperty(required = true, value = "") + @NotNull + + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public Component component(String component) { + this.component = component; + return this; + } + + /** + * Get component + * + * @return component + **/ + @ApiModelProperty(required = true, value = "") + @NotNull + + + public String getComponent() { + return component; + } + + public void setComponent(String component) { + this.component = component; + } + + public Component documentation(String documentation) { + this.documentation = documentation; + return this; + } + + /** + * Get documentation + * + * @return documentation + **/ + @ApiModelProperty(value = "") + + + public String getDocumentation() { + return documentation; + } + + public void setDocumentation(String documentation) { + this.documentation = documentation; + } + + public Component config(List config) { + this.config = config; + return this; + } + + public Component addConfigItem(Property configItem) { + if (this.config == null) { + this.config = new ArrayList(); + } + this.config.add(configItem); + return this; + } + + /** + * Get config + * + * @return config + **/ + @ApiModelProperty(value = "") + + @Valid + + public List getConfig() { + return config; + } + + public void setConfig(List config) { + this.config = config; + } + + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Component component = (Component) o; + return Objects.equals(this.name, component.name) && + Objects.equals(this.component, component.component) && + Objects.equals(this.documentation, component.documentation) && + Objects.equals(this.config, component.config); + } + + @Override + public int hashCode() { + return Objects.hash(name, component, documentation, config); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("class Component {\n"); + + sb.append(" name: ").append(toIndentedString(name)).append("\n"); + sb.append(" component: ").append(toIndentedString(component)).append("\n"); + sb.append(" documentation: ").append(toIndentedString(documentation)).append("\n"); + sb.append(" config: ").append(toIndentedString(config)).append("\n"); + sb.append("}"); + return sb.toString(); + } + + /** + * Convert the given object to string with each line indented by 4 spaces + * (except the first line). 
+ */ + private String toIndentedString(Object o) { + if (o == null) { + return "null"; + } + return o.toString().replace("\n", "\n "); + } +} + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/DataFlow.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/DataFlow.java new file mode 100755 index 000000000..d10fc7a76 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/DataFlow.java @@ -0,0 +1,144 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.engine.spark.remote.model; + +import com.fasterxml.jackson.annotation.JsonProperty; +import io.swagger.annotations.ApiModel; +import io.swagger.annotations.ApiModelProperty; + +import javax.validation.Valid; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +/** + * A streaming pipeline. + */ +@ApiModel(description = "A streaming pipeline.") +@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") + +public class DataFlow extends Versioned { + @JsonProperty("services") + @Valid + private List services = new ArrayList<>(); + + @JsonProperty("streams") + @Valid + private List streams = new ArrayList<>(); + + public DataFlow services(List services) { + this.services = services; + return this; + } + + public DataFlow addServicesItem(Service servicesItem) { + if (this.services == null) { + this.services = new ArrayList(); + } + this.services.add(servicesItem); + return this; + } + + /** + * The service controllers. + * + * @return services + **/ + @ApiModelProperty(value = "The service controllers.") + + @Valid + + public List getServices() { + return services; + } + + public void setServices(List services) { + this.services = services; + } + + public DataFlow streams(List streams) { + this.streams = streams; + return this; + } + + public DataFlow addStreamsItem(Stream streamsItem) { + if (this.streams == null) { + this.streams = new ArrayList(); + } + this.streams.add(streamsItem); + return this; + } + + /** + * The engine properties. 
+ * + * @return streams + **/ + @ApiModelProperty(value = "The engine properties.") + + @Valid + + public List getStreams() { + return streams; + } + + public void setStreams(List streams) { + this.streams = streams; + } + + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + DataFlow dataFlow = (DataFlow) o; + return Objects.equals(this.services, dataFlow.services) && + Objects.equals(this.streams, dataFlow.streams) && + super.equals(o); + } + + @Override + public int hashCode() { + return Objects.hash(services, streams, super.hashCode()); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("class DataFlow {\n"); + sb.append(" ").append(toIndentedString(super.toString())).append("\n"); + sb.append(" services: ").append(toIndentedString(services)).append("\n"); + sb.append(" streams: ").append(toIndentedString(streams)).append("\n"); + sb.append("}"); + return sb.toString(); + } + + /** + * Convert the given object to string with each line indented by 4 spaces + * (except the first line). + */ + private String toIndentedString(Object o) { + if (o == null) { + return "null"; + } + return o.toString().replace("\n", "\n "); + } +} + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Pipeline.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Pipeline.java new file mode 100755 index 000000000..2d08c33fb --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Pipeline.java @@ -0,0 +1,108 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.engine.spark.remote.model; + +import com.fasterxml.jackson.annotation.JsonProperty; +import io.swagger.annotations.ApiModel; +import io.swagger.annotations.ApiModelProperty; + +import javax.validation.Valid; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +/** + * Tracks stream processing pipeline configuration + */ +@ApiModel(description = "Tracks stream processing pipeline configuration") +@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") + +public class Pipeline extends Versioned { + @JsonProperty("processors") + @Valid + private List processors = new ArrayList<>(); + + public Pipeline processors(List processors) { + this.processors = processors; + return this; + } + + public Pipeline addProcessorsItem(Processor processorsItem) { + if (this.processors == null) { + this.processors = new ArrayList(); + } + this.processors.add(processorsItem); + return this; + } + + /** + * Get processors + * + * @return processors + **/ + @ApiModelProperty(value = "") + + @Valid + + public List getProcessors() { + return processors; + } + + public void setProcessors(List processors) { + this.processors = processors; + } + + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Pipeline pipeline = (Pipeline) o; + return Objects.equals(this.processors, pipeline.processors) && + super.equals(o); + } + + @Override + public int hashCode() { + return Objects.hash(processors, super.hashCode()); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("class Pipeline {\n"); + sb.append(" ").append(toIndentedString(super.toString())).append("\n"); + sb.append(" processors: ").append(toIndentedString(processors)).append("\n"); + sb.append("}"); + return sb.toString(); + } + + /** + * Convert the given object to string with each line indented by 4 spaces + * (except the first line). + */ + private String toIndentedString(Object o) { + if (o == null) { + return "null"; + } + return o.toString().replace("\n", "\n "); + } +} + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Processor.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Processor.java new file mode 100755 index 000000000..1350a44c5 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Processor.java @@ -0,0 +1,66 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.engine.spark.remote.model; + +import io.swagger.annotations.ApiModel; + +import java.util.Objects; + +/** + * A logisland 'processor'. + */ +@ApiModel(description = "A logisland 'processor'.") +@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") + +public class Processor extends Component { + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + return true; + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode()); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("class Processor {\n"); + sb.append(" ").append(toIndentedString(super.toString())).append("\n"); + sb.append("}"); + return sb.toString(); + } + + /** + * Convert the given object to string with each line indented by 4 spaces + * (except the first line). + */ + private String toIndentedString(Object o) { + if (o == null) { + return "null"; + } + return o.toString().replace("\n", "\n "); + } +} + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Property.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Property.java new file mode 100755 index 000000000..739cc1707 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Property.java @@ -0,0 +1,147 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.engine.spark.remote.model; + +import com.fasterxml.jackson.annotation.JsonProperty; +import io.swagger.annotations.ApiModelProperty; + +import javax.validation.constraints.NotNull; +import java.util.Objects; + +/** + * Property + */ +@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") + +public class Property { + @JsonProperty("key") + private String key = null; + + @JsonProperty("type") + private String type = "string"; + + @JsonProperty("value") + private String value = null; + + public Property key(String key) { + this.key = key; + return this; + } + + /** + * Get key + * + * @return key + **/ + @ApiModelProperty(required = true, value = "") + @NotNull + + + public String getKey() { + return key; + } + + public void setKey(String key) { + this.key = key; + } + + public Property type(String type) { + this.type = type; + return this; + } + + /** + * Get type + * + * @return type + **/ + @ApiModelProperty(value = "") + + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public Property value(String value) { + this.value = value; + return this; + } + + /** + * Get value + * + * @return value + **/ + @ApiModelProperty(required = true, value = "") + @NotNull + + + public String getValue() { + return value; + } + + public void setValue(String value) { + this.value = value; + } + + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Property property = (Property) o; + return Objects.equals(this.key, property.key) && + Objects.equals(this.type, property.type) && + Objects.equals(this.value, property.value); + } + + @Override + public int hashCode() { + return Objects.hash(key, type, value); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("class Property {\n"); + + sb.append(" key: ").append(toIndentedString(key)).append("\n"); + sb.append(" type: ").append(toIndentedString(type)).append("\n"); + sb.append(" value: ").append(toIndentedString(value)).append("\n"); + sb.append("}"); + return sb.toString(); + } + + /** + * Convert the given object to string with each line indented by 4 spaces + * (except the first line). + */ + private String toIndentedString(Object o) { + if (o == null) { + return "null"; + } + return o.toString().replace("\n", "\n "); + } +} + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Service.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Service.java new file mode 100755 index 000000000..a4834f747 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Service.java @@ -0,0 +1,66 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.engine.spark.remote.model; + +import io.swagger.annotations.ApiModel; + +import java.util.Objects; + +/** + * A logisland 'controller service'. + */ +@ApiModel(description = "A logisland 'controller service'.") +@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") + +public class Service extends Component { + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + return true; + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode()); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("class Service {\n"); + sb.append(" ").append(toIndentedString(super.toString())).append("\n"); + sb.append("}"); + return sb.toString(); + } + + /** + * Convert the given object to string with each line indented by 4 spaces + * (except the first line). + */ + private String toIndentedString(Object o) { + if (o == null) { + return "null"; + } + return o.toString().replace("\n", "\n "); + } +} + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Stream.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Stream.java new file mode 100755 index 000000000..8dc999e90 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Stream.java @@ -0,0 +1,94 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.engine.spark.remote.model; + +import com.fasterxml.jackson.annotation.JsonProperty; +import io.swagger.annotations.ApiModelProperty; + +import javax.validation.Valid; +import java.util.Objects; + +/** + * Stream + */ +@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") +public class Stream extends Component { + @JsonProperty("pipeline") + private Pipeline pipeline = null; + + public Stream pipeline(Pipeline pipeline) { + this.pipeline = pipeline; + return this; + } + + /** + * Get pipeline + * + * @return pipeline + **/ + @ApiModelProperty(value = "") + + @Valid + + public Pipeline getPipeline() { + return pipeline; + } + + public void setPipeline(Pipeline pipeline) { + this.pipeline = pipeline; + } + + + @Override + public boolean equals(java.lang.Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Stream stream = (Stream) o; + return Objects.equals(this.pipeline, stream.pipeline) && + super.equals(o); + } + + @Override + public int hashCode() { + return Objects.hash(pipeline, super.hashCode()); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("class Stream {\n"); + sb.append(" ").append(toIndentedString(super.toString())).append("\n"); + sb.append(" pipeline: ").append(toIndentedString(pipeline)).append("\n"); + sb.append("}"); + return sb.toString(); + } + + /** + * Convert the given object to string with each line indented by 4 spaces + * (except the first line). + */ + private String toIndentedString(java.lang.Object o) { + if (o == null) { + return "null"; + } + return o.toString().replace("\n", "\n "); + } +} + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Versioned.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Versioned.java new file mode 100755 index 000000000..fbd57fc65 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Versioned.java @@ -0,0 +1,125 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.engine.spark.remote.model; + +import com.fasterxml.jackson.annotation.JsonProperty; +import io.swagger.annotations.ApiModel; +import io.swagger.annotations.ApiModelProperty; + +import javax.validation.Valid; +import javax.validation.constraints.NotNull; +import java.time.OffsetDateTime; +import java.util.Objects; + +/** + * a versioned component + */ +@ApiModel(description = "a versioned component") +@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") + +public class Versioned { + @JsonProperty("lastModified") + private OffsetDateTime lastModified = null; + + @JsonProperty("modificationReason") + private String modificationReason = null; + + public Versioned lastModified(OffsetDateTime lastModified) { + this.lastModified = lastModified; + return this; + } + + /** + * the last modified timestamp of this pipeline (used to trigger changes). + * + * @return lastModified + **/ + @ApiModelProperty(required = true, value = "the last modified timestamp of this pipeline (used to trigger changes).") + @NotNull + + @Valid + + public OffsetDateTime getLastModified() { + return lastModified; + } + + public void setLastModified(OffsetDateTime lastModified) { + this.lastModified = lastModified; + } + + public Versioned modificationReason(String modificationReason) { + this.modificationReason = modificationReason; + return this; + } + + /** + * Can be used to document latest changeset. + * + * @return modificationReason + **/ + @ApiModelProperty(value = "Can be used to document latest changeset.") + + + public String getModificationReason() { + return modificationReason; + } + + public void setModificationReason(String modificationReason) { + this.modificationReason = modificationReason; + } + + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Versioned versioned = (Versioned) o; + return Objects.equals(this.lastModified, versioned.lastModified) && + Objects.equals(this.modificationReason, versioned.modificationReason); + } + + @Override + public int hashCode() { + return Objects.hash(lastModified, modificationReason); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("class Versioned {\n"); + + sb.append(" lastModified: ").append(toIndentedString(lastModified)).append("\n"); + sb.append(" modificationReason: ").append(toIndentedString(modificationReason)).append("\n"); + sb.append("}"); + return sb.toString(); + } + + /** + * Convert the given object to string with each line indented by 4 spaces + * (except the first line). 
+ */ + private String toIndentedString(Object o) { + if (o == null) { + return "null"; + } + return o.toString().replace("\n", "\n "); + } +} + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProcessorMetrics.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProcessorMetrics.java new file mode 100644 index 000000000..cfb5cddad --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProcessorMetrics.java @@ -0,0 +1,123 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.util.spark; + + +import com.hurence.logisland.metrics.Names; +import com.hurence.logisland.record.FieldDictionary; +import com.hurence.logisland.record.Record; +import org.apache.spark.groupon.metrics.UserMetricsSystem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + + +/** + * Created by tom on 09/09/16. 
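+ *
+ * Publishes processing gauges (incoming/outgoing record counts, error percentage, processed
+ * bytes/fields and throughput averages) to the Spark metrics system through
+ * {@link UserMetricsSystem}, under the given metric prefix.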
+ */ +public class ProcessorMetrics { + private static Logger logger = LoggerFactory.getLogger(ProcessorMetrics.class.getName()); + + public synchronized static void resetMetrics(final String metricPrefix) { + UserMetricsSystem.gauge(metricPrefix + Names.INCOMING_MESSAGES).set(0); + UserMetricsSystem.gauge(metricPrefix + Names.INCOMING_RECORDS).set(0); + UserMetricsSystem.gauge(metricPrefix + Names.OUTGOING_RECORDS).set(0); + UserMetricsSystem.gauge(metricPrefix + Names.ERRORS).set(0); + UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_FIELD_AVERAGE).set(0); + UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_RECORD_AVERAGE).set(0); + UserMetricsSystem.gauge(metricPrefix + Names.RECORDS_PER_SECOND_AVERAGE).set(0); + UserMetricsSystem.gauge(metricPrefix + Names.PROCESSED_BYTES).set(0); + UserMetricsSystem.gauge(metricPrefix + Names.PROCESSED_FIELDS).set(0); + UserMetricsSystem.gauge(metricPrefix + Names.ERROR_PERCENTAGE).set(0); + UserMetricsSystem.gauge(metricPrefix + Names.FIELDS_PER_RECORD_AVERAGE).set(0); + UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_SECOND_AVERAGE).set(0); + //UserMetricsSystem.gauge(metricPrefix + "processing_time_ms").set(0); + } + + + /** + * publish + * + * @param metricPrefix + * @param incomingEvents + * @param outgoingEvents + * @param fromOffset + * @param untilOffset + * @param processingDurationInMillis + */ + public synchronized static void computeMetrics( + final String metricPrefix, + final Collection incomingEvents, + final Collection outgoingEvents, + final long fromOffset, + final long untilOffset, + final long processingDurationInMillis) { + + + if ((outgoingEvents != null) && (outgoingEvents.size() != 0)) { + + UserMetricsSystem.gauge(metricPrefix + Names.INCOMING_MESSAGES).set(untilOffset - fromOffset); + UserMetricsSystem.gauge(metricPrefix + Names.INCOMING_RECORDS).set(incomingEvents.size()); + UserMetricsSystem.gauge(metricPrefix + Names.OUTGOING_RECORDS).set(outgoingEvents.size()); + + long errorCount = outgoingEvents.stream().filter(r -> r != null && r.hasField(FieldDictionary.RECORD_ERRORS)).count(); + UserMetricsSystem.gauge(metricPrefix + "errors").set(errorCount); + if (outgoingEvents.size() != 0) { + final List recordSizesInBytes = new ArrayList<>(); + final List recordNumberOfFields = new ArrayList<>(); + + outgoingEvents.forEach(record -> { + recordSizesInBytes.add(record.sizeInBytes()); + recordNumberOfFields.add(record.size()); + }); + + final int numberOfProcessedBytes = recordSizesInBytes.stream().mapToInt(Integer::intValue).sum(); + final int numberOfProcessedFields = recordNumberOfFields.stream().mapToInt(Integer::intValue).sum(); + + if (numberOfProcessedFields != 0) { + UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_FIELD_AVERAGE).set(numberOfProcessedBytes / numberOfProcessedFields); + } else { + UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_FIELD_AVERAGE).set(0); + } + if (processingDurationInMillis != 0) { + UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_SECOND_AVERAGE).set(numberOfProcessedBytes * 1000 / processingDurationInMillis); + UserMetricsSystem.gauge(metricPrefix + Names.RECORDS_PER_SECOND_AVERAGE).set(outgoingEvents.size() * 1000 / processingDurationInMillis); + } else { + UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_SECOND_AVERAGE).set(0); + UserMetricsSystem.gauge(metricPrefix + Names.RECORDS_PER_SECOND_AVERAGE).set(0); + } + + + UserMetricsSystem.gauge(metricPrefix + Names.PROCESSED_BYTES).set(numberOfProcessedBytes); + UserMetricsSystem.gauge(metricPrefix + 
Names.PROCESSED_FIELDS).set(numberOfProcessedFields); + + UserMetricsSystem.gauge(metricPrefix + Names.ERROR_PERCENTAGE).set((long) (100.0f * errorCount / outgoingEvents.size())); + UserMetricsSystem.gauge(metricPrefix + Names.FIELDS_PER_RECORD_AVERAGE).set(numberOfProcessedFields / outgoingEvents.size()); + UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_RECORD_AVERAGE).set(numberOfProcessedBytes / outgoingEvents.size()); + } else if (errorCount > 0) + UserMetricsSystem.gauge(metricPrefix + Names.ERROR_PERCENTAGE).set(100L); + else + UserMetricsSystem.gauge(metricPrefix + Names.ERROR_PERCENTAGE).set(0L); + + + // UserMetricsSystem.gauge(metricPrefix + "processing_time_ms").set(processingDurationInMillis); + + } + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProtoBufRegistrator.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProtoBufRegistrator.java new file mode 100644 index 000000000..62e0159a2 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProtoBufRegistrator.java @@ -0,0 +1,30 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.util.spark; + +import com.esotericsoftware.kryo.Kryo; +import com.esotericsoftware.kryo.serializers.FieldSerializer; +import org.apache.spark.serializer.KryoRegistrator; +import org.eclipse.kura.core.message.protobuf.KuraPayloadProto; + + +public class ProtoBufRegistrator implements KryoRegistrator { + @Override + public void registerClasses(Kryo kryo) { + kryo.register(KuraPayloadProto.KuraPayload.class, new FieldSerializer(kryo, KuraPayloadProto.KuraPayload.class)); + } +} + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkConfigReader.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkConfigReader.java new file mode 100644 index 000000000..1f059e7e6 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkConfigReader.java @@ -0,0 +1,59 @@ +package com.hurence.logisland.util.spark; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; +import com.hurence.logisland.config.LogislandConfiguration; +import com.hurence.logisland.util.string.StringUtils; +import org.apache.spark.SparkContext; +import org.apache.spark.rdd.RDD; + +import java.util.Arrays; + +import static com.hurence.logisland.config.ConfigReader.checkLogislandConf; + +/** + * This configuration reader depends on spark. We do not want to place methods in this class in the + * com.hurence.logisland.config.ConfigReader class where the loadConfig (from local filesystem) method + * resides, as it would introduce a spark dependency in the logisland-framework module. Only the spark + * engine should have a spark dependency. So this class should be loaded from the StreamProcessingRunner + * and this will succeed only in environments where a spark 2 engine is available and used, otherwise it + * will fail to load. This will for instance be successful in the databricks environment, which is by the + * way the first purpose for which this class is being introduced. + */ +public class SparkConfigReader { + + /** + * Loads a YAML config file using (file located in the shared filesystem) + * + * @param configFilePath the path of the config file + * @return a LogislandSessionConfiguration + * @throws Exception + */ + public static LogislandConfiguration loadConfigFromSharedFS(String configFilePath) throws Exception { + ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); + + /** + * In Databricks, developers should utilize the shared SparkContext instead of creating one using the constructor. + * When running a job, you can access the shared context by calling SparkContext.getOrCreate(). 
+ * + * Also in databricks, a path like /path/to/a/file will be loaded from DBFS so will be interpreted like + * dbfs:/path/to/a/file + */ + + SparkContext sparkContext = SparkContext.getOrCreate(); + + RDD configRdd = sparkContext.textFile(configFilePath, 1); + String[] configStringArray = (String[])configRdd.collect(); + String configString = String.join("\n", Arrays.asList(configStringArray)); + + // replace all host from environment variables + String fileContent = StringUtils.resolveEnvVars(configString, "localhost"); + + System.out.println("Configuration:\n" + fileContent); + + LogislandConfiguration logislandConf = mapper.readValue(fileContent, LogislandConfiguration.class); + checkLogislandConf(logislandConf); + + return logislandConf; + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkPlatform.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkPlatform.java new file mode 100644 index 000000000..32124abb3 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkPlatform.java @@ -0,0 +1,27 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.util.spark; + +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; + +public interface SparkPlatform { + Dataset createStreamingDataFrame(SQLContext sqlContext, RDD catalystRows, StructType schema); +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/resources/META-INF/services/org.apache.spark.metrics.sink.KafkaSink b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/resources/META-INF/services/org.apache.spark.metrics.sink.KafkaSink new file mode 100644 index 000000000..ecb6d54f3 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/resources/META-INF/services/org.apache.spark.metrics.sink.KafkaSink @@ -0,0 +1 @@ +org.apache.spark.metrics.sink.KafkaSink \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/KafkaStreamProcessingEngine.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/KafkaStreamProcessingEngine.scala new file mode 100644 index 000000000..c7ee30d8c --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/KafkaStreamProcessingEngine.scala @@ -0,0 +1,659 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package com.hurence.logisland.engine.spark
+
+
+import java.util
+import java.util.concurrent.Executors
+import java.util.regex.Pattern
+import java.util.{Collections, UUID}
+
+import com.hurence.logisland.component.{AllowableValue, ComponentContext, PropertyDescriptor}
+import com.hurence.logisland.engine.spark.remote.PipelineConfigurationBroadcastWrapper
+import com.hurence.logisland.engine.{AbstractProcessingEngine, EngineContext}
+import com.hurence.logisland.stream.spark.{AbstractKafkaRecordStream, SparkRecordStream}
+import com.hurence.logisland.validator.StandardValidators
+import org.apache.spark.groupon.metrics.UserMetricsSystem
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.streaming.StreamingQueryListener
+import org.apache.spark.streaming.{Milliseconds, StreamingContext}
+import org.apache.spark.{SparkConf, SparkContext, SparkEnv}
+import org.slf4j.LoggerFactory
+
+import scala.collection.JavaConversions._
+
+
+object KafkaStreamProcessingEngine {
+
+
+    val SPARK_PROPERTIES_FILE_PATH: PropertyDescriptor = new PropertyDescriptor.Builder() // Not used in code but in the logisland.sh script, so it must be present!
+        .name("spark.properties.file.path")
+        .description("Used to pass the --properties-file option when submitting the Spark job")
+        .required(false)
+        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+        .build
+
+    val SPARK_MONITORING_DRIVER_PORT: PropertyDescriptor = new PropertyDescriptor.Builder() // Not used in code but in the logisland.sh script, so it must be present!
+        .name("spark.monitoring.driver.port")
+        .description("The port for exposing monitoring metrics")
+        .required(false)
+        .addValidator(StandardValidators.POSITIVE_LONG_VALIDATOR)
+        .build
+
+    val SPARK_MASTER = new PropertyDescriptor.Builder()
+        .name("spark.master")
+        .description("The URL of the Spark master")
+        .required(true)
+        // The regex allows "local[K]" with K as an integer, "local[*]", "yarn", "spark://HOST[:PORT]" (optionally a comma-separated list of hosts),
+        // "mesos://..." (either zk:// or host based) and "k8s://..."
+        .addValidator(StandardValidators.createRegexMatchingValidator(Pattern.compile(
+            "^(yarn|" +
+                "local(\\[([0-9]+|\\*)(,[0-9]+)?\\])?|" +
+                "spark:\\/\\/[a-z0-9\\.\\-]+(:[0-9]+)?(,[a-z0-9\\.\\-]+(:[0-9]+)?)*|" +
+                "mesos:\\/\\/((zk:\\/\\/[a-z0-9\\.\\-]+:[0-9]+(,[a-z0-9\\.\\-]+:[0-9]+)*\\/mesos)|(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+|[a-z][a-z0-9\\.\\-]+)(:[0-9]+)?))|" +
+                "k8s://.+)$")))
+        .defaultValue("local[2]")
+        .build
+
+    val SPARK_APP_NAME = new PropertyDescriptor.Builder()
+        .name("spark.app.name")
+        .description("The application name")
+        .required(true)
+        .addValidator(StandardValidators.createRegexMatchingValidator(Pattern.compile("^[a-zA-z0-9-_\\.]+$")))
+        .defaultValue("logisland")
+        .build
+
+    val SPARK_STREAMING_BATCH_DURATION = new PropertyDescriptor.Builder()
+        .name("spark.streaming.batchDuration")
+        .description("The Spark Streaming batch duration, in milliseconds")
+        .required(true)
+        .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR)
+        .defaultValue("2000")
+        .build
+
+    val SPARK_YARN_DEPLOYMODE = new PropertyDescriptor.Builder()
+        .name("spark.yarn.deploy-mode")
+        .description("The YARN deploy mode")
+        .required(false)
+        // .allowableValues("client", "cluster")
+        .build
+
+    val SPARK_YARN_QUEUE = new PropertyDescriptor.Builder()
+        .name("spark.yarn.queue")
+        .description("The name of the YARN queue")
+        .required(false)
+        // .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+        .defaultValue("default")
+        .build
+
+    val memorySizePattern = Pattern.compile("^[0-9]+[mMgG]$");
+    val SPARK_DRIVER_MEMORY = new PropertyDescriptor.Builder()
+        .name("spark.driver.memory")
+        .description("The memory size for the Spark driver")
+        .required(false)
+        .addValidator(StandardValidators.createRegexMatchingValidator(memorySizePattern))
+        .defaultValue("512m")
+        .build
+
+    val SPARK_EXECUTOR_MEMORY = new PropertyDescriptor.Builder()
+        .name("spark.executor.memory")
+        .description("The memory size for Spark executors")
+        .required(false)
+        .addValidator(StandardValidators.createRegexMatchingValidator(memorySizePattern))
+        .defaultValue("1g")
+        .build
+
+    val SPARK_DRIVER_CORES = new PropertyDescriptor.Builder()
+        .name("spark.driver.cores")
+        .description("The number of cores for the Spark driver")
+        .required(false)
+        .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR)
+        .defaultValue("4")
+        .build
+
+    val SPARK_EXECUTOR_CORES = new PropertyDescriptor.Builder()
+        .name("spark.executor.cores")
+        .description("The number of cores for each Spark executor")
+        .required(false)
+        .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR)
+        .defaultValue("1")
+        .build
+
+    val SPARK_EXECUTOR_INSTANCES = new PropertyDescriptor.Builder()
+        .name("spark.executor.instances")
+        .description("The number of executor instances for the Spark app")
+        .required(false)
+        .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR)
+        .build
+
+    val SPARK_SERIALIZER = new PropertyDescriptor.Builder()
+        .name("spark.serializer")
+        .description("Class to use for serializing objects that will be sent over the network " +
+            "or need to be cached in serialized form")
+        .required(false)
+        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+        .defaultValue("org.apache.spark.serializer.KryoSerializer")
+        .build
+
+    val SPARK_STREAMING_BLOCK_INTERVAL = new PropertyDescriptor.Builder()
+        .name("spark.streaming.blockInterval")
+        .description("Interval at which data received by Spark Streaming receivers is chunked into blocks " +
+            "of data before storing them in Spark. Minimum recommended - 50 ms")
+        .required(false)
+        .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR)
+        .defaultValue("350")
+        .build
+
+    val SPARK_STREAMING_KAFKA_MAX_RATE_PER_PARTITION = new PropertyDescriptor.Builder()
+        .name("spark.streaming.kafka.maxRatePerPartition")
+        .description("Maximum rate (number of records per second) at which data will be read from each Kafka partition")
+        .required(false)
+        .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR)
+        .defaultValue("5000")
+        .build
+
+    val SPARK_STREAMING_BACKPRESSURE_ENABLED = new PropertyDescriptor.Builder()
+        .name("spark.streaming.backpressure.enabled")
+        .description("This enables Spark Streaming to control the receiving rate based on " +
+            "the current batch scheduling delays and processing times so that the system " +
+            "receives only as fast as the system can process.")
+        .required(false)
+        .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
+        .defaultValue("false")
+        .build
+
+    val SPARK_STREAMING_UNPERSIST = new PropertyDescriptor.Builder()
+        .name("spark.streaming.unpersist")
+        .description("Force RDDs generated and persisted by Spark Streaming to be automatically unpersisted " +
+            "from Spark's memory. The raw input data received by Spark Streaming is also automatically cleared." +
+            " Setting this to false will allow the raw data and persisted RDDs to be accessible outside " +
+            "the streaming application as they will not be cleared automatically. " +
+            "But it comes at the cost of higher memory usage in Spark.")
+        .required(false)
+        .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
+        .defaultValue("false")
+        .build
+
+    val SPARK_UI_PORT = new PropertyDescriptor.Builder()
+        .name("spark.ui.port")
+        .description("The port of the Spark web UI")
+        .required(false)
+        .addValidator(StandardValidators.PORT_VALIDATOR)
+        .defaultValue("4050")
+        .build
+
+    val SPARK_STREAMING_TIMEOUT = new PropertyDescriptor.Builder()
+        .name("spark.streaming.timeout")
+        .description("The streaming timeout in milliseconds (-1 means no timeout)")
+        .required(false)
+        .addValidator(StandardValidators.INTEGER_VALIDATOR)
+        .defaultValue("-1")
+        .build
+
+    val SPARK_STREAMING_KAFKA_MAXRETRIES = new PropertyDescriptor.Builder()
+        .name("spark.streaming.kafka.maxRetries")
+        .description("Maximum number of consecutive retries the driver will make to find the latest offsets of each Kafka partition")
+        .required(false)
+        .addValidator(StandardValidators.INTEGER_VALIDATOR)
+        .defaultValue("3")
+        .build
+
+    val SPARK_STREAMING_UI_RETAINED_BATCHES = new PropertyDescriptor.Builder()
+        .name("spark.streaming.ui.retainedBatches")
+        .description("How many batches the Spark Streaming UI and status APIs remember before garbage collecting.")
+        .required(false)
+        .addValidator(StandardValidators.INTEGER_VALIDATOR)
+        .defaultValue("200")
+        .build
+
+    val SPARK_STREAMING_RECEIVER_WAL_ENABLE = new PropertyDescriptor.Builder()
+        .name("spark.streaming.receiver.writeAheadLog.enable")
+        .description("Enable write ahead logs for receivers. " +
+            "All the input data received through receivers will be saved to write ahead logs " +
+            "that will allow it to be recovered after driver failures.")
+        .required(false)
+        .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
+        .defaultValue("false")
+        .build
+
+
+    val SPARK_YARN_MAX_APP_ATTEMPTS = new PropertyDescriptor.Builder()
+        .name("spark.yarn.maxAppAttempts")
+        .description("Because the Spark driver and Application Master share a single JVM," +
+            " any error in the Spark driver stops our long-running job. " +
+            "Fortunately it is possible to configure the maximum number of attempts " +
+            "that will be made to re-run the application. " +
+            "It is reasonable to set a higher value than the default 2 " +
+            "(derived from the YARN cluster property yarn.resourcemanager.am.max-attempts). " +
+            "4 works quite well; a higher value may cause unnecessary restarts" +
+            " even if the reason of the failure is permanent.")
+        .required(false)
+        .addValidator(StandardValidators.INTEGER_VALIDATOR)
+        .defaultValue("4")
+        .build
+
+
+    val SPARK_YARN_AM_ATTEMPT_FAILURES_VALIDITY_INTERVAL = new PropertyDescriptor.Builder()
+        .name("spark.yarn.am.attemptFailuresValidityInterval")
+        .description("If the application runs for days or weeks without restart " +
+            "or redeployment on a highly utilized cluster, " +
+            "4 attempts could be exhausted in a few hours. " +
+            "To avoid this situation, the attempt counter should be reset every hour or so.")
+        .required(false)
+        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+        .defaultValue("1h")
+        .build
+
+    val SPARK_YARN_MAX_EXECUTOR_FAILURES = new PropertyDescriptor.Builder()
+        .name("spark.yarn.max.executor.failures")
+        .description("The maximum number of executor failures before the application fails. " +
+            "By default it is max(2 * num executors, 3), " +
+            "which is well suited for batch jobs but not for long-running jobs. " +
+            "The property comes with a corresponding validity interval which should also be set. " +
+            "A reasonable value is 8 * num_executors.")
+        .required(false)
+        .addValidator(StandardValidators.INTEGER_VALIDATOR)
+        .defaultValue("20")
+        .build
+
+
+    val SPARK_YARN_EXECUTOR_FAILURES_VALIDITY_INTERVAL = new PropertyDescriptor.Builder()
+        .name("spark.yarn.executor.failuresValidityInterval")
+        .description("If the application runs for days or weeks without restart " +
+            "or redeployment on a highly utilized cluster, " +
+            "x attempts could be exhausted in a few hours. " +
+            "To avoid this situation, the attempt counter should be reset every hour or so.")
+        .required(false)
+        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
+        .defaultValue("1h")
+        .build
+
+    val SPARK_TASK_MAX_FAILURES = new PropertyDescriptor.Builder()
+        .name("spark.task.maxFailures")
+        .description("For long-running jobs you could also consider boosting the maximum" +
+            " number of task failures before giving up on the job. " +
+            "By default tasks will be retried 4 times and then the job fails.")
+        .required(false)
+        .addValidator(StandardValidators.INTEGER_VALIDATOR)
+        .defaultValue("8")
+        .build
+
+    val SPARK_MEMORY_STORAGE_FRACTION = new PropertyDescriptor.Builder()
+        .name("spark.memory.storageFraction")
+        .description("Expresses the size of R as a fraction of M (default 0.5). " +
+            "R is the storage space within M where cached blocks are immune to being evicted by execution.")
+        .required(false)
+        .addValidator(StandardValidators.FLOAT_VALIDATOR)
+        .defaultValue("0.5")
+        .build
+
+    val SPARK_MEMORY_FRACTION = new PropertyDescriptor.Builder()
+        .name("spark.memory.fraction")
+        .description("Expresses the size of M as a fraction of the (JVM heap space - 300MB) (default 0.75). " +
+            "The rest of the space (25%) is reserved for user data structures, internal metadata in Spark, " +
+            "and safeguarding against OOM errors in the case of sparse and unusually large records.")
+        .required(false)
+        .addValidator(StandardValidators.FLOAT_VALIDATOR)
+        .defaultValue("0.6")
+        .build
+
+    val FAIR = new AllowableValue("FAIR", "FAIR", "fair sharing")
+    val FIFO = new AllowableValue("FIFO", "FIFO", "queueing jobs one after another")
+
+    val SPARK_SCHEDULER_MODE = new PropertyDescriptor.Builder()
+        .name("spark.scheduler.mode")
+        .description("The scheduling mode between jobs submitted to the same SparkContext. " +
+            "Can be set to FAIR to use fair sharing instead of queueing jobs one after another. 
" + + "Useful for multi-user services.") + .required(false) + .allowableValues(FAIR, FIFO) + .defaultValue(FAIR.getValue) + .build + + val JAVA_MESOS_LIBRARY_PATH = new PropertyDescriptor.Builder() + .name("java.library.path") + .description("The java library path to use with mesos.") + .required(false) + .build + + val SPARK_MESOS_CORE_MAX = new PropertyDescriptor.Builder() + .name("spark.cores.max") + .description("The maximum number of total executor core with mesos.") + .required(false) + .build + +} + + +class KafkaStreamProcessingEngine extends AbstractProcessingEngine { + + private val logger = LoggerFactory.getLogger(classOf[KafkaStreamProcessingEngine]) + private val conf = new SparkConf() + private var running = false + protected var batchDurationMs: Int = 1000 + + + /** + * Provides subclasses the ability to perform initialization logic + */ + override def init(context: ComponentContext): Unit = { + super.init(context) + val engineContext = context.asInstanceOf[EngineContext] + val sparkMaster = engineContext.getPropertyValue(KafkaStreamProcessingEngine.SPARK_MASTER).asString + val appName = engineContext.getPropertyValue(KafkaStreamProcessingEngine.SPARK_APP_NAME).asString + batchDurationMs = engineContext.getPropertyValue(KafkaStreamProcessingEngine.SPARK_STREAMING_BATCH_DURATION).asInteger().intValue() + + /** + * job configuration + */ + + + conf.setAppName(appName) + conf.setMaster(sparkMaster) + + def setConfProperty(conf: SparkConf, engineContext: EngineContext, propertyDescriptor: PropertyDescriptor) = { + + // Need to check if the properties are set because those properties are not "requires" + if (engineContext.getPropertyValue(propertyDescriptor).isSet) { + conf.set(propertyDescriptor.getName, engineContext.getPropertyValue(propertyDescriptor).asString) + } + } + + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_UI_RETAINED_BATCHES) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_RECEIVER_WAL_ENABLE) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_KAFKA_MAXRETRIES) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_UI_PORT) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_UNPERSIST) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_BACKPRESSURE_ENABLED) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_BLOCK_INTERVAL) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_KAFKA_MAX_RATE_PER_PARTITION) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_SERIALIZER) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_DRIVER_MEMORY) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_EXECUTOR_MEMORY) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_DRIVER_CORES) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_EXECUTOR_CORES) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_EXECUTOR_INSTANCES) + + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_YARN_MAX_APP_ATTEMPTS) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_YARN_AM_ATTEMPT_FAILURES_VALIDITY_INTERVAL) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_YARN_MAX_EXECUTOR_FAILURES) + setConfProperty(conf, engineContext, 
KafkaStreamProcessingEngine.SPARK_YARN_EXECUTOR_FAILURES_VALIDITY_INTERVAL) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_TASK_MAX_FAILURES) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_MEMORY_FRACTION) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_MEMORY_STORAGE_FRACTION) + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_SCHEDULER_MODE) + + conf.set("spark.kryo.registrator", "com.hurence.logisland.util.spark.ProtoBufRegistrator") + + if (sparkMaster startsWith "yarn") { + // Note that SPARK_YARN_DEPLOYMODE is not used by spark itself but only by spark-submit CLI + // That's why we do not need to propagate it here + setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_YARN_QUEUE) + } + + @transient val sparkContext = getCurrentSparkContext() + + UserMetricsSystem.initialize(sparkContext, "LogislandMetrics") + + + + + /** + * shutdown context gracefully + */ + sys.ShutdownHookThread { + logger.info("Gracefully stopping Spark Streaming Application") + shutdown(engineContext) + logger.info("Application stopped") + } + + + PipelineConfigurationBroadcastWrapper.getInstance().refresh(engineContext, sparkContext) + + + SQLContext.getOrCreate(getCurrentSparkContext()).streams.addListener(new StreamingQueryListener { + + val runMap = scala.collection.mutable.Map[UUID, String]() + val executor = Executors.newSingleThreadExecutor() + //force early initialization of this pool + executor.submit(new Runnable { + override def run(): Unit = {} + }) + + override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = { + logger.info(s"Streaming query for stream ${event.name} has been started") + runMap.put(event.id, event.name) + } + + override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = { + } + + override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = { + if (event.exception.isDefined && !getCurrentSparkContext().isStopped) { + val currentStreamId = runMap.get(event.id) + logger.warn(s"Streaming query for stream $currentStreamId terminated with exception ${event.exception}. 
" + + s"The engine will be reset") + + executor.submit(new Runnable { + override def run(): Unit = { + Thread.sleep(1000); + engineContext.getEngine.reset(engineContext) + } + }) + } + } + }) + + running = true + + logger.info(s"spark context initialized with master:$sparkMaster, " + + s"appName:$appName, " + + s"batchDuration:$batchDurationMs ") + logger.info(s"conf : ${conf.toDebugString}") + } + + override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { + val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] + descriptors.add(KafkaStreamProcessingEngine.SPARK_APP_NAME) + descriptors.add(KafkaStreamProcessingEngine.SPARK_MASTER) + descriptors.add(KafkaStreamProcessingEngine.SPARK_MONITORING_DRIVER_PORT) + descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_DEPLOYMODE) + descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_QUEUE) + descriptors.add(KafkaStreamProcessingEngine.SPARK_DRIVER_MEMORY) + descriptors.add(KafkaStreamProcessingEngine.SPARK_EXECUTOR_MEMORY) + descriptors.add(KafkaStreamProcessingEngine.SPARK_DRIVER_CORES) + descriptors.add(KafkaStreamProcessingEngine.SPARK_EXECUTOR_CORES) + descriptors.add(KafkaStreamProcessingEngine.SPARK_EXECUTOR_INSTANCES) + descriptors.add(KafkaStreamProcessingEngine.SPARK_SERIALIZER) + descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_BLOCK_INTERVAL) + descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_KAFKA_MAX_RATE_PER_PARTITION) + descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_BATCH_DURATION) + descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_BACKPRESSURE_ENABLED) + descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_UNPERSIST) + descriptors.add(KafkaStreamProcessingEngine.SPARK_UI_PORT) + descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_TIMEOUT) + descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_KAFKA_MAXRETRIES) + descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_UI_RETAINED_BATCHES) + descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_RECEIVER_WAL_ENABLE) + descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_MAX_APP_ATTEMPTS) + descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_AM_ATTEMPT_FAILURES_VALIDITY_INTERVAL) + descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_MAX_EXECUTOR_FAILURES) + descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_EXECUTOR_FAILURES_VALIDITY_INTERVAL) + descriptors.add(KafkaStreamProcessingEngine.SPARK_TASK_MAX_FAILURES) + descriptors.add(KafkaStreamProcessingEngine.SPARK_MEMORY_FRACTION) + descriptors.add(KafkaStreamProcessingEngine.SPARK_MEMORY_STORAGE_FRACTION) + descriptors.add(KafkaStreamProcessingEngine.SPARK_SCHEDULER_MODE) + descriptors.add(KafkaStreamProcessingEngine.SPARK_PROPERTIES_FILE_PATH) + descriptors.add(KafkaStreamProcessingEngine.JAVA_MESOS_LIBRARY_PATH) + descriptors.add(KafkaStreamProcessingEngine.SPARK_MESOS_CORE_MAX) + + Collections.unmodifiableList(descriptors) + } + + + /** + * start the engine + * + * @param engineContext + */ + override def start(engineContext: EngineContext) = { + logger.info("starting Spark Engine") + val streamingContext = createStreamingContext(engineContext) + if (!engineContext.getStreamContexts.map(p => p.getStream).filter(p => p.isInstanceOf[AbstractKafkaRecordStream]).isEmpty) { + streamingContext.start() + } + + } + + protected def getCurrentSparkStreamingContext(sparkContext: SparkContext): StreamingContext = { + return StreamingContext.getActiveOrCreate(() => + return new 
StreamingContext(sparkContext, Milliseconds(batchDurationMs)) + ) + } + + protected def getCurrentSparkContext(): SparkContext = { + return SparkContext.getOrCreate(conf) + } + + + def createStreamingContext(engineContext: EngineContext): StreamingContext = { + + + @transient val sc = getCurrentSparkContext() + @transient val ssc = getCurrentSparkStreamingContext(sc) + val appName = sc.appName; + + + /** + * loop over processContext + */ + engineContext.getStreamContexts.foreach(streamingContext => { + try { + val kafkaStream = streamingContext.getStream.asInstanceOf[SparkRecordStream] + + kafkaStream.setup(appName, ssc, streamingContext, engineContext) + kafkaStream.start() + } catch { + case ex: Exception => + throw new IllegalStateException("something bad happened, please check Kafka or cluster health", ex) + } + + }) + ssc + } + + + override def shutdown(engineContext: EngineContext) = { + if (running) { + running = false + logger.info(s"shutting down Spark engine") + stop(engineContext, true) + } + } + + def stop(engineContext: EngineContext, doStopSparkContext: Boolean) = { + synchronized { + val sc = getCurrentSparkContext(); + if (!sc.isStopped) { + + engineContext.getStreamContexts.foreach(streamingContext => { + try { + val kafkaStream = streamingContext.getStream.asInstanceOf[SparkRecordStream] + kafkaStream.stop() + } catch { + case ex: Exception => + logger.error("something bad happened, please check Kafka or cluster health : {}", ex.getMessage) + } + }) + + try { + if (!sc.isStopped) { + val ssc = getCurrentSparkStreamingContext(sc); + ssc.stop(stopSparkContext = false, stopGracefully = true) + } + + } finally { + if (doStopSparkContext && !sc.isStopped) { + try { + sc.stop(); + } catch { + case ex: Exception => + logger.error("something bad while stopping the spark context. Please check cluster health : {}", ex.getMessage) + } + } + } + + } + } + } + + override def onPropertyModified(descriptor: PropertyDescriptor, oldValue: String, newValue: String) = { + logger.info(s"property ${ + descriptor.getName + } value changed from $oldValue to $newValue") + } + + /** + * Await for termination. + * + */ + override def awaitTermination(engineContext: EngineContext): Unit = { + var timeout = engineContext.getPropertyValue(KafkaStreamProcessingEngine.SPARK_STREAMING_TIMEOUT) + .asInteger().toInt + val sc = getCurrentSparkContext() + + while (!sc.isStopped) { + try { + if (timeout < 0) { + Thread.sleep(200) + } else { + val toSleep = Math.min(200, timeout); + Thread.sleep(toSleep) + timeout -= toSleep + } + } catch { + case e: InterruptedException => return + case unknown: Throwable => throw unknown + } + } + } + + + /** + * Reset the engine by stopping the streaming context. 
+ */ + override def reset(engineContext: EngineContext): Unit = { + shutdown(engineContext) + } + + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/RemoteApiStreamProcessingEngine.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/RemoteApiStreamProcessingEngine.scala new file mode 100644 index 000000000..f6ef5ce60 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/RemoteApiStreamProcessingEngine.scala @@ -0,0 +1,198 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.engine.spark + +import java.time.Duration +import java.util +import java.util.Collections +import java.util.concurrent.{Executors, TimeUnit} + +import com.hurence.logisland.component.PropertyDescriptor +import com.hurence.logisland.engine.EngineContext +import com.hurence.logisland.engine.spark.remote.model.DataFlow +import com.hurence.logisland.engine.spark.remote.{RemoteApiClient, RemoteApiComponentFactory} +import com.hurence.logisland.stream.StandardStreamContext +import com.hurence.logisland.stream.spark.DummyRecordStream +import com.hurence.logisland.validator.StandardValidators +import org.apache.spark.streaming.dstream.DStream +import org.slf4j.LoggerFactory + +object RemoteApiStreamProcessingEngine { + val REMOTE_API_BASE_URL = new PropertyDescriptor.Builder() + .name("remote.api.baseUrl") + .description("The base URL of the remote server providing logisland configuration") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build + + val REMOTE_API_POLLING_RATE = new PropertyDescriptor.Builder() + .name("remote.api.polling.rate") + .description("Remote api polling rate in milliseconds") + .required(true) + .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) + .build + + val REMOTE_API_CONFIG_PUSH_RATE = new PropertyDescriptor.Builder() + .name("remote.api.push.rate") + .description("Remote api configuration push rate in milliseconds") + .required(true) + .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) + .build + + val REMOTE_API_CONNECT_TIMEOUT = new PropertyDescriptor.Builder() + .name("remote.api.timeouts.connect") + .description("Remote api connection timeout in milliseconds") + .required(false) + .defaultValue("10000") + .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) + .build + + val REMOTE_API_SOCKET_TIMEOUT = new PropertyDescriptor.Builder() + .name("remote.api.timeouts.socket") + .description("Remote api default read/write socket timeout in milliseconds") + .required(false) + .defaultValue("10000") + 
.addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) + .build + + val REMOTE_API_USER = new PropertyDescriptor.Builder() + .name("remote.api.auth.user") + .description("The basic authentication user for the remote api endpoint.") + .required(false) + .build + + val REMOTE_API_PASSWORD = new PropertyDescriptor.Builder() + .name("remote.api.auth.password") + .description("The basic authentication password for the remote api endpoint.") + .required(false) + .build +} + +class RemoteApiStreamProcessingEngine extends KafkaStreamProcessingEngine { + + private val logger = LoggerFactory.getLogger(classOf[RemoteApiStreamProcessingEngine]) + private var initialized = false + + + override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { + val ret = new util.ArrayList(super.getSupportedPropertyDescriptors) + ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_BASE_URL) + ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_POLLING_RATE) + ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_CONFIG_PUSH_RATE) + ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_CONNECT_TIMEOUT) + ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_USER) + ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_PASSWORD) + ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_SOCKET_TIMEOUT) + return Collections.unmodifiableList(ret) + } + + + /** + * start the engine + * + * @param engineContext + */ + override def start(engineContext: EngineContext): Unit = { + // engineContext.addStreamContext(new StandardStreamContext(new DummyRecordStream(), "busybox")) + + if (!initialized) { + initialized = true + val remoteApiClient = new RemoteApiClient(new RemoteApiClient.ConnectionSettings( + engineContext.getProperty(RemoteApiStreamProcessingEngine.REMOTE_API_BASE_URL), + Duration.ofMillis(engineContext.getPropertyValue(RemoteApiStreamProcessingEngine.REMOTE_API_SOCKET_TIMEOUT).asLong()), + Duration.ofMillis(engineContext.getPropertyValue(RemoteApiStreamProcessingEngine.REMOTE_API_CONNECT_TIMEOUT).asLong()), + engineContext.getProperty(RemoteApiStreamProcessingEngine.REMOTE_API_USER), + engineContext.getProperty(RemoteApiStreamProcessingEngine.REMOTE_API_PASSWORD))) + + + val appName = getCurrentSparkContext().appName + var currentDataflow: DataFlow = null + + //schedule dataflow refresh + @transient lazy val executor = Executors.newSingleThreadScheduledExecutor(); + @transient lazy val remoteApiComponentFactory = new RemoteApiComponentFactory + + + executor.scheduleWithFixedDelay(new Runnable { + val state = new RemoteApiClient.State + + override def run(): Unit = { + var changed = false + try { + val dataflow = remoteApiClient.fetchDataflow(appName, state) + if (dataflow.isPresent) { + changed = true + if (remoteApiComponentFactory.updateEngineContext(getCurrentSparkContext(), engineContext, dataflow.get, currentDataflow)) { + currentDataflow = dataflow.get() + } + } + } catch { + case default: Throwable => { + currentDataflow = null + logger.warn("Unexpected exception while trying to poll for new dataflow configuration", default) + reset(engineContext) + } + } finally { + if (changed) { + try { + remoteApiClient.pushDataFlow(appName, currentDataflow); + } catch { + case default: Throwable => logger.warn("Unexpected exception while trying to push configuration to remote server", default) + } + } + } + } + }, 0, engineContext.getProperty(RemoteApiStreamProcessingEngine.REMOTE_API_POLLING_RATE).toInt, TimeUnit.MILLISECONDS + ) + + executor.scheduleWithFixedDelay(new Runnable { + + override def run(): 
Unit = { + try { + remoteApiClient.pushDataFlow(appName, currentDataflow) + } catch { + case default: Throwable => logger.warn("Unexpected exception while trying to push configuration to remote server", default) + } + } + }, 0, engineContext.getProperty(RemoteApiStreamProcessingEngine.REMOTE_API_CONFIG_PUSH_RATE).toInt, TimeUnit.MILLISECONDS + ) + + + } + + + super.start(engineContext) + } + + + override def shutdown(engineContext: EngineContext): Unit = { + super.shutdown(engineContext) + } + + /** + * Reset the engine by stopping the streaming context. + */ + override def reset(engineContext: EngineContext): Unit = { + logger.info(s"Resetting engine ${ + engineContext.getIdentifier + }") + super.stop(engineContext, false) + engineContext.getStreamContexts.clear() + engineContext.getControllerServiceConfigurations.clear() + } + + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/AbstractKafkaRecordStream.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/AbstractKafkaRecordStream.scala new file mode 100644 index 000000000..8f2da59a7 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/AbstractKafkaRecordStream.scala @@ -0,0 +1,344 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package com.hurence.logisland.stream.spark
+
+import java.io.ByteArrayInputStream
+import java.util
+import java.util.Collections
+
+import com.hurence.logisland.component.PropertyDescriptor
+import com.hurence.logisland.engine.EngineContext
+import com.hurence.logisland.engine.spark.remote.PipelineConfigurationBroadcastWrapper
+import com.hurence.logisland.record.Record
+import com.hurence.logisland.serializer._
+import com.hurence.logisland.stream.StreamProperties._
+import com.hurence.logisland.stream.{AbstractRecordStream, StreamContext}
+import com.hurence.logisland.util.kafka.KafkaSink
+import com.hurence.logisland.util.spark._
+import kafka.zk.AdminZkClient
+import kafka.zk.KafkaZkClient
+import kafka.zookeeper.ZooKeeperClient
+import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, OffsetAndMetadata, OffsetCommitCallback}
+import org.apache.kafka.clients.producer.ProducerConfig
+import org.apache.kafka.common.TopicPartition
+import org.apache.kafka.common.security.JaasUtils
+import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySerializer}
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.groupon.metrics.UserMetricsSystem
+import org.apache.spark.rdd.RDD
+//import org.apache.spark.streaming.kafka.KafkaUtils
+import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
+import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
+import org.apache.spark.streaming.kafka010.{CanCommitOffsets, KafkaUtils, OffsetRange}
+import org.apache.spark.streaming.{Seconds, StreamingContext}
+import org.slf4j.LoggerFactory
+
+import scala.collection.JavaConversions._
+
+
+abstract class AbstractKafkaRecordStream extends AbstractRecordStream with SparkRecordStream {
+
+
+    val NONE_TOPIC: String = "none"
+    private val logger = LoggerFactory.getLogger(this.getClass)
+    protected var kafkaSink: Broadcast[KafkaSink] = null
+    protected var appName: String = ""
+    @transient protected var ssc: StreamingContext = null
+    protected var streamContext: StreamContext = null
+    protected var engineContext: EngineContext = null
+    protected var controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink] = null
+    protected var needMetricsReset = false
+
+    override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = {
+        val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor]
+        descriptors.add(ERROR_TOPICS)
+        descriptors.add(INPUT_TOPICS)
+        descriptors.add(OUTPUT_TOPICS)
+        descriptors.add(AVRO_INPUT_SCHEMA)
+        descriptors.add(AVRO_OUTPUT_SCHEMA)
+        descriptors.add(INPUT_SERIALIZER)
+        descriptors.add(OUTPUT_SERIALIZER)
+        descriptors.add(ERROR_SERIALIZER)
+        descriptors.add(KAFKA_TOPIC_AUTOCREATE)
+        descriptors.add(KAFKA_TOPIC_DEFAULT_PARTITIONS)
+        descriptors.add(KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR)
+        descriptors.add(KAFKA_METADATA_BROKER_LIST)
+        descriptors.add(KAFKA_ZOOKEEPER_QUORUM)
+        descriptors.add(KAFKA_MANUAL_OFFSET_RESET)
+        descriptors.add(KAFKA_BATCH_SIZE)
+        descriptors.add(KAFKA_LINGER_MS)
+        descriptors.add(KAFKA_ACKS)
+        descriptors.add(WINDOW_DURATION)
+        descriptors.add(SLIDE_DURATION)
+        Collections.unmodifiableList(descriptors)
+    }
+
+
+    override def setup(appName: String, ssc: StreamingContext, streamContext: StreamContext, engineContext: EngineContext) = {
+        this.appName = appName
+        this.ssc = ssc
+        this.streamContext = streamContext
+        this.engineContext = engineContext
+
+    }
+
+    override def getStreamContext(): StreamingContext = this.ssc
+
+    
override def start() = { + if (ssc == null) + throw new IllegalStateException("stream not initialized") + + try { + + // Define the Kafka parameters, broker list must be specified + val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString.split(",").toSet + val outputTopics = streamContext.getPropertyValue(OUTPUT_TOPICS).asString.split(",").toSet + val errorTopics = streamContext.getPropertyValue(ERROR_TOPICS).asString.split(",").toSet + val metricsTopics = DEFAULT_METRICS_TOPIC.getValue.split(",").toSet + + val topicAutocreate = streamContext.getPropertyValue(KAFKA_TOPIC_AUTOCREATE).asBoolean().booleanValue() + val topicDefaultPartitions = streamContext.getPropertyValue(KAFKA_TOPIC_DEFAULT_PARTITIONS).asInteger().intValue() + val topicDefaultReplicationFactor = streamContext.getPropertyValue(KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR).asInteger().intValue() + val brokerList = streamContext.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString + val zkQuorum = streamContext.getPropertyValue(KAFKA_ZOOKEEPER_QUORUM).asString + + val kafkaBatchSize = streamContext.getPropertyValue(KAFKA_BATCH_SIZE).asString + val kafkaLingerMs = streamContext.getPropertyValue(KAFKA_LINGER_MS).asString + val kafkaAcks = streamContext.getPropertyValue(KAFKA_ACKS).asString + val kafkaOffset = streamContext.getPropertyValue(KAFKA_MANUAL_OFFSET_RESET).asString + + + val kafkaSinkParams = Map( + ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokerList, + ProducerConfig.CLIENT_ID_CONFIG -> appName, + ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getCanonicalName, + ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, + ProducerConfig.ACKS_CONFIG -> kafkaAcks, + ProducerConfig.RETRIES_CONFIG -> "3", + ProducerConfig.LINGER_MS_CONFIG -> kafkaLingerMs, + ProducerConfig.BATCH_SIZE_CONFIG -> kafkaBatchSize, + ProducerConfig.RETRY_BACKOFF_MS_CONFIG -> "1000", + ProducerConfig.RECONNECT_BACKOFF_MS_CONFIG -> "1000") + + kafkaSink = ssc.sparkContext.broadcast(KafkaSink(kafkaSinkParams)) + controllerServiceLookupSink = ssc.sparkContext.broadcast( + ControllerServiceLookupSink(engineContext.getControllerServiceConfigurations) + ) + + // TODO deprecate topic creation here (must be done through the agent) +// if (topicAutocreate) { +//// val zkUtils = ZkUtils.apply(zkQuorum, 10000, 10000, JaasUtils.isZkSecurityEnabled) +//// createTopicsIfNeeded(zkUtils, inputTopics, topicDefaultPartitions, topicDefaultReplicationFactor) +//// createTopicsIfNeeded(zkUtils, outputTopics, topicDefaultPartitions, topicDefaultReplicationFactor) +//// createTopicsIfNeeded(zkUtils, errorTopics, topicDefaultPartitions, topicDefaultReplicationFactor) +//// createTopicsIfNeeded(zkUtils, metricsTopics, 1, 1) +// zkQuorum +// val zooKeeperClient : ZooKeeperClient = new ZooKeeperClient(zkQuorum, +// 1000, +// 1000: Int, +// 10: Int, +// , +// metricGroup: String, +// metricType: String) ) +// val kafkaZkClient : KafkaZkClient = new KafkaZkClient +// val adminZkClient : AdminZkClient = new AdminZkClient[kafkaZkClient] +// } + + + val kafkaParams = Map[String, Object]( + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokerList, + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer], + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer], + ConsumerConfig.GROUP_ID_CONFIG -> appName, + ConsumerConfig.RECONNECT_BACKOFF_MS_CONFIG -> "50", + ConsumerConfig.RETRY_BACKOFF_MS_CONFIG -> "100", + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> 
kafkaOffset, + ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false", + ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG -> "30000" + /*, + ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG -> "5000"*/ + ) + + + logger.info(s"starting Kafka direct stream on topics $inputTopics from $kafkaOffset offsets") + @transient val kafkaStream = KafkaUtils.createDirectStream[Array[Byte], Array[Byte]]( + ssc, + PreferConsistent, + Subscribe[Array[Byte], Array[Byte]](inputTopics, kafkaParams) + ) + + // do the parallel processing + + val stream = if (streamContext.getPropertyValue(WINDOW_DURATION).isSet) { + if (streamContext.getPropertyValue(SLIDE_DURATION).isSet) + kafkaStream.window( + Seconds(streamContext.getPropertyValue(WINDOW_DURATION).asLong()), + Seconds(streamContext.getPropertyValue(SLIDE_DURATION).asLong()) + ) + else + kafkaStream.window(Seconds(streamContext.getPropertyValue(WINDOW_DURATION).asLong())) + + } else kafkaStream + + + stream + .foreachRDD(rdd => { + + this.streamContext.getProcessContexts().clear(); + this.streamContext.getProcessContexts().addAll( + PipelineConfigurationBroadcastWrapper.getInstance().get(this.streamContext.getIdentifier)) + + if (!rdd.isEmpty()) { + + + val offsetRanges = process(rdd) + // some time later, after outputs have completed + if (offsetRanges.nonEmpty) { + // kafkaStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges.get) + + + kafkaStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges.get, new OffsetCommitCallback() { + def onComplete(m: java.util.Map[TopicPartition, OffsetAndMetadata], e: Exception) { + if (null != e) { + logger.error("error commiting offsets", e) + } + } + }) + + + needMetricsReset = true + } + else if (needMetricsReset) { + try { + + for (partitionId <- 0 to rdd.getNumPartitions) { + val pipelineMetricPrefix = streamContext.getIdentifier + "." + + "partition" + partitionId + "." 
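+                                // No offset ranges were produced for this batch: touch the pipeline and per-processor
+                                // timers (recording a near-zero duration) and zero the per-processor gauges via
+                                // ProcessorMetrics.resetMetrics(); the needMetricsReset flag ensures this runs only
+                                // once between processed batches.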
+ val pipelineTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + "Pipeline.processing_time_ms").time() + + streamContext.getProcessContexts.foreach(processorContext => { + UserMetricsSystem.timer(pipelineMetricPrefix + processorContext.getIdentifier + ".processing_time_ms") + .time() + .stop() + + ProcessorMetrics.resetMetrics(pipelineMetricPrefix + processorContext.getIdentifier + ".") + }) + pipelineTimerContext.stop() + } + } catch { + case ex: Throwable => + logger.error(s"exception : ${ex.toString}") + None + } finally { + needMetricsReset = false + } + } + } + + }) + } catch { + case ex: Throwable => + ex.printStackTrace() + logger.error("something bad happened, please check Kafka or Zookeeper health : {}", ex) + } + } + + + /** + * to be overriden by subclasses + * + * @param rdd + */ + def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] + + + /** + * build a serializer + * + * @param inSerializerClass the serializer type + * @param schemaContent an Avro schema + * @return the serializer + */ + def getSerializer(inSerializerClass: String, schemaContent: String): RecordSerializer = { + SerializerProvider.getSerializer(inSerializerClass, schemaContent) + } + + /** + * + * @param partition + * @param serializer + * @return + */ + def deserializeRecords(partition: Iterator[ConsumerRecord[Array[Byte], Array[Byte]]], serializer: RecordSerializer): List[Record] = { + partition.flatMap(rawEvent => { + try { + val bais = new ByteArrayInputStream(rawEvent.value()) + val deserialized = serializer.deserialize(bais) + bais.close() + + Some(deserialized) + } catch { + case t: Throwable => + logger.error(s"exception while deserializing events ${t.getMessage}") + None + } + }).toList + } + + +// /** +// * Topic creation +// * +// * @param zkUtils +// * @param topics +// * @param topicDefaultPartitions +// * @param topicDefaultReplicationFactor +// */ +// def createTopicsIfNeeded(zkUtils: ZkUtils, +// topics: Set[String], +// topicDefaultPartitions: Int, +// topicDefaultReplicationFactor: Int): Unit = { +// +// topics.foreach(topic => { +// +// if (!topic.equals(NONE_TOPIC) && !AdminUtils.topicExists(zkUtils, topic)) { +// AdminUtils.createTopic(zkUtils, topic, topicDefaultPartitions, topicDefaultReplicationFactor) +// Thread.sleep(1000) +// logger.info(s"created topic $topic with" + +// s" $topicDefaultPartitions partitions and" + +// s" $topicDefaultReplicationFactor replicas") +// } +// }) +// } +} + + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/DummyRecordStream.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/DummyRecordStream.scala new file mode 100644 index 000000000..fec79e1cd --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/DummyRecordStream.scala @@ -0,0 +1,68 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.stream.spark + +import java.util + +import com.hurence.logisland.component.PropertyDescriptor +import com.hurence.logisland.engine.EngineContext +import com.hurence.logisland.stream.{AbstractRecordStream, StreamContext} +import com.hurence.logisland.util.spark.SparkUtils +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.streaming.receiver.Receiver + +class DummyRecordStream extends AbstractRecordStream with SparkRecordStream { + + @transient private var streamingContext: StreamingContext = _ + + /** + * Allows subclasses to register which property descriptor objects are + * supported. + * + * @return PropertyDescriptor objects this processor currently supports + */ + override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { + return new util.ArrayList[PropertyDescriptor]() + } + + override def start(): Unit = { + val stream = streamingContext.receiverStream(new Receiver[Long](StorageLevel.NONE) { + override def onStart(): Unit = {} + + override def onStop(): Unit = {} + }) + stream.foreachRDD(rdd => { + //do nothing :) + }) + stream.start() + + } + + /** + * setup the stream with spark app properties + * + * @param appName + * @param ssc + * @param streamContext + */ + override def setup(appName: String, ssc: StreamingContext, streamContext: StreamContext, engineContext: EngineContext): Unit = { + streamingContext = ssc + + } + + override def getStreamContext(): StreamingContext = streamingContext +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamDebugger.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamDebugger.scala new file mode 100644 index 000000000..3af102811 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamDebugger.scala @@ -0,0 +1,191 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (bailet.thomas@gmail.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.stream.spark + +import java.util +import java.util.Collections + +import com.hurence.logisland.record.{FieldDictionary, Record, RecordUtils} +import com.hurence.logisland.util.record.RecordSchemaUtil +import com.hurence.logisland.util.spark.ProcessorMetrics +import org.apache.avro.Schema +import org.apache.kafka.clients.consumer.ConsumerRecord +import org.apache.spark.TaskContext +import org.apache.spark.rdd.RDD +import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange} +import org.slf4j.LoggerFactory + +import scala.collection.JavaConversions._ +import com.hurence.logisland.stream.StreamProperties._ + +class KafkaRecordStreamDebugger extends AbstractKafkaRecordStream { + val logger = LoggerFactory.getLogger(this.getClass.getName) + + + /** + * launch the chain of processing for each partition of the RDD in parallel + * + * @param rdd + */ + override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { + if (!rdd.isEmpty()) { + // Cast the rdd to an interface that lets us get an array of OffsetRange + val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges + + val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString + val outputTopics = streamContext.getPropertyValue(OUTPUT_TOPICS).asString + val errorTopics = streamContext.getPropertyValue(ERROR_TOPICS).asString + val brokerList = streamContext.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString + + + rdd.foreachPartition(partition => { + if (partition.nonEmpty) { + /** + * index to get the correct offset range for the rdd partition we're working on + * This is safe because we haven't shuffled or otherwise disrupted partitioning, + * and the original input rdd partitions were 1:1 with kafka partitions + */ + val partitionId = TaskContext.get.partitionId() + val offsetRange = offsetRanges(TaskContext.get.partitionId) + + /** + * create serializers + */ + val deserializer = getSerializer( + streamContext.getPropertyValue(INPUT_SERIALIZER).asString, + streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) + val serializer = getSerializer( + streamContext.getPropertyValue(OUTPUT_SERIALIZER).asString, + streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) + val errorSerializer = getSerializer( + streamContext.getPropertyValue(ERROR_SERIALIZER).asString, + streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) + + /** + * process events by chaining output records + */ + var firstPass = true + var incomingEvents: util.Collection[Record] = Collections.emptyList() + var outgoingEvents: util.Collection[Record] = Collections.emptyList() + val processingMetrics: util.Collection[Record] = new util.ArrayList[Record]() + logger.info("start processing") + + streamContext.getProcessContexts.foreach(processorContext => { + val startTime = System.currentTimeMillis() + val processor = processorContext.getProcessor + + + if (firstPass) { + /** + * convert incoming Kafka messages into Records + * if there's no serializer we assume that we need to compute a Record from K/V + */ + incomingEvents = if ( + 
streamContext.getPropertyValue(INPUT_SERIALIZER).asString + == NO_SERIALIZER.getValue) { + // parser + partition.map(rawMessage => { + val key = if (rawMessage.key() != null) new String(rawMessage.key()) else "" + val value = if (rawMessage.value() != null) new String(rawMessage.value()) else "" + RecordUtils.getKeyValueRecord(key, value) + }).toList + } else { + // processor + deserializeRecords(partition, deserializer) + } + + firstPass = false + } else { + incomingEvents = outgoingEvents + } + + /** + * process incoming events + */ + outgoingEvents = processor.process(processorContext, incomingEvents) + + + }) + + + /** + * Do we make records compliant with a given Avro schema ? + */ + if (streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).isSet) { + try { + val strSchema = streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString() + val schema = RecordSchemaUtil.compileSchema(strSchema) + + + outgoingEvents = outgoingEvents.map(record => RecordSchemaUtil.convertToValidRecord(record, schema)) + } catch { + case t: Throwable => + logger.warn("something wrong while converting records " + + "to valid accordingly to provide Avro schema " + t.getMessage) + } + + } + + + logger.info("sending to kafka") + + /** + * push outgoing events and errors to Kafka + */ + kafkaSink.value.produce( + streamContext.getPropertyValue(OUTPUT_TOPICS).asString, + outgoingEvents.toList, + serializer + ) + + kafkaSink.value.produce( + streamContext.getPropertyValue(ERROR_TOPICS).asString, + outgoingEvents.filter(r => r.hasField(FieldDictionary.RECORD_ERRORS)).toList, + errorSerializer + ) + + logger.info("saving offsets") + + /** + * save latest offset to Zookeeper + */ + // zkSink.value.saveOffsetRangesToZookeeper(appName, offsetRange) + logger.info("processed " + outgoingEvents.size() + " messages") + } + }) + + return Some(offsetRanges) + } + None + } +} + + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamHDFSBurner.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamHDFSBurner.scala new file mode 100644 index 000000000..3cf4fdc3a --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamHDFSBurner.scala @@ -0,0 +1,229 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (bailet.thomas@gmail.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.stream.spark + +import java.text.SimpleDateFormat +import java.util +import java.util.Collections + +import com.hurence.logisland.component.PropertyDescriptor +import com.hurence.logisland.record.{FieldDictionary, FieldType} +import com.hurence.logisland.stream.StreamProperties._ +import com.hurence.logisland.util.spark.SparkUtils +import org.apache.kafka.clients.consumer.ConsumerRecord +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession} +import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange} +import org.slf4j.LoggerFactory + + +class KafkaRecordStreamHDFSBurner extends AbstractKafkaRecordStream { + + + private val logger = LoggerFactory.getLogger(classOf[KafkaRecordStreamHDFSBurner]) + + + override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { + val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] + + descriptors.addAll(super.getSupportedPropertyDescriptors()) + + descriptors.add(OUTPUT_FOLDER_PATH) + descriptors.add(OUTPUT_FORMAT) + descriptors.add(RECORD_TYPE) + descriptors.add(NUM_PARTITIONS) + descriptors.add(EXCLUDE_ERRORS) + descriptors.add(DATE_FORMAT) + descriptors.add(INPUT_FORMAT) + Collections.unmodifiableList(descriptors) + } + + private def sanitizeSchema(dataType: DataType): DataType = { + dataType match { + case structType: StructType => + DataTypes.createStructType(structType.fields.map(f => + DataTypes.createStructField(f.name.replaceAll("[:,-]", "_"), sanitizeSchema(f.dataType), f.nullable, f.metadata) + )) + case arrayType: ArrayType => + DataTypes.createArrayType(sanitizeSchema(arrayType.elementType), arrayType.containsNull) + case mapType: MapType => + DataTypes.createMapType(sanitizeSchema(mapType.keyType), sanitizeSchema(mapType.valueType), mapType.valueContainsNull) + case other => other + } + + + } + + override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { + if (!rdd.isEmpty()) { + // Cast the rdd to an interface that lets us get an array of OffsetRange + val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges + + // Get the singleton instance of SQLContext + val sqlContext = SparkSession + .builder() + .appName(appName) + .config(ssc.sparkContext.getConf) + .getOrCreate() + + + // this is used to implicitly convert an RDD to a DataFrame. 
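+            // (the actual conversion only happens in the JSON branch further down, where
+            // `import sqlContext.implicits._` brings the .toDF helper into scope)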
+ + val deserializer = getSerializer( + streamContext.getPropertyValue(INPUT_SERIALIZER).asString, + streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) + + + val records = rdd.mapPartitions(p => deserializeRecords(p, deserializer).iterator) + + + if (!records.isEmpty()) { + + + val sdf = new SimpleDateFormat(streamContext.getPropertyValue(DATE_FORMAT).asString) + + + val numPartitions = streamContext.getPropertyValue(NUM_PARTITIONS).asInteger() + val outputFormat = streamContext.getPropertyValue(OUTPUT_FORMAT).asString() + val doExcludeErrors = streamContext.getPropertyValue(EXCLUDE_ERRORS).asBoolean() + val recordType = streamContext.getPropertyValue(RECORD_TYPE).asString() + val outPath = streamContext.getPropertyValue(OUTPUT_FOLDER_PATH).asString() + + val records = rdd.mapPartitions(p => deserializeRecords(p, deserializer).iterator) + .filter(r => + r.hasField(FieldDictionary.RECORD_TYPE) && + r.getField(FieldDictionary.RECORD_TYPE).asString() == recordType) + .map(r => { + try { + if (r.hasField(FieldDictionary.RECORD_DAYTIME)) + r + else + r.setField(FieldDictionary.RECORD_DAYTIME, FieldType.STRING, sdf.format(r.getTime)) + } + catch { + case ex: Throwable => r + } + }) + + + if (!records.isEmpty()) { + var df: DataFrame = null; + val inputFormat = streamContext.getPropertyValue(INPUT_FORMAT).asString() + if (inputFormat.isEmpty) { + + val schema = SparkUtils.convertFieldsNameToSchema(records.take(1)(0)) + val rows = if (doExcludeErrors) { + records + .filter(r => !r.hasField(FieldDictionary.RECORD_ERRORS)) + .map(r => SparkUtils.convertToRow(r, schema)) + } else { + records.map(r => SparkUtils.convertToRow(r, schema)) + } + + + logger.info(schema.toString()) + df = sqlContext.createDataFrame(rows, schema) + } else { + if ("json".equals(inputFormat)) { + import sqlContext.implicits._ + val rdf = records.map(record => (record.getType, record.getField(FieldDictionary.RECORD_DAYTIME).asString)) + .toDF(FieldDictionary.RECORD_TYPE, FieldDictionary.RECORD_DAYTIME) + val json = sqlContext.read.json(records.map(record => record.getField(FieldDictionary.RECORD_VALUE).asString())) + val merged = rdf.rdd.zip(json.rdd) + .map { + case (rowLeft, rowRight) => Row.fromSeq(rowLeft.toSeq ++ rowRight.toSeq) + } + df = sqlContext.createDataFrame(merged, StructType(rdf.schema.fields ++ sanitizeSchema(json.schema).asInstanceOf[StructType].fields)) + } else { + throw new IllegalArgumentException(s"Input format $inputFormat is not supported") + } + } + + outputFormat match { + case FILE_FORMAT_PARQUET => + df.repartition(numPartitions) + .write + .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) + .mode(SaveMode.Append) + .parquet(outPath) + case FILE_FORMAT_JSON => + df.repartition(numPartitions) + .write + .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) + .mode(SaveMode.Append) + .json(outPath) + case FILE_FORMAT_ORC => + df.repartition(numPartitions) + .write + .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) + .mode(SaveMode.Append) + .orc(outPath) + case FILE_FORMAT_TXT => + df.repartition(numPartitions) + .write + .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) + .mode(SaveMode.Append) + .text(outPath) + case _ => + throw new IllegalArgumentException(s"$outputFormat not supported yet") + } + + /** + * save latest offset to Zookeeper + */ + // offsetRanges.foreach(offsetRange => zkSink.value.saveOffsetRangesToZookeeper(appName, offsetRange)) + } + + } + + return Some(offsetRanges) + } + None 
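+        // Note: the offset ranges returned above are not committed here; AbstractKafkaRecordStream
+        // commits them asynchronously (CanCommitOffsets.commitAsync) once process() has returned,
+        // i.e. after the files have been written out.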
+ } +} + + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamParallelProcessing.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamParallelProcessing.scala new file mode 100644 index 000000000..4fef443f3 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamParallelProcessing.scala @@ -0,0 +1,226 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.stream.spark + +import java.util +import java.util.Collections + +import com.hurence.logisland.component.PropertyDescriptor +import com.hurence.logisland.record.{FieldDictionary, Record, RecordUtils} +import com.hurence.logisland.util.record.RecordSchemaUtil +import com.hurence.logisland.util.spark.ProcessorMetrics +import org.apache.avro.Schema +import org.apache.kafka.clients.consumer.ConsumerRecord +import org.apache.kafka.common.errors.OffsetOutOfRangeException +import org.apache.spark.TaskContext +import org.apache.spark.groupon.metrics.{SparkMeter, UserMetricsSystem} +import org.apache.spark.rdd.RDD +import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, OffsetRange} +import org.slf4j.LoggerFactory + +import scala.collection.JavaConversions._ +import com.hurence.logisland.stream.StreamProperties._ + + +class KafkaRecordStreamParallelProcessing extends AbstractKafkaRecordStream { + val logger = LoggerFactory.getLogger(this.getClass) + + override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { + val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] + + descriptors.addAll(super.getSupportedPropertyDescriptors()) + Collections.unmodifiableList(descriptors) + } + + /** + * launch the chain of processing for each partition of the RDD in parallel + * + * @param rdd + */ + override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { + if (!rdd.isEmpty()) { + // Cast the rdd to an interface that lets us get an array of OffsetRange + val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges + + rdd.foreachPartition(partition => { + try { + if (partition.nonEmpty) { + /** + * index to get the correct offset range for the rdd partition we're working on + * This is safe because we haven't shuffled or otherwise disrupted partitioning, + * and the original input rdd partitions were 1:1 with kafka partitions + */ + val partitionId = TaskContext.get.partitionId() + val offsetRange = offsetRanges(TaskContext.get.partitionId) + + val pipelineMetricPrefix = streamContext.getIdentifier + "." + + "partition" + partitionId + "." 
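+                        // pipelineMetricPrefix scopes every metric of this micro-batch to the stream
+                        // identifier and the Kafka partition being processed; the timer started below
+                        // covers the whole processor chain for this partition and is only stopped once
+                        // the outgoing records have been pushed to Kafka.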
+ val pipelineTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + "Pipeline.processing_time_ms" ).time() + + + /** + * create serializers + */ + val deserializer = getSerializer( + streamContext.getPropertyValue(INPUT_SERIALIZER).asString, + streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) + val serializer = getSerializer( + streamContext.getPropertyValue(OUTPUT_SERIALIZER).asString, + streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) + val errorSerializer = getSerializer( + streamContext.getPropertyValue(ERROR_SERIALIZER).asString, + streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) + + /** + * process events by chaining output records + */ + var firstPass = true + var incomingEvents: util.Collection[Record] = Collections.emptyList() + var outgoingEvents: util.Collection[Record] = Collections.emptyList() + + streamContext.getProcessContexts.foreach(processorContext => { + val startTime = System.currentTimeMillis() + val processor = processorContext.getProcessor + + val processorTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + + processorContext.getIdentifier + ".processing_time_ms").time() + /** + * convert incoming Kafka messages into Records + * if there's no serializer we assume that we need to compute a Record from K/V + */ + if (firstPass) { + incomingEvents = if ( + streamContext.getPropertyValue(INPUT_SERIALIZER).asString + == NO_SERIALIZER.getValue) { + // parser + partition.map(rawMessage => { + val key = if (rawMessage.key() != null) new String(rawMessage.key()) else "" + val value = if (rawMessage.value() != null) new String(rawMessage.value()) else "" + RecordUtils.getKeyValueRecord(key, value) + }).toList + } else { + // processor + deserializeRecords(partition, deserializer) + } + + firstPass = false + } else { + incomingEvents = outgoingEvents + } + + /** + * process incoming events + */ + if (processor.hasControllerService) { + val controllerServiceLookup = controllerServiceLookupSink.value.getControllerServiceLookup() + processorContext.setControllerServiceLookup(controllerServiceLookup) + } + + if (!processor.isInitialized) { + processor.init(processorContext) + } + + outgoingEvents = processor.process(processorContext, incomingEvents) + + /** + * compute metrics + */ + ProcessorMetrics.computeMetrics( + pipelineMetricPrefix + processorContext.getIdentifier + ".", + incomingEvents, + outgoingEvents, + offsetRange.fromOffset, + offsetRange.untilOffset, + System.currentTimeMillis() - startTime) + + processorTimerContext.stop() + }) + + + /** + * Do we make records compliant with a given Avro schema ? 
+ */ + if (streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).isSet) { + try { + val strSchema = streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString() + val schema = RecordSchemaUtil.compileSchema(strSchema) + + outgoingEvents = outgoingEvents.map(record => RecordSchemaUtil.convertToValidRecord(record, schema)) + } catch { + case t: Throwable => + logger.warn("something wrong while converting records " + + "to valid accordingly to provide Avro schema " + t.getMessage) + } + + } + + /** + * push outgoing events and errors to Kafka + */ + kafkaSink.value.produce( + streamContext.getPropertyValue(OUTPUT_TOPICS).asString, + outgoingEvents.toList, + serializer + ) + + kafkaSink.value.produce( + streamContext.getPropertyValue(ERROR_TOPICS).asString, + outgoingEvents.filter(r => r.hasField(FieldDictionary.RECORD_ERRORS)).toList, + errorSerializer + ) + + pipelineTimerContext.stop() + } + } + catch { + case ex: OffsetOutOfRangeException => + val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString + val brokerList = streamContext.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString + /* val latestOffsetsString = zkSink.value.loadOffsetRangesFromZookeeper( + brokerList, + appName, + inputTopics.split(",").toSet) + .map(t => s"${t._1.topic}_${t._1.partition}:${t._2}") + .mkString(", ") + val offestsString = offsetRanges + .map(o => s"${o.topic}_${o.partition}:${o.fromOffset}/${o.untilOffset}") + .mkString(", ") + logger.error(s"unable to process partition. current Offsets $offestsString latest offsets $latestOffsetsString")*/ + logger.error(s"exception : ${ex.toString}") + + } + }) + Some(offsetRanges) + } + else None + } +} + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamSQLAggregator.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamSQLAggregator.scala new file mode 100644 index 000000000..bdc723ede --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamSQLAggregator.scala @@ -0,0 +1,160 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (bailet.thomas@gmail.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.stream.spark + +import java.util +import java.util.Collections + +import com.hurence.logisland.annotation.documentation.{CapabilityDescription, Tags} +import com.hurence.logisland.component.PropertyDescriptor +import com.hurence.logisland.record.{FieldDictionary, Record} +import com.hurence.logisland.util.spark.{ProcessorMetrics, SparkUtils} +import com.hurence.logisland.validator.StandardValidators +import org.apache.avro.Schema +import org.apache.kafka.clients.consumer.ConsumerRecord +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange} +import org.slf4j.LoggerFactory + +import scala.collection.JavaConversions._ +import com.hurence.logisland.stream.StreamProperties._ + + +@Tags(Array("stream", "SQL", "query", "record")) +@CapabilityDescription("This is a stream capable of SQL query interpretations.") +class KafkaRecordStreamSQLAggregator extends AbstractKafkaRecordStream { + + private val logger = LoggerFactory.getLogger(classOf[KafkaRecordStreamSQLAggregator]) + + + override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { + val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] + descriptors.addAll(super.getSupportedPropertyDescriptors()) + descriptors.add(MAX_RESULTS_COUNT) + descriptors.add(SQL_QUERY) + descriptors.add(OUTPUT_RECORD_TYPE) + Collections.unmodifiableList(descriptors) + } + + override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { + if (!rdd.isEmpty()) { + // Cast the rdd to an interface that lets us get an array of OffsetRange + // val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges + + val sqlContext = SparkSession + .builder() + .appName(appName) + .config(ssc.sparkContext.getConf) + .getOrCreate() + + // this is used to implicitly convert an RDD to a DataFrame. + @transient lazy val deserializer = getSerializer( + streamContext.getPropertyValue(INPUT_SERIALIZER).asString, + streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) + + val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString + + //here how to handle elements that are not successfully deserialized ??? + //currently we lose them ! + //I think we should create an ErrorRecord containing key, value. 
+ val records: RDD[Record] = rdd.mapPartitions(p => deserializeRecords(p, deserializer).iterator) + + /** + * get a Dataframe schema (either from an Avro schema or from the first record) + */ + val schema = try { + val parser = new Schema.Parser + val schema = parser.parse(streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) + SparkUtils.convertAvroSchemaToDataframeSchema(schema) + } + catch { + case e: Exception => + logger.error("unable to add schema :{}", e.getMessage) + SparkUtils.convertFieldsNameToSchema(records.take(1)(0)) + } + + if (!records.isEmpty()) { + + val rows = records.filter(r => !r.hasField(FieldDictionary.RECORD_ERRORS)) + .map(r => SparkUtils.convertToRow(r, schema)) + + + sqlContext.createDataFrame(rows, schema).createOrReplaceTempView(inputTopics) + + + + + val query = streamContext.getPropertyValue(SQL_QUERY).asString() + val outputRecordType = streamContext.getPropertyValue(OUTPUT_RECORD_TYPE).asString() + + sqlContext.sql(query).rdd + .foreachPartition(rows => { + val outgoingEvents = rows.map(row => SparkUtils.convertToRecord(row, outputRecordType)).toList + /** + * create serializers + */ + val serializer = getSerializer( + streamContext.getPropertyValue(OUTPUT_SERIALIZER).asString, + streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) + val errorSerializer = getSerializer( + streamContext.getPropertyValue(ERROR_SERIALIZER).asString, + streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) + + + + + /** + * push outgoing events and errors to Kafka + */ + kafkaSink.value.produce( + streamContext.getPropertyValue(OUTPUT_TOPICS).asString, + outgoingEvents, + serializer + ) + + kafkaSink.value.produce( + streamContext.getPropertyValue(ERROR_TOPICS).asString, + outgoingEvents.filter(r => r.hasField(FieldDictionary.RECORD_ERRORS)), + errorSerializer + ) + + }) + + + } + return None //Some(offsetRanges) + } + None + } +} + + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/SparkRecordStream.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/SparkRecordStream.scala new file mode 100644 index 000000000..19cff7c7b --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/SparkRecordStream.scala @@ -0,0 +1,34 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.stream.spark + +import com.hurence.logisland.engine.EngineContext +import com.hurence.logisland.stream.{RecordStream, StreamContext} +import org.apache.spark.streaming.StreamingContext + + +trait SparkRecordStream extends RecordStream { + + /** + * ssetup the stream with spark app properties + * + * @param appName + * @param ssc + * @param streamContext + */ + def setup(appName: String,ssc: StreamingContext, streamContext: StreamContext, engineContext: EngineContext) + def getStreamContext() : StreamingContext +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala new file mode 100644 index 000000000..de8cb9871 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala @@ -0,0 +1,546 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.stream + +import com.hurence.logisland.component.{AllowableValue, PropertyDescriptor} +import com.hurence.logisland.serializer._ +import com.hurence.logisland.stream.spark.structured.provider.StructuredStreamProviderService +import com.hurence.logisland.validator.StandardValidators + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +object StreamProperties { + + val NONE_TOPIC = "none" + + val DEFAULT_RAW_TOPIC = new AllowableValue("_raw", "default raw topic", "the incoming non structured topic") + val DEFAULT_RECORDS_TOPIC = new AllowableValue("_records", "default events topic", "the outgoing structured topic") + val DEFAULT_ERRORS_TOPIC = new AllowableValue("_errors", "default raw topic", "the outgoing structured error topic") + val DEFAULT_METRICS_TOPIC = new AllowableValue("_metrics", "default metrics topic", "the topic to place processing metrics") + + val INPUT_TOPICS: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.input.topics") + .description("Sets the input Kafka topic name") + .required(true) + .defaultValue(DEFAULT_RAW_TOPIC.getValue) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build + + val OUTPUT_TOPICS: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.output.topics") + .description("Sets the output Kafka topic name") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .defaultValue(DEFAULT_RECORDS_TOPIC.getValue) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build + + val ERROR_TOPICS: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.error.topics") + .description("Sets the error topics Kafka topic name") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .defaultValue(DEFAULT_ERRORS_TOPIC.getValue) + .build + + val INPUT_TOPICS_PARTITIONS: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.input.topics.partitions") + .description("if autoCreate is set to true, this will set the number of partition at topic creation time") + .required(false) + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .defaultValue("20") + .build + + val OUTPUT_TOPICS_PARTITIONS: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.output.topics.partitions") + .description("if autoCreate is set to true, this will set the number of partition at topic creation time") + .required(false) + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .defaultValue("20") + .build + + val AVRO_INPUT_SCHEMA: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("avro.input.schema") + .description("the avro schema definition") + .required(false) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build + + val AVRO_OUTPUT_SCHEMA: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("avro.output.schema") + .description("the avro schema definition for the output serialization") + .required(false) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build + + val AVRO_SERIALIZER = new AllowableValue(classOf[AvroSerializer].getName, + "avro serialization", "serialize events as avro blocs") + val JSON_SERIALIZER = new AllowableValue(classOf[JsonSerializer].getName, + "json serialization", "serialize events as json blocs") + val EXTENDED_JSON_SERIALIZER = new AllowableValue(classOf[ExtendedJsonSerializer].getName, + "extended json serialization", "serialize events as json blocs supporting nested objects/arrays") + val KRYO_SERIALIZER = new AllowableValue(classOf[KryoSerializer].getName, + "kryo serialization", "serialize events as binary blocs") + val STRING_SERIALIZER = new AllowableValue(classOf[StringSerializer].getName, + "string serialization", "serialize events as string") + val BYTESARRAY_SERIALIZER = new AllowableValue(classOf[BytesArraySerializer].getName, + "byte array serialization", "serialize events as byte arrays") + val 
KURA_PROTOCOL_BUFFER_SERIALIZER = new AllowableValue(classOf[KuraProtobufSerializer].getName, + "Kura Protobuf serialization", "serialize events as Kura protocol buffer") + val NO_SERIALIZER = new AllowableValue("none", "no serialization", "send events as bytes") + + val INPUT_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.input.topics.serializer") + .description("") + .required(false) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER) + .defaultValue(KRYO_SERIALIZER.getValue) + .build + + val OUTPUT_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.output.topics.serializer") + .description("") + .required(false) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER) + .defaultValue(KRYO_SERIALIZER.getValue) + .build + + val ERROR_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.error.topics.serializer") + .description("") + .required(false) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .defaultValue(JSON_SERIALIZER.getValue) + .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER) + .build + + + val KAFKA_TOPIC_AUTOCREATE: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.topic.autoCreate") + .description("define wether a topic should be created automatically if not already exists") + .required(false) + .addValidator(StandardValidators.BOOLEAN_VALIDATOR) + .defaultValue("true") + .build + + val KAFKA_TOPIC_DEFAULT_PARTITIONS: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.topic.default.partitions") + .description("if autoCreate is set to true, this will set the number of partition at topic creation time") + .required(false) + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .defaultValue("20") + .build + + val KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.topic.default.replicationFactor") + .description("if autoCreate is set to true, this will set the number of replica for each partition at topic creation time") + .required(false) + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .defaultValue("3") + .build + + val KAFKA_METADATA_BROKER_LIST: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.metadata.broker.list") + .description("a comma separated list of host:port brokers") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .defaultValue("sandbox:9092") + .build + + val KAFKA_ZOOKEEPER_QUORUM: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.zookeeper.quorum") + .description("") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .defaultValue("sandbox:2181") + .build + + val LATEST_OFFSET = new AllowableValue("latest", "latest", "the offset to the latest offset") + val EARLIEST_OFFSET = new AllowableValue("earliest", "earliest offset", "the offset to the earliest offset") + val NONE_OFFSET = new AllowableValue("none", "none offset", "the latest saved offset") + + val KAFKA_MANUAL_OFFSET_RESET: PropertyDescriptor = new PropertyDescriptor.Builder() + 
.name("kafka.manual.offset.reset") + .description("What to do when there is no initial offset in Kafka or if the current offset does not exist " + + "any more on the server (e.g. because that data has been deleted):\n" + + "earliest: automatically reset the offset to the earliest offset\n" + + "latest: automatically reset the offset to the latest offset\n" + + "none: throw exception to the consumer if no previous offset is found for the consumer's group\n" + + "anything else: throw exception to the consumer.") + .required(false) + .allowableValues(LATEST_OFFSET, EARLIEST_OFFSET, NONE_OFFSET) + .defaultValue(EARLIEST_OFFSET.getValue) + .build + + + val KAFKA_BATCH_SIZE: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.batch.size") + .description("measures batch size in total bytes instead of the number of messages. " + + "It controls how many bytes of data to collect before sending messages to the Kafka broker. " + + "Set this as high as possible, without exceeding available memory. The default value is 16384.\n\n" + + "If you increase the size of your buffer, it might never get full." + + "The Producer sends the information eventually, based on other triggers, such as linger time in milliseconds. " + + "Although you can impair memory usage by setting the buffer batch size too high, " + + "this does not impact latency.\n\n" + + "If your producer is sending all the time, " + + "you are probably getting the best throughput possible. If the producer is often idle, " + + "you might not be writing enough data to warrant the current allocation of resources.") + .required(false) + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .defaultValue("16384") + .build + + + val KAFKA_LINGER_MS: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.linger.ms") + .description("linger.ms sets the maximum time to buffer data in asynchronous mode. " + + "For example, a setting of 100 batches 100ms of messages to send at once. " + + "This improves throughput, but the buffering adds message delivery latency.\n\n" + + "By default, the producer does not wait. It sends the buffer any time data is available.\n\n" + + "Instead of sending immediately, you can set linger.ms to 5 and send more messages in one batch." + + " This would reduce the number of requests sent, but would add up to 5 milliseconds of latency to records " + + "sent, even if the load on the system does not warrant the delay.\n\n" + + "The farther away the broker is from the producer, the more overhead required to send messages. " + + "Increase linger.ms for higher latency and higher throughput in your producer.") + .required(false) + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .defaultValue("5") + .build + + val KAFKA_ACKS: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("kafka.acks") + .description("The number of acknowledgments the producer requires the leader to have received before considering a request complete. This controls the " + + " durability of records that are sent. The following settings are common: " + + "
    " + + "
  • acks=0 If set to zero then the producer will not wait for any acknowledgment from the" + + " server at all. The record will be immediately added to the socket buffer and considered sent. No guarantee can be" + + " made that the server has received the record in this case, and the retries configuration will not" + + " take effect (as the client won't generally know of any failures). The offset given back for each record will" + + " always be set to -1." + + "
  • acks=1 This will mean the leader will write the record to its local log but will respond" + + " without awaiting full acknowledgement from all followers. In this case should the leader fail immediately after" + + " acknowledging the record but before the followers have replicated it then the record will be lost." + + "
  • acks=all This means the leader will wait for the full set of in-sync replicas to" + + " acknowledge the record. This guarantees that the record will not be lost as long as at least one in-sync replica" + + " remains alive. This is the strongest available guarantee.") + .required(false) + .defaultValue("all") + .build + + + val WINDOW_DURATION: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("window.duration") + .description("all the elements in seen in a sliding window of time over. windowDuration = width of the window; must be a multiple of batching interval") + .addValidator(StandardValidators.LONG_VALIDATOR) + .required(false) + .build + + val SLIDE_DURATION: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("slide.duration") + .description("sliding interval of the window (i.e., the interval after which the new DStream will generate RDDs); must be a multiple of batching interval") + .addValidator(StandardValidators.LONG_VALIDATOR) + .required(false) + .build + + val GROUPBY: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("groupby") + .description("comma separated list of fields to group the partition by") + .addValidator(StandardValidators.COMMA_SEPARATED_LIST_VALIDATOR) + .required(false) + .build + + val STATE_TIMEOUT_MS: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("state.timeout.ms") + .description("the time in ms before we invalidate the microbatch state") + .addValidator(StandardValidators.LONG_VALIDATOR) + .required(false) + .defaultValue("2000") + .build + + val CHUNK_SIZE: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("chunk.size") + .description("the number of records to group into chunks") + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .required(false) + .defaultValue("100") + .build + ////////////////////////////////////// + // MQTT options + ////////////////////////////////////// + + val MQTT_BROKER_URL: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("mqtt.broker.url") + .description("brokerUrl A url MqttClient connects to. Set this or path as the url of the Mqtt Server. e.g. tcp://localhost:1883") + .addValidator(StandardValidators.URL_VALIDATOR) + .defaultValue("tcp://localhost:1883") + .required(false) + .build + + val MQTT_PERSISTENCE: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("mqtt.persistence") + .description("persistence By default it is used for storing incoming messages on disk. " + + "If memory is provided as value for this option, then recovery on restart is not supported.") + .defaultValue("memory") + .required(false) + .build + + val MQTT_TOPIC: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("mqtt.topic") + .description("Topic MqttClient subscribes to.") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(true) + .build + + val MQTT_CLIENTID: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("mqtt.client.id") + .description("clientID this client is associated. 
Provide the same value to recover a stopped client.") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(true) + .build + + val MQTT_QOS: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("mqtt.qos") + .description(" QoS The maximum quality of service to subscribe each topic at.Messages published at a lower " + + "quality of service will be received at the published QoS.Messages published at a higher quality of " + + "service will be received using the QoS specified on the subscribe") + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .defaultValue("0") + .required(false) + .build + + val MQTT_USERNAME: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("mqtt.username") + .description(" username Sets the user name to use for the connection to Mqtt Server. " + + "Do not set it, if server does not need this. Setting it empty will lead to errors.") + .required(false) + .build + + val MQTT_PASSWORD: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("mqtt.password") + .description("password Sets the password to use for the connection") + .required(false) + .build + + val MQTT_CLEAN_SESSION: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("mqtt.clean.session") + .description("cleanSession Setting it true starts a clean session, removes all checkpointed messages by " + + "a previous run of this source. This is set to false by default.") + .addValidator(StandardValidators.BOOLEAN_VALIDATOR) + .defaultValue("true") + .required(false) + .build + + val MQTT_CONNECTION_TIMEOUT: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("mqtt.connection.timeout") + .description("connectionTimeout Sets the connection timeout, a value of 0 is interpreted as " + + "wait until client connects. 
See MqttConnectOptions.setConnectionTimeout for more information") + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .defaultValue("5000") + .required(false) + .build + + val MQTT_KEEP_ALIVE: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("mqtt.keep.alive") + .description("keepAlive Same as MqttConnectOptions.setKeepAliveInterval.") + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .defaultValue("5000") + .required(false) + .build + + + val MQTT_VERSION: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("mqtt.version") + .description("mqttVersion Same as MqttConnectOptions.setMqttVersion") + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .defaultValue("5000") + .required(false) + .build + + val READ_TOPICS: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("read.topics") + .description("the input path for any topic to be read from") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(true) + .build + + val READ_TOPICS_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("read.topics.serializer") + .description("the serializer to use") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER, KURA_PROTOCOL_BUFFER_SERIALIZER) + .defaultValue(NO_SERIALIZER.getValue) + .build + + val READ_TOPICS_KEY_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("read.topics.key.serializer") + .description("The key serializer to use") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, KURA_PROTOCOL_BUFFER_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER) + .defaultValue(NO_SERIALIZER.getValue) + .build + + val READ_STREAM_SERVICE_PROVIDER: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("read.stream.service.provider") + .description("the controller service that gives connection information") + .required(true) + .identifiesControllerService(classOf[StructuredStreamProviderService]) + .build + + + val WRITE_TOPICS: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("write.topics") + .description("the input path for any topic to be written to") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(true) + .build + + val WRITE_TOPICS_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("write.topics.serializer") + .description("the serializer to use") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER, KURA_PROTOCOL_BUFFER_SERIALIZER) + .defaultValue(NO_SERIALIZER.getValue) + .build + + val WRITE_TOPICS_KEY_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("write.topics.key.serializer") + .description("The key serializer to use") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER, KURA_PROTOCOL_BUFFER_SERIALIZER) + .defaultValue(NO_SERIALIZER.getValue) + .build + + val WRITE_STREAM_SERVICE_PROVIDER: PropertyDescriptor = new PropertyDescriptor.Builder() + 
.name("write.stream.service.provider") + .description("the controller service that gives connection information") + .required(true) + .identifiesControllerService(classOf[StructuredStreamProviderService]) + .build + + + ////////////////////////////////////// + // HDFS options + ////////////////////////////////////// + val FILE_FORMAT_PARQUET = "parquet" + val FILE_FORMAT_ORC = "orc" + val FILE_FORMAT_JSON = "json" + val FILE_FORMAT_TXT = "txt" + + val OUTPUT_FOLDER_PATH = new PropertyDescriptor.Builder() + .name("output.folder.path") + .description("the location where to put files : file:///tmp/out") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build + + + val INPUT_FORMAT = new PropertyDescriptor.Builder() + .name("input.format") + .description("Used to load data from a raw record_value. Only json supported") + .required(false) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .defaultValue("") + .build + + val OUTPUT_FORMAT = new PropertyDescriptor.Builder() + .name("output.format") + .description("can be parquet, orc csv") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .allowableValues(FILE_FORMAT_PARQUET, FILE_FORMAT_TXT, FILE_FORMAT_JSON, FILE_FORMAT_JSON) + .build + + val RECORD_TYPE = new PropertyDescriptor.Builder() + .name("record.type") + .description("the type of event to filter") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build + + val NUM_PARTITIONS = new PropertyDescriptor.Builder() + .name("num.partitions") + .description("the numbers of physical files on HDFS") + .required(false) + .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR) + .defaultValue("4") + .build + + val EXCLUDE_ERRORS = new PropertyDescriptor.Builder() + .name("exclude.errors") + .description("do we include records with errors ?") + .required(false) + .addValidator(StandardValidators.BOOLEAN_VALIDATOR) + .defaultValue("true") + .build + + val DATE_FORMAT = new PropertyDescriptor.Builder() + .name("date.format") + .description("The format of the date for the partition") + .required(false) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .defaultValue("yyyy-MM-dd") + .build + + + ////////////////////////////////////// + // SQL options + ////////////////////////////////////// + val SQL_QUERY = new PropertyDescriptor.Builder() + .name("sql.query") + .description("The SQL query to execute, " + + "please note that the table name must exists in input topics names") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build + + val MAX_RESULTS_COUNT = new PropertyDescriptor.Builder() + .name("max.results.count") + .description("the max number of rows to output. 
(-1 for no limit)") + .required(false) + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .defaultValue("-1") + .build + + val OUTPUT_RECORD_TYPE = new PropertyDescriptor.Builder() + .name("output.record.type") + .description("the output type of the record") + .required(false) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .defaultValue("aggregation") + .build + + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectBaseProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectBaseProviderService.scala new file mode 100644 index 000000000..61d3d2592 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectBaseProviderService.scala @@ -0,0 +1,112 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.stream.spark.provider + +import java.util +import java.util.Collections + +import com.hurence.logisland.annotation.lifecycle.OnEnabled +import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} +import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} +import com.hurence.logisland.record.Record +import com.hurence.logisland.stream.StreamContext +import com.hurence.logisland.stream.spark.structured.provider.StructuredStreamProviderService +import com.hurence.logisland.util.spark.ControllerServiceLookupSink +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.streaming.DataStreamWriter +import org.apache.spark.sql.{Dataset, SparkSession} + +class KafkaConnectBaseProviderService extends AbstractControllerService with StructuredStreamProviderService { + + var connectorProperties = "" + var keyConverter = "" + var valueConverter = "" + var keyConverterProperties = "" + var valueConverterProperties = "" + var maxConfigurations = 1 + var delegateConnectorClass = "" + var offsetBackingStore = "" + var offsetBackingStoreProperties = "" + + @OnEnabled + @throws[InitializationException] + override def init(context: ControllerServiceInitializationContext): Unit = { + super.init(context) + this.synchronized { + try { + delegateConnectorClass = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_CONNECTOR_CLASS).asString() + connectorProperties = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_CONNECTOR_PROPERTIES).asString() + valueConverter = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER).asString() + valueConverterProperties = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER_PROPERTIES).asString() + 
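+                // Note: the connector and converter "properties" values gathered in this block are kept as raw
+                // multi-line "key=value" strings at this point; they are only parsed later (e.g. via
+                // Utils.propertiesToMap in the sink provider below) when the Kafka Connect connector is created.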
keyConverter = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER).asString() + keyConverterProperties = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER_PROPERTIES).asString() + maxConfigurations = (context getPropertyValue StreamOptions.KAFKA_CONNECT_MAX_TASKS).asInteger() + offsetBackingStore = (context getPropertyValue StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE).asString() + offsetBackingStoreProperties = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE_PROPERTIES).asString() + } catch { + case e: Exception => + throw new InitializationException(e) + } + } + } + + + /** + * Allows subclasses to register which property descriptor objects are + * supported. + * + * @return PropertyDescriptor objects this processor currently supports + */ + override def getSupportedPropertyDescriptors() = { + val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] + descriptors.add(StreamOptions.KAFKA_CONNECT_CONNECTOR_CLASS) + descriptors.add(StreamOptions.KAFKA_CONNECT_CONNECTOR_PROPERTIES) + descriptors.add(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER) + descriptors.add(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER_PROPERTIES) + descriptors.add(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER) + descriptors.add(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER_PROPERTIES) + descriptors.add(StreamOptions.KAFKA_CONNECT_MAX_TASKS) + descriptors.add(StreamOptions.KAFKA_CONNECT_MAX_PARTITIONS) + descriptors.add(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE) + descriptors.add(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE_PROPERTIES) + Collections.unmodifiableList(descriptors) + } + + + /** + * create a streaming DataFrame that represents data received + * + * @param spark + * @param streamContext + * @return DataFrame currently loaded + */ + override def read(spark: SparkSession, streamContext: StreamContext): Dataset[Record] = { + throw new UnsupportedOperationException("Operation not supported. Please be sure to use the right component") + } + + + /** + * create a streaming DataFrame that represents data to be written + * + * @param streamContext + * @return DataFrame currently loaded + */ + override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[_] = { + throw new UnsupportedOperationException("Operation not supported. Please be sure to use the right component") + } + + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSinkProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSinkProviderService.scala new file mode 100644 index 000000000..066a41711 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSinkProviderService.scala @@ -0,0 +1,122 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.stream.spark.provider + +import com.hurence.logisland.annotation.lifecycle.OnEnabled +import com.hurence.logisland.component.InitializationException +import com.hurence.logisland.connect.Utils +import com.hurence.logisland.connect.sink.KafkaConnectStreamSink +import com.hurence.logisland.controller.ControllerServiceInitializationContext +import com.hurence.logisland.record.{FieldDictionary, Record} +import com.hurence.logisland.stream.{StreamContext, StreamProperties} +import com.hurence.logisland.util.spark.ControllerServiceLookupSink +import org.apache.kafka.connect.sink.SinkConnector +import org.apache.spark.TaskContext +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql._ +import org.apache.spark.sql.streaming.DataStreamWriter + +class KafkaConnectStructuredSinkProviderService extends KafkaConnectBaseProviderService { + + + var maxPartitions = 1 + @transient var writer: KafkaConnectStreamSink = null + + @OnEnabled + @throws[InitializationException] + override def init(context: ControllerServiceInitializationContext): Unit = { + super.init(context) + this.synchronized { + try { + maxPartitions = maxConfigurations + if (context.getPropertyValue(StreamOptions.KAFKA_CONNECT_MAX_PARTITIONS).isSet) { + maxPartitions = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_MAX_PARTITIONS).asInteger() + } + } catch { + case e: Exception => + throw new InitializationException(e) + } + } + } + + + /** + * create a streaming DataFrame that represents data to be written + * + * @param streamContext + * @return DataFrame currently loaded + */ + override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[_] = { + implicit val encoder = Encoders.tuple(Encoders.BINARY, Encoders.BINARY) + val df2 = df + .mapPartitions(record => record.map(record => (record.getField(FieldDictionary.RECORD_KEY).getRawValue().asInstanceOf[Array[Byte]], + record.getField(FieldDictionary.RECORD_VALUE).getRawValue().asInstanceOf[Array[Byte]]))) + .toDF("key", "value") + + val topicName = streamContext.getPropertyValue(StreamProperties.WRITE_TOPICS).asString().split(",")(0).trim + + def writer() = controllerServiceLookupSink.value.getControllerService(getIdentifier).asInstanceOf[KafkaConnectStructuredSinkProviderService] + .createWriter(SparkSession.builder().getOrCreate().sqlContext, streamContext, topicName) + + df2/*.repartition(maxPartitions, df2.col("key"))*/ + .writeStream + .foreach(new ForeachWriter[Row] { + + override def process(value: Row): Unit = { + writer().enqueueOnPartition(TaskContext.getPartitionId(), value.getAs(0), value.getAs(1)) + } + + override def close(errorOrNull: Throwable): Unit = { + if (errorOrNull != null) { + logger.error("Error while storing data", errorOrNull) + } + writer().flushPartition(TaskContext.getPartitionId()) + } + + override def open(partitionId: Long, version: Long): Boolean = { + writer().openPartition(partitionId.intValue()) + } + }) + } + + + def createWriter(sqlContext: SQLContext, streamContext: 
StreamContext, topic: String): KafkaConnectStreamSink = + synchronized { + + if (writer == null) { + val keyConverterInstance = Utils.createConverter(keyConverter, keyConverterProperties, true) + val valueConverterInstance = Utils.createConverter(valueConverter, valueConverterProperties, false) + //create the right backing store + val offsetBackingStoreInstance = Utils.createOffsetBackingStore(offsetBackingStore, Utils.propertiesToMap(offsetBackingStoreProperties)) + + writer = new KafkaConnectStreamSink( + sqlContext, + Utils.propertiesToMap(connectorProperties), + keyConverterInstance, + valueConverterInstance, + offsetBackingStoreInstance, + maxConfigurations, + topic, + delegateConnectorClass, + streamContext.getIdentifier) + writer.start() + } + + writer + } + + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSourceProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSourceProviderService.scala new file mode 100644 index 000000000..97b30d87b --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSourceProviderService.scala @@ -0,0 +1,83 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.hurence.logisland.stream.spark.provider + +import com.hurence.logisland.annotation.lifecycle.OnEnabled +import com.hurence.logisland.component.InitializationException +import com.hurence.logisland.controller.ControllerServiceInitializationContext +import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord} +import com.hurence.logisland.stream.StreamContext +import org.apache.spark.SparkContext +import org.apache.spark.sql.{Dataset, SparkSession} + +class KafkaConnectStructuredSourceProviderService extends KafkaConnectBaseProviderService { + + var maxPartitions = 1 + + @OnEnabled + @throws[InitializationException] + override def init(context: ControllerServiceInitializationContext): Unit = { + super.init(context) + this.synchronized { + try { + maxPartitions = SparkContext.getOrCreate().defaultParallelism + if (context.getPropertyValue(StreamOptions.KAFKA_CONNECT_MAX_PARTITIONS).isSet) { + maxPartitions = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_MAX_PARTITIONS).asInteger() + } + } catch { + case e: Exception => + throw new InitializationException(e) + } + } + } + + + /** + * create a streaming DataFrame that represents data received + * + * @param spark + * @param streamContext + * @return DataFrame currently loaded + */ + override def read(spark: SparkSession, streamContext: StreamContext): Dataset[Record] = { + import spark.implicits._ + implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] + + getLogger.info(s"Connecting kafka-connect source $delegateConnectorClass") + spark.readStream + .format("com.hurence.logisland.connect.source.KafkaConnectStreamSourceProvider") + .option(StreamOptions.KAFKA_CONNECT_CONNECTOR_PROPERTIES.getName, connectorProperties) + .option(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER.getName, keyConverter) + .option(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER_PROPERTIES.getName, keyConverterProperties) + .option(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER.getName, valueConverter) + .option(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER_PROPERTIES.getName, valueConverterProperties) + .option(StreamOptions.KAFKA_CONNECT_MAX_TASKS.getName, maxConfigurations) + .option(StreamOptions.KAFKA_CONNECT_CONNECTOR_CLASS.getName, delegateConnectorClass) + .option(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE.getName, offsetBackingStore) + .option(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE_PROPERTIES.getName, offsetBackingStoreProperties) + .load(streamContext.getIdentifier) + //Topic, Partition, Key, Value + .as[(String, String, String, Array[Byte], Array[Byte])] + .map(r => + new StandardRecord("kafka_connect") + .setField(FieldDictionary.RECORD_KEY, FieldType.BYTES, r._4) + .setField(FieldDictionary.RECORD_VALUE, FieldType.BYTES, r._5)) + .coalesce(maxPartitions) + } + + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/package.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/package.scala new file mode 100644 index 000000000..0c7ff6c9f --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/package.scala @@ -0,0 +1,126 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.hurence.logisland.stream.spark.provider + +import com.hurence.logisland.component.{AllowableValue, PropertyDescriptor} +import com.hurence.logisland.validator.StandardValidators +import org.apache.kafka.connect.connector.Connector +import org.apache.kafka.connect.runtime.standalone.StandaloneConfig +import org.apache.kafka.connect.storage.Converter + +object StreamOptions { + + val MEMORY_BACKING_STORE = new AllowableValue("memory", "In memory backing store", + "Standalone in memory offset backing store. Not suitable for clustered deployments unless source is unique or stateless") + + val FILE_BACKING_STORE = new AllowableValue("file", "File based backing store", + "Standalone filesystem based offset backing store. " + + "You have to specify the property " + StandaloneConfig.OFFSET_STORAGE_FILE_FILENAME_CONFIG + " for the file path." + + "Not suitable for clustered deployments unless source is unique or standalone") + + val KAFKA_BACKING_STORE = new AllowableValue("kafka", "Kafka topic based backing store", + "Distributed kafka topic based offset backing store. " + + "See the javadoc of class org.apache.kafka.connect.storage.KafkaOffsetBackingStore for the configuration options." + + "This backing store is well suited for distributed deployments.") + + + ////////////////////////////////////// + // Kafka Connect options + ////////////////////////////////////// + + + val KAFKA_CONNECT_CONNECTOR_CLASS = new PropertyDescriptor.Builder() + .name("kc.connector.class") + .description("The class canonical name of the kafka connector to use.") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build + + val KAFKA_CONNECT_CONNECTOR_PROPERTIES = new PropertyDescriptor.Builder() + .name("kc.connector.properties") + .description("The properties (key=value) for the connector.") + .required(false) + .defaultValue("") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build + + val KAFKA_CONNECT_MAX_TASKS = new PropertyDescriptor.Builder() + .name("kc.worker.tasks.max") + .description("Max number of threads for this connector") + .required(true) + .defaultValue("1") + .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) + .build + + val KAFKA_CONNECT_MAX_PARTITIONS = new PropertyDescriptor.Builder() + .name("kc.partitions.max") + .description("Max number of partitions for this connector.") + .required(false) + .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) + .build + + val KAFKA_CONNECT_KEY_CONVERTER = new PropertyDescriptor.Builder() + .name("kc.data.key.converter") + .description("Key converter class") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .addValidator(StandardValidators.TYPE_VALIDATOR(classOf[Converter])) + .build + + val KAFKA_CONNECT_VALUE_CONVERTER = new PropertyDescriptor.Builder() + .name("kc.data.value.converter") + .description("Value converter class") + .required(true) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + 
.addValidator(StandardValidators.TYPE_VALIDATOR(classOf[Converter])) + .build + + val KAFKA_CONNECT_KEY_CONVERTER_PROPERTIES = new PropertyDescriptor.Builder() + .name("kc.data.key.converter.properties") + .description("Key converter properties") + .required(false) + .defaultValue("") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build + + val KAFKA_CONNECT_VALUE_CONVERTER_PROPERTIES = new PropertyDescriptor.Builder() + .name("kc.data.value.converter.properties") + .description("Value converter properties") + .required(false) + .defaultValue("") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build + + + val KAFKA_CONNECT_OFFSET_BACKING_STORE = new PropertyDescriptor.Builder() + .name("kc.connector.offset.backing.store") + .required(false) + .description("The underlying backing store to be used.") + .defaultValue(MEMORY_BACKING_STORE.getValue) + .allowableValues(MEMORY_BACKING_STORE, FILE_BACKING_STORE, KAFKA_BACKING_STORE) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build() + + val KAFKA_CONNECT_OFFSET_BACKING_STORE_PROPERTIES = new PropertyDescriptor.Builder() + .name("kc.connector.offset.backing.store.properties") + .description("Properties to configure the offset backing store") + .required(false) + .defaultValue("") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala new file mode 100644 index 000000000..d5ebc9427 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala @@ -0,0 +1,184 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.stream.spark.structured + +import java.util +import java.util.Collections + +import com.hurence.logisland.component.PropertyDescriptor +import com.hurence.logisland.engine.EngineContext +import com.hurence.logisland.engine.spark.remote.PipelineConfigurationBroadcastWrapper +import com.hurence.logisland.stream.StreamProperties._ +import com.hurence.logisland.stream.spark.SparkRecordStream +import com.hurence.logisland.stream.spark.structured.provider.StructuredStreamProviderService +import com.hurence.logisland.stream.{AbstractRecordStream, StreamContext} +import com.hurence.logisland.util.spark._ +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.groupon.metrics.UserMetricsSystem +import org.apache.spark.sql.{Dataset, SQLContext, SparkSession} +import org.apache.spark.streaming.StreamingContext +import org.slf4j.LoggerFactory + + +class StructuredStream extends AbstractRecordStream with SparkRecordStream { + + + protected var provider: StructuredStreamProviderService = _ + + + protected var appName: String = "" + @transient protected var ssc: StreamingContext = _ + @transient protected var streamContext: StreamContext = _ + protected var engineContext: EngineContext = _ + protected var controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink] = _ + protected var needMetricsReset = false + + + private val logger = LoggerFactory.getLogger(this.getClass) + + override def getSupportedPropertyDescriptors() = { + val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] + + descriptors.add(READ_STREAM_SERVICE_PROVIDER) + descriptors.add(READ_TOPICS_SERIALIZER) + descriptors.add(READ_TOPICS_KEY_SERIALIZER) + descriptors.add(WRITE_STREAM_SERVICE_PROVIDER) + descriptors.add(WRITE_TOPICS_SERIALIZER) + descriptors.add(WRITE_TOPICS_KEY_SERIALIZER) + descriptors.add(GROUPBY) + descriptors.add(STATE_TIMEOUT_MS) + descriptors.add(CHUNK_SIZE) + + Collections.unmodifiableList(descriptors) + } + + override def setup(appName: String, ssc: StreamingContext, streamContext: StreamContext, engineContext: EngineContext) = { + this.appName = appName + this.ssc = ssc + this.streamContext = streamContext + this.engineContext = engineContext + } + + override def getStreamContext(): StreamingContext = this.ssc + + override def start() = { + if (ssc == null) + throw new IllegalStateException("stream not initialized") + + try { + + val pipelineMetricPrefix = streamContext.getIdentifier /*+ ".partition" + partitionId*/ + "." 
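+      // The timer below is started while the pipeline is wired (read service -> write service) and is
+      // stopped further down in this try block, once the streaming query has been built.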
+      val pipelineTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + "Pipeline.processing_time_ms").time()
+
+      controllerServiceLookupSink = ssc.sparkContext.broadcast(
+        ControllerServiceLookupSink(engineContext.getControllerServiceConfigurations)
+      )
+      val spark = SparkSession.builder()
+        .config(this.ssc.sparkContext.getConf)
+        .getOrCreate()
+
+      spark.sqlContext.setConf("spark.sql.shuffle.partitions", "4") //TODO make this configurable
+
+
+      val controllerServiceLookup = controllerServiceLookupSink.value.getControllerServiceLookup()
+      streamContext.setControllerServiceLookup(controllerServiceLookup)
+
+
+      val readStreamService = streamContext.getPropertyValue(READ_STREAM_SERVICE_PROVIDER)
+        .asControllerService()
+        .asInstanceOf[StructuredStreamProviderService]
+
+      //TODO strange way to update the streamContext, shouldn't it be broadcast ?
+      // moreover the streamContext should always be the last updated one in this function for me.
+      // If the driver wants to change it, it should call setup which would use a broadcast value for example ?
+      // Unfortunately we should not attempt changes before having good unit tests so that we do not break streams
+      // while cleaning the streams code... Indeed I am afraid the remote api engines use this strange behaviour here
+      // to change config on the fly when it should use the setup method (maybe using broadcast as well).
+      // In this method start, the config should be considered already up to date in my opinion.
+      streamContext.getProcessContexts.clear()
+      streamContext.getProcessContexts.addAll(
+        PipelineConfigurationBroadcastWrapper.getInstance().get(streamContext.getIdentifier))
+
+      val readDF = readStreamService.load(spark, controllerServiceLookupSink, streamContext)
+
+      val writeStreamService = streamContext.getPropertyValue(WRITE_STREAM_SERVICE_PROVIDER)
+        .asControllerService()
+        .asInstanceOf[StructuredStreamProviderService]
+
+      // Hand the records over to the configured write stream service (e.g. a Kafka topic specified in an option)
+      val ds = writeStreamService.save(readDF, controllerServiceLookupSink, streamContext)
+      pipelineTimerContext.stop()
+
+    }
+    catch {
+      case ex: Throwable =>
+        logger.error("Error while processing the streaming query. 
", ex) + throw new IllegalStateException("Error while processing the streaming query", ex) + } + } + + override def stop(): Unit + + = { + super.stop() + //stop the source + val thisStream = SQLContext.getOrCreate(getStreamContext().sparkContext).streams.active.find(stream => streamContext.getIdentifier.equals(stream.name)); + if (thisStream.isDefined) { + if (!getStreamContext().sparkContext.isStopped && thisStream.get.isActive) { + try { + thisStream.get.stop() + thisStream.get.awaitTermination() + } catch { + case ex: Throwable => logger.warn(s"Stream ${streamContext.getIdentifier} may not have been correctly stopped") + } + } + } else { + logger.warn(s"Unable to find an active streaming query for stream ${streamContext.getIdentifier}") + } + } +} + + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/ConsoleStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/ConsoleStructuredStreamProviderService.scala new file mode 100644 index 000000000..af129a23d --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/ConsoleStructuredStreamProviderService.scala @@ -0,0 +1,183 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.stream.spark.structured.provider + +import java.util +import java.util.Collections + +import com.hurence.logisland.annotation.documentation.CapabilityDescription +import com.hurence.logisland.annotation.lifecycle.OnEnabled +import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} +import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} +import com.hurence.logisland.record.Record +import com.hurence.logisland.serializer.SerializerProvider +import com.hurence.logisland.stream.StreamContext +import com.hurence.logisland.stream.StreamProperties.{AVRO_OUTPUT_SCHEMA, WRITE_TOPICS_KEY_SERIALIZER, WRITE_TOPICS_SERIALIZER} +import com.hurence.logisland.util.spark.ControllerServiceLookupSink +import com.hurence.logisland.validator.StandardValidators +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.streaming.{DataStreamWriter, OutputMode, StreamingQuery} +import org.apache.spark.sql.{Dataset, ForeachWriter, SparkSession} + +@CapabilityDescription("Provide a ways to print output in console in a StructuredStream streams") +class ConsoleStructuredStreamProviderService extends AbstractControllerService with StructuredStreamProviderService { + + val NUM_ROWS_TO_SHOW: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("rows") + .description("Number of rows to print every trigger (default: 20 see spark documentation)") + .addValidator(StandardValidators.POSITIVE_LONG_VALIDATOR) + .required(true) + .build + + val TRUNCATE_OUTPUT: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("truncate") + .description("Whether to truncate the output if too long (default: true see spark documentation) ") + .addValidator(StandardValidators.BOOLEAN_VALIDATOR) + .required(false) + .build + + var numRows: Option[Long] = _ + var truncate: Option[Boolean] = _ + + @OnEnabled + @throws[InitializationException] + override def init(context: ControllerServiceInitializationContext): Unit = { + super.init(context) + this.synchronized { + try { + if (context.getPropertyValue(NUM_ROWS_TO_SHOW).isSet) { + numRows = Some(context.getPropertyValue(NUM_ROWS_TO_SHOW).asLong()) + } else { + numRows = None + } + if (context.getPropertyValue(TRUNCATE_OUTPUT).isSet) { + truncate = Some(context.getPropertyValue(TRUNCATE_OUTPUT).asBoolean()) + } else { + truncate = None + } + } catch { + case e: Exception => + throw new InitializationException(e) + } + } + } + + /** + * Allows subclasses to register which property descriptor objects are + * supported. 
+ * + * @return PropertyDescriptor objects this processor currently supports + */ + override def getSupportedPropertyDescriptors() = { + val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] + + Collections.unmodifiableList(descriptors) + } + + /** + * create a streaming DataFrame that represents data received + * + * @param spark + * @param streamContext + * @return DataFrame currently loaded + */ + override def read(spark: SparkSession, streamContext: StreamContext) = { + throw new IllegalArgumentException("ConsoleStructuredStreamProviderService class does not support read operation yet"); + } + + /** + * create a streaming DataFrame that represents data received + * + * @param streamContext + * @return DataFrame currently loaded + */ + override def save(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): StreamingQuery = { + + implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] + + // make sure controller service lookup won't be serialized !! + streamContext.setControllerServiceLookup(null) + + // create serializer + val serializer = SerializerProvider.getSerializer( + streamContext.getPropertyValue(WRITE_TOPICS_SERIALIZER).asString, + streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) + + // create serializer + val keySerializer = SerializerProvider.getSerializer( + streamContext.getPropertyValue(WRITE_TOPICS_KEY_SERIALIZER).asString, null) + + // do the parallel processing + val df2 = df.mapPartitions(record => record.map(record => serializeRecords(serializer, keySerializer, record))) + + + write(df2, controllerServiceLookupSink, streamContext) + .queryName(streamContext.getIdentifier) + // .outputMode("update") + .foreach(new ForeachWriter[Record] { + def open(partitionId: Long, version: Long): Boolean = { + // open connection + true + } + + def process(record: Record) = { + println(record) + // write string to connection + } + + def close(errorOrNull: Throwable): Unit = { + // close the connection + } + }).start() + + // .processAllAvailable() + + } + /** + * create a streaming DataFrame that represents data received + * + * @param streamContext + * @return DataFrame currently loaded + */ + override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[Record] = { +// implicit val myObjEncoder = org.apache.spark.sql.Encoders.kryo[Record] + + val dataStreamWriter = df.writeStream + .format("console") + if (numRows.isDefined) { + dataStreamWriter.option("numRows", numRows.get) + } + if (truncate.isDefined) { + dataStreamWriter.option("truncate", truncate.get) + } + dataStreamWriter + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/KafkaStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/KafkaStructuredStreamProviderService.scala new file mode 100644 index 000000000..f081e4046 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/KafkaStructuredStreamProviderService.scala @@ -0,0 +1,275 @@ 
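For orientation, here is a minimal, hypothetical sketch of the plain Spark structured-streaming calls that the provider added below wraps with Logisland records and property descriptors; the broker address, topic names, master and checkpoint directory are illustrative placeholders, not values taken from this patch:

import org.apache.spark.sql.SparkSession

object KafkaStructuredStreamSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("kafka-structured-stream-sketch")
      .master("local[*]") // placeholder master, for a local test only
      .getOrCreate()

    // Source: subscribe to an input topic, keeping the key as a string and the value as raw bytes,
    // which is what the provider's read() method does before wrapping rows into logisland records.
    val in = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092") // placeholder broker list
      .option("subscribe", "logisland_raw")                // placeholder input topic
      .load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS BINARY)")

    // Sink: write the same key/value columns back to an output topic; structured streaming
    // requires a checkpoint location for the query.
    val query = in.writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("topic", "logisland_events")              // placeholder output topic
      .option("checkpointLocation", "/tmp/checkpoints") // placeholder checkpoint directory
      .start()

    query.awaitTermination()
  }
}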
+/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.stream.spark.structured.provider + +import java.util +import java.util.Collections + +import com.hurence.logisland.annotation.documentation.CapabilityDescription +import com.hurence.logisland.annotation.lifecycle.OnEnabled +import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} +import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} +import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord} +import com.hurence.logisland.stream.StreamContext +import com.hurence.logisland.stream.StreamProperties._ +import com.hurence.logisland.util.kafka.KafkaSink +import com.hurence.logisland.util.spark.ControllerServiceLookupSink +import kafka.admin.AdminUtils +import org.apache.kafka.clients.consumer.ConsumerConfig +import org.apache.kafka.clients.producer.ProducerConfig +import org.apache.kafka.common.security.JaasUtils +import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySerializer} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.{Dataset, SparkSession} + +/** + * Compatible with kafka 2.4 or higher + */ +@CapabilityDescription("Provide a ways to use kafka as input or output in StructuredStream streams") +class KafkaStructuredStreamProviderService() extends AbstractControllerService with StructuredStreamProviderService { + + // private val logger = LoggerFactory.getLogger(this.getClass) + + + var appName = "" + var kafkaSinkParams: Map[String, Object] = _ + var kafkaParams: Map[String, Object] = _ + // Define the Kafka parameters, broker list must be specified + var inputTopics = Set[String]() + var outputTopics = Set[String]() + var errorTopics = Set[String]() + var metricsTopics = Set[String]() + var topicAutocreate = true + var topicDefaultPartitions = 3 + var topicDefaultReplicationFactor = 1 + var brokerList = "" + var zkQuorum = "" + var kafkaBatchSize = "16384" + var kafkaLingerMs = "5" + var kafkaAcks = "0" + var kafkaOffset = "latest" + var inputSerializerType = "" + var outputSerializerType = "" + + @OnEnabled + @throws[InitializationException] + override def init(context: ControllerServiceInitializationContext): Unit = { + super.init(context) + this.synchronized { + 
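+            // All of the settings read below come from the stream properties imported from StreamProperties:
+            // topics, serializers, broker and zookeeper addresses, plus the producer/consumer tuning used to
+            // build kafkaSinkParams and kafkaParams.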
try { + + // Define the Kafka parameters, broker list must be specified + inputTopics = context.getPropertyValue(INPUT_TOPICS).asString.split(",").toSet + outputTopics = context.getPropertyValue(OUTPUT_TOPICS).asString.split(",").toSet + errorTopics = context.getPropertyValue(ERROR_TOPICS).asString.split(",").toSet + metricsTopics = DEFAULT_METRICS_TOPIC.getValue.split(",").toSet + + inputSerializerType = context.getPropertyValue(INPUT_SERIALIZER).asString() + outputSerializerType = context.getPropertyValue(OUTPUT_SERIALIZER).asString() + + topicAutocreate = context.getPropertyValue(KAFKA_TOPIC_AUTOCREATE).asBoolean().booleanValue() + topicDefaultPartitions = context.getPropertyValue(KAFKA_TOPIC_DEFAULT_PARTITIONS).asInteger().intValue() + topicDefaultReplicationFactor = context.getPropertyValue(KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR).asInteger().intValue() + brokerList = context.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString + zkQuorum = context.getPropertyValue(KAFKA_ZOOKEEPER_QUORUM).asString + + + kafkaBatchSize = context.getPropertyValue(KAFKA_BATCH_SIZE).asString + kafkaLingerMs = context.getPropertyValue(KAFKA_LINGER_MS).asString + kafkaAcks = context.getPropertyValue(KAFKA_ACKS).asString + kafkaOffset = context.getPropertyValue(KAFKA_MANUAL_OFFSET_RESET).asString + + + kafkaSinkParams = Map( + ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokerList, + ProducerConfig.CLIENT_ID_CONFIG -> appName, + ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getCanonicalName, + ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, + ProducerConfig.ACKS_CONFIG -> kafkaAcks, + ProducerConfig.RETRIES_CONFIG -> "3", + ProducerConfig.LINGER_MS_CONFIG -> kafkaLingerMs, + ProducerConfig.BATCH_SIZE_CONFIG -> kafkaBatchSize, + ProducerConfig.RETRY_BACKOFF_MS_CONFIG -> "1000", + ProducerConfig.RECONNECT_BACKOFF_MS_CONFIG -> "1000") + + +// // TODO deprecate topic creation here (must be done through the agent) +// if (topicAutocreate) { +// val zkUtils = ZkUtils.apply(zkQuorum, 10000, 10000, JaasUtils.isZkSecurityEnabled) +// createTopicsIfNeeded(zkUtils, inputTopics, topicDefaultPartitions, topicDefaultReplicationFactor) +// createTopicsIfNeeded(zkUtils, outputTopics, topicDefaultPartitions, topicDefaultReplicationFactor) +// createTopicsIfNeeded(zkUtils, errorTopics, topicDefaultPartitions, topicDefaultReplicationFactor) +// createTopicsIfNeeded(zkUtils, metricsTopics, 1, 1) +// } + + + kafkaParams = Map[String, Object]( + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokerList, + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer], + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer], + ConsumerConfig.GROUP_ID_CONFIG -> appName, + ConsumerConfig.RECONNECT_BACKOFF_MS_CONFIG -> "50", + ConsumerConfig.RETRY_BACKOFF_MS_CONFIG -> "100", + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> kafkaOffset, + ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false", + ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG -> "30000" + /*, + ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG -> "5000"*/ + ) + + } catch { + case e: Exception => + throw new InitializationException(e) + } + } + } + + /** + * create a streaming DataFrame that represents data received + * + * @param spark + * @param streamContext + * @return DataFrame currently loaded + */ + override def read(spark: SparkSession, streamContext: StreamContext) = { + import spark.implicits._ + implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] + + 
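+        // The stream built below subscribes to the configured input topics and wraps every Kafka message
+        // into a logisland StandardRecord: the key is kept as a string field and the value as raw bytes,
+        // no deserialization happens at this stage.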
logger.info(s"starting Kafka direct stream on topics $inputTopics from $kafkaOffset offsets") + val df = spark.readStream + .format("kafka") + .option("kafka.bootstrap.servers", brokerList) + .option("subscribe", inputTopics.mkString(",")) + .load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS BINARY)") + // .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, Array[Byte])] + .map(r => { + new StandardRecord(inputTopics.head) + .setField(FieldDictionary.RECORD_KEY, FieldType.STRING, r._1) + .setField(FieldDictionary.RECORD_VALUE, FieldType.BYTES, r._2) + }) + + df + } + + /** + * Allows subclasses to register which property descriptor objects are + * supported. + * + * @return PropertyDescriptor objects this processor currently supports + */ + override def getSupportedPropertyDescriptors() = { + val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] + descriptors.add(ERROR_TOPICS) + descriptors.add(INPUT_TOPICS) + descriptors.add(OUTPUT_TOPICS) + descriptors.add(AVRO_INPUT_SCHEMA) + descriptors.add(AVRO_OUTPUT_SCHEMA) + descriptors.add(INPUT_SERIALIZER) + descriptors.add(OUTPUT_SERIALIZER) + descriptors.add(ERROR_SERIALIZER) + descriptors.add(KAFKA_TOPIC_AUTOCREATE) + descriptors.add(KAFKA_TOPIC_DEFAULT_PARTITIONS) + descriptors.add(KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR) + descriptors.add(KAFKA_METADATA_BROKER_LIST) + descriptors.add(KAFKA_ZOOKEEPER_QUORUM) + descriptors.add(KAFKA_MANUAL_OFFSET_RESET) + descriptors.add(KAFKA_BATCH_SIZE) + descriptors.add(KAFKA_LINGER_MS) + descriptors.add(KAFKA_ACKS) + descriptors.add(WINDOW_DURATION) + descriptors.add(SLIDE_DURATION) + Collections.unmodifiableList(descriptors) + } + +// /** +// * Topic creation +// * +// * @param zkUtils +// * @param topics +// * @param topicDefaultPartitions +// * @param topicDefaultReplicationFactor +// */ +// def createTopicsIfNeeded(zkUtils: ZkUtils, +// topics: Set[String], +// topicDefaultPartitions: Int, +// topicDefaultReplicationFactor: Int): Unit = { +// +// topics.foreach(topic => { +// +// if (!topic.equals(NONE_TOPIC) && !AdminUtils.topicExists(zkUtils, topic)) { +// AdminUtils.createTopic(zkUtils, topic, topicDefaultPartitions, topicDefaultReplicationFactor) +// Thread.sleep(1000) +// logger.info(s"created topic $topic with" + +// s" $topicDefaultPartitions partitions and" + +// s" $topicDefaultReplicationFactor replicas") +// } +// }) +// } + + case class RecordWrapper(record:Record) + + /** + * create a streaming DataFrame that represents data received + * + * @param streamContext + * @return DataFrame currently loaded + */ + override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext) = { + val sender = df.sparkSession.sparkContext.broadcast(KafkaSink(kafkaSinkParams)) + + import df.sparkSession.implicits._ + + // Write key-value data from a DataFrame to a specific Kafka topic specified in an option + df .map(r => { + (r.getField(FieldDictionary.RECORD_KEY).asString(), r.getField(FieldDictionary.RECORD_VALUE).asBytes()) + }) + .as[(String, Array[Byte])] + .toDF("key","value") + .writeStream + .format("kafka") + .option("kafka.bootstrap.servers", brokerList) + .option("topic", outputTopics.mkString(",")) + .option("checkpointLocation", "checkpoints") + + } + + private def getOrElse[T](record: Record, field: String, defaultValue: T): T = { + val value = record.getField(field) + if (value != null && value.isSet) { + return value.getRawValue.asInstanceOf[T] + } + 
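+    // fall back to the supplied default when the field is absent or has no value set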
defaultValue + } + + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderService.scala new file mode 100644 index 000000000..feb83d33d --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderService.scala @@ -0,0 +1,167 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.stream.spark.structured.provider + +import java.util +import java.util.Collections + +import com.hurence.logisland.annotation.documentation.CapabilityDescription +import com.hurence.logisland.annotation.lifecycle.OnEnabled +import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} +import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} +import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord} +import com.hurence.logisland.stream.StreamContext +import com.hurence.logisland.util.spark.ControllerServiceLookupSink +import com.hurence.logisland.validator.StandardValidators +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.streaming.DataStreamWriter +import org.apache.spark.sql.{Dataset, SparkSession} + +/** + * You can look at spark documentation for detail on some options : + * @author bailett + */ +@CapabilityDescription("Provide a way to read a local file as input in StructuredStream streams") +class LocalFileStructuredStreamProviderService extends AbstractControllerService with StructuredStreamProviderService { + + + val LOCAL_INPUT_PATH: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("local.input.path") + .description("the location of the directory of files to be loaded. 
All files inside the directory will be taken as input")
+    .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) //TODO directory validator
+    .required(true)
+    .build
+
+  val MAX_FILES_PER_TRIGGER: PropertyDescriptor = new PropertyDescriptor.Builder()
+    .name("max.files.per.trigger")
+    .description("maximum number of new files to be considered in every trigger (default: no max)")
+    .addValidator(StandardValidators.POSITIVE_LONG_VALIDATOR)
+    .required(false)
+    .build
+
+  val LATEST_FIRST: PropertyDescriptor = new PropertyDescriptor.Builder()
+    .name("latest.first")
+    .description("whether to process the latest new files first, useful when there is a large backlog of files (default: false)")
+    .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
+    .required(false)
+    .build
+
+  val FILENAME_ONLY: PropertyDescriptor = new PropertyDescriptor.Builder()
+    .name("filename.only")
+    .description("whether to check new files based on only the filename instead of on the full path (default: false). " +
+      "With this set to `true`, the following files would be considered as the same file, because their filenames, \"dataset.txt\", " +
+      "are the same:\n\"file:///dataset.txt\"\n\"s3://a/dataset.txt\"\n\"s3n://a/b/dataset.txt\"\n\"s3a://a/b/c/dataset.txt\"")
+    .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
+    .required(false)
+    .build
+
+  var path: String = _
+  var maxFilesPerTrigger: Option[Long] = _
+  var latestFirst: Option[Boolean] = _
+  var fileNameOnly: Option[Boolean] = _
+
+  @OnEnabled
+  @throws[InitializationException]
+  override def init(context: ControllerServiceInitializationContext): Unit = {
+    super.init(context)
+    path = context.getPropertyValue(LOCAL_INPUT_PATH).asString()
+    if (context.getPropertyValue(MAX_FILES_PER_TRIGGER).isSet) {
+      maxFilesPerTrigger = Some(context.getPropertyValue(MAX_FILES_PER_TRIGGER).asLong())
+    } else {
+      maxFilesPerTrigger = None
+    }
+    if (context.getPropertyValue(LATEST_FIRST).isSet) {
+      latestFirst = Some(context.getPropertyValue(LATEST_FIRST).asBoolean())
+    } else {
+      latestFirst = None
+    }
+    if (context.getPropertyValue(FILENAME_ONLY).isSet) {
+      fileNameOnly = Some(context.getPropertyValue(FILENAME_ONLY).asBoolean())
+    } else {
+      fileNameOnly = None
+    }
+  }
+
+  /**
+    * Allows subclasses to register which property descriptor objects are
+    * supported.
+ * + * @return PropertyDescriptor objects this processor currently supports + */ + override def getSupportedPropertyDescriptors() = { + val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] + descriptors.add(LOCAL_INPUT_PATH) + descriptors.add(MAX_FILES_PER_TRIGGER) + descriptors.add(LATEST_FIRST) + descriptors.add(FILENAME_ONLY) + Collections.unmodifiableList(descriptors) + } + + /** + * create a streaming DataFrame that represents data received + * + * @param spark + * @param streamContext + * @return DataFrame currently loaded + */ + override def read(spark: SparkSession, streamContext: StreamContext) = { + import spark.implicits._ + implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] + + val dataStreamReader = spark.readStream + .format("text") + if (maxFilesPerTrigger.isDefined) { + dataStreamReader.option("maxFilesPerTrigger", maxFilesPerTrigger.get) + } + if (latestFirst.isDefined) { + dataStreamReader.option("latestFirst", latestFirst.get) + } + if (fileNameOnly.isDefined) { + dataStreamReader.option("fileNameOnly", fileNameOnly.get) + } + dataStreamReader.load(path) + .as[String] + .map(r => { + new StandardRecord("line") + .setField(FieldDictionary.RECORD_VALUE, FieldType.STRING, r) + }) + } + + /** + * create a streaming DataFrame that represents data received + * + * @param streamContext + * @return DataFrame currently loaded + */ + override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[_] = { + throw new IllegalArgumentException("LocalFileStructuredStreamProviderService class does not support write operation yet") + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/MQTTStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/MQTTStructuredStreamProviderService.scala new file mode 100644 index 000000000..a0440517f --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/MQTTStructuredStreamProviderService.scala @@ -0,0 +1,174 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.hurence.logisland.stream.spark.structured.provider
+
+
+import java.sql.Timestamp
+import java.util
+import java.util.Collections
+
+import com.hurence.logisland.annotation.documentation.CapabilityDescription
+import com.hurence.logisland.annotation.lifecycle.OnEnabled
+import com.hurence.logisland.component.{InitializationException, PropertyDescriptor}
+import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext}
+import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord}
+import com.hurence.logisland.stream.StreamContext
+import com.hurence.logisland.stream.StreamProperties._
+import com.hurence.logisland.util.spark.ControllerServiceLookupSink
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.sql.streaming.DataStreamWriter
+import org.apache.spark.sql.{Dataset, SparkSession}
+
+@CapabilityDescription("Provides a way to use MQTT as input or output in StructuredStream streams")
+class MQTTStructuredStreamProviderService extends AbstractControllerService with StructuredStreamProviderService {
+
+
+ var brokerUrl = ""
+ var persistence = ""
+ var clientId = ""
+ var QoS = 0
+ var username = ""
+ var password = ""
+ var cleanSession = true
+ var connectionTimeout = 5000
+ var keepAlive = 30000
+ var mqttVersion = "3.1.1"
+ var topic = ""
+
+ @OnEnabled
+ @throws[InitializationException]
+ override def init(context: ControllerServiceInitializationContext): Unit = {
+ super.init(context)
+ this.synchronized {
+ try {
+
+ // Define the MQTT parameters, broker list must be specified
+ brokerUrl = context.getPropertyValue(MQTT_BROKER_URL).asString
+ persistence = context.getPropertyValue(MQTT_PERSISTENCE).asString
+ clientId = context.getPropertyValue(MQTT_CLIENTID).asString
+ QoS = context.getPropertyValue(MQTT_QOS).asInteger().intValue()
+ username = context.getPropertyValue(MQTT_USERNAME).asString
+ password = context.getPropertyValue(MQTT_PASSWORD).asString
+ cleanSession = context.getPropertyValue(MQTT_CLEAN_SESSION).asBoolean().booleanValue()
+ connectionTimeout = context.getPropertyValue(MQTT_CONNECTION_TIMEOUT).asInteger().intValue()
+ keepAlive = context.getPropertyValue(MQTT_KEEP_ALIVE).asInteger().intValue()
+ mqttVersion = context.getPropertyValue(MQTT_VERSION).asString
+ topic = context.getPropertyValue(MQTT_TOPIC).asString
+ } catch {
+ case e: Exception =>
+ throw new InitializationException(e)
+ }
+ }
+ }
+
+ /**
+ * Allows subclasses to register which property descriptor objects are
+ * supported. 
+ * + * @return PropertyDescriptor objects this processor currently supports + */ + override def getSupportedPropertyDescriptors() = { + val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] + descriptors.add(MQTT_BROKER_URL) + descriptors.add(MQTT_CLEAN_SESSION) + descriptors.add(MQTT_CLIENTID) + descriptors.add(MQTT_CONNECTION_TIMEOUT) + descriptors.add(MQTT_KEEP_ALIVE) + descriptors.add(MQTT_PASSWORD) + descriptors.add(MQTT_PERSISTENCE) + descriptors.add(MQTT_VERSION) + descriptors.add(MQTT_USERNAME) + descriptors.add(MQTT_QOS) + descriptors.add(MQTT_TOPIC) + Collections.unmodifiableList(descriptors) + } + + /** + * create a streaming DataFrame that represents data received + * + * @param spark + * @param streamContext + * @return DataFrame currently loaded + */ + override def read(spark: SparkSession, streamContext: StreamContext) = { + import spark.implicits._ + implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] + + getLogger.info("connecting to MQTT") + spark.readStream + .format("com.hurence.logisland.util.mqtt.MQTTStreamSourceProvider") + .option("topic", topic) + .option("persistence", persistence) + .option("clientId", clientId) + .option("QoS", QoS) + .option("username", username) + .option("password", password) + .option("cleanSession", cleanSession) + .option("connectionTimeout", connectionTimeout) + .option("keepAlive", keepAlive) + .option("mqttVersion", mqttVersion) + .load(brokerUrl) + .as[(String, Array[Byte], Timestamp)] + .map(r => { + new StandardRecord("kura_metric") + .setTime(r._3) + .setField(FieldDictionary.RECORD_VALUE, FieldType.BYTES, r._2) + .setField(FieldDictionary.RECORD_NAME, FieldType.STRING, r._1) + }) + + } + + + /** + * create a streaming DataFrame that represents data received + * + * @param streamContext + * @return DataFrame currently loaded + */ + override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[_] = { + + + // Create DataFrame representing the stream of input lines from connection to mqtt server + df.writeStream + .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider") + .option("topic", topic) + .option("persistence", persistence) + .option("clientId", clientId) + .option("QoS", QoS) + .option("username", username) + .option("password", password) + .option("cleanSession", cleanSession) + .option("connectionTimeout", connectionTimeout) + .option("keepAlive", keepAlive) + .option("mqttVersion", mqttVersion) + + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/RateStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/RateStructuredStreamProviderService.scala new file mode 100644 index 000000000..ebb36107c --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/RateStructuredStreamProviderService.scala @@ -0,0 +1,202 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.stream.spark.structured.provider + +import java.io.{File, FileReader} +import java.util +import java.util.Collections + +import com.hurence.logisland.annotation.documentation.CapabilityDescription +import com.hurence.logisland.annotation.lifecycle.OnEnabled +import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} +import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} +import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord} +import com.hurence.logisland.stream.StreamContext +import com.hurence.logisland.util.spark.ControllerServiceLookupSink +import com.hurence.logisland.validator.StandardValidators +import org.apache.commons.csv.CSVFormat +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.streaming.DataStreamWriter +import org.apache.spark.sql.{Dataset, SparkSession} + +import scala.collection.JavaConversions._ + +/** + * + * @author bailett + */ + +@CapabilityDescription("Generates data at the specified number of rows per second, each output row contains a timestamp and value. " + + "Where timestamp is a Timestamp type containing the time of message dispatch, and value is of Long type containing the message count, " + + "starting from 0 as the first row. This source is intended for testing and benchmarking. 
Used in StructuredStream streams.") +class RateStructuredStreamProviderService extends AbstractControllerService with StructuredStreamProviderService { + + + val LOCAL_FILE_INPUT_PATH: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("local.file.input.path") + .description("the location of the file to be loaded") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(true) + .build + + val HAS_CSV_HEADER: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("has.csv.header") + .description("Is this a csv file with the first line as a header") + .addValidator(StandardValidators.BOOLEAN_VALIDATOR) + .required(false) + .defaultValue("true") + .build + + val CSV_DELIMITER: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("csv.delimiter") + .description("the delimiter") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(false) + .defaultValue(",") + .build + + val LOCAL_FILE_OUTPUT_PATH: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("local.file.output.path") + .description("the location of the file to be writen") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(false) + .build + + + var recordSeq:Seq[Record] = _ + + @OnEnabled + @throws[InitializationException] + override def init(context: ControllerServiceInitializationContext): Unit = { + super.init(context) + this.synchronized { + try { + + val delimiter = context.getPropertyValue(CSV_DELIMITER).asString() + val path = context.getPropertyValue(LOCAL_FILE_INPUT_PATH).asString() + val f = new File(path) + + if (f.exists && !f.isDirectory) { + val in = new FileReader(path) + val csv = CSVFormat.DEFAULT.withDelimiter(delimiter.charAt(0)).withFirstRecordAsHeader + val records = csv.withHeader().withSkipHeaderRecord(false).parse(in) + recordSeq = records.map(record => { + val logislandRecord:Record = new StandardRecord() + .setField(FieldDictionary.RECORD_VALUE, FieldType.STRING, record.toString) + for (columnName <- record.toMap) { + logislandRecord.setField(columnName._1, FieldType.STRING, columnName._2) + } + logislandRecord + }).toSeq + }else{ + val resourcePath = classOf[RateStructuredStreamProviderService].getResource(path).getPath + val in = new FileReader(resourcePath) + val csv = CSVFormat.DEFAULT.withDelimiter(delimiter.charAt(0)).withFirstRecordAsHeader + val records = csv.withHeader().withSkipHeaderRecord(false).parse(in).getRecords + + recordSeq = records.map(record => { + val logislandRecord:Record = new StandardRecord() + .setField(FieldDictionary.RECORD_VALUE, FieldType.STRING, record.toString) + for (columnName <- record.toMap) { + logislandRecord.setField(columnName._1, FieldType.STRING, columnName._2) + } + logislandRecord + }).toSeq + } + + + + } catch { + case e: Exception => + throw new InitializationException(e) + } + } + } + + /** + * Allows subclasses to register which property descriptor objects are + * supported. 
+ * + * @return PropertyDescriptor objects this processor currently supports + */ + override def getSupportedPropertyDescriptors() = { + val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] + + descriptors.add(LOCAL_FILE_INPUT_PATH) + descriptors.add(LOCAL_FILE_OUTPUT_PATH) + descriptors.add(HAS_CSV_HEADER) + descriptors.add(CSV_DELIMITER) + Collections.unmodifiableList(descriptors) + } + + /** + * create a streaming DataFrame that represents data received + * + * @param spark + * @param streamContext + * @return DataFrame currently loaded + */ + override def read(spark: SparkSession, streamContext: StreamContext) = { + import spark.implicits._ + implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] + + // val headers = records.iterator.next.toMap.keySet + + + + recordSeq.toDS() + } + + /** + * create a streaming DataFrame that represents data received + * + * @param streamContext + * @return DataFrame currently loaded + */ + override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[_] = { + throw new IllegalArgumentException("RateStructuredStreamProviderService class does not support write operation"); + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala new file mode 100644 index 000000000..1fb834060 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala @@ -0,0 +1,391 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.stream.spark.structured.provider + +import java.io.{ByteArrayInputStream, ByteArrayOutputStream} +import java.util +import java.util.Date + +import com.hurence.logisland.controller.ControllerService +import com.hurence.logisland.record._ +import com.hurence.logisland.serializer.{JsonSerializer, NoopSerializer, RecordSerializer, SerializerProvider} +import com.hurence.logisland.stream.StreamContext +import com.hurence.logisland.stream.StreamProperties._ +import com.hurence.logisland.util.spark.{ControllerServiceLookupSink, ProcessorMetrics} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.groupon.metrics.UserMetricsSystem +import org.apache.spark.sql.streaming._ +import org.apache.spark.sql.{Dataset, SparkSession} +import org.slf4j.LoggerFactory + +import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ + + +trait StructuredStreamProviderService extends ControllerService { + + val logger = LoggerFactory.getLogger(this.getClass) + + + /** + * create a streaming DataFrame that represents data received + * + * @param spark + * @param streamContext + * @return DataFrame currently loaded + */ + protected def read(spark: SparkSession, streamContext: StreamContext): Dataset[Record] + + /** + * create a streaming DataFrame that represents data received + * + * @param streamContext + * @return DataFrame currently loaded + */ + protected def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[_] + + /** + * + * + * @param spark + * @param streamContext + * @return + */ + def load(spark: SparkSession, controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): Dataset[Record] = { + + import spark.implicits._ + implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] + + val df = read(spark, streamContext) + + /** + * create serializers + */ + val serializer = SerializerProvider.getSerializer( + streamContext.getPropertyValue(READ_TOPICS_SERIALIZER).asString, + streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) + + val keySerializer = SerializerProvider.getSerializer( + streamContext.getPropertyValue(READ_TOPICS_KEY_SERIALIZER).asString, + null) + + + // convert to logisland records + + val processingRecords: Dataset[Record] = df.flatMap(r => { + serializer match { + case sr: NoopSerializer => Some(r) + case _ => deserializeRecords(serializer, keySerializer, r) + } + }) + + + if (streamContext.getPropertyValue(GROUPBY).isSet) { + + val keys = streamContext.getPropertyValue(GROUPBY).asString() + val stateTimeoutDuration = streamContext.getPropertyValue(STATE_TIMEOUT_MS).asLong() + val chunkSize = streamContext.getPropertyValue(CHUNK_SIZE).asInteger() + + processingRecords + .filter(_.hasField(keys)) + .groupByKey(_.getField(keys).asString()) + .flatMapGroupsWithState(outputMode = OutputMode.Append, timeoutConf = GroupStateTimeout.ProcessingTimeTimeout())( + mappingFunction(controllerServiceLookupSink, streamContext, chunkSize, stateTimeoutDuration) + ) + + } else { + processingRecords.mapPartitions(iterator => { + executePipeline(controllerServiceLookupSink, streamContext, iterator) + }) + } + + + } + + val ALL_RECORDS = "all_records" + val CHUNK_CREATION_TS = "chunk_creation_ts" + + def mappingFunction(controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], + streamContext: StreamContext, + chunkSize: Int, + timeOutDuration: Long) + (key: String, + 
value: Iterator[Record], + state: GroupState[Record]): Iterator[Record] = { + + + val currentTimestamp = new Date().getTime + val inputRecords = value.toList + val allRecords = if (state.exists) state.get.getField(ALL_RECORDS).getRawValue.asInstanceOf[List[Record]] ++ inputRecords else inputRecords + val recordChunks = allRecords.grouped(chunkSize).toList + + + if (state.hasTimedOut || (state.exists && (currentTimestamp - state.get.getField(CHUNK_CREATION_TS).asLong()) >= timeOutDuration)) { + state.remove() + // logger.debug("TIMEOUT key " + key + ", flushing " + allRecords.size + " records in " + recordChunks.size + "chunks") + recordChunks + .flatMap(subset => executePipeline(controllerServiceLookupSink, streamContext, subset.iterator)) + .iterator + } + else if (recordChunks.last.size == chunkSize) { + state.remove() + //logger.debug("REMOVE key " + key + ", flushing " + allRecords.size + " records in " + recordChunks.size + "chunks") + recordChunks + .flatMap(subset => executePipeline(controllerServiceLookupSink, streamContext, subset.iterator)) + .iterator + } + else if (!state.exists) { + + val newChunk = new StandardRecord("chunk_record") //Chunk(key, recordChunks.last) + newChunk.setObjectField(ALL_RECORDS, recordChunks.last) + newChunk.setStringField(FieldDictionary.RECORD_KEY, key) + newChunk.setLongField(CHUNK_CREATION_TS, new Date().getTime ) + // logger.debug("CREATE key " + key + " new chunk with " + allRecords.size + " records") + + state.update(newChunk) + state.setTimeoutDuration(timeOutDuration) + + recordChunks + .slice(0, recordChunks.length - 1) + .flatMap(subset => executePipeline(controllerServiceLookupSink, streamContext, subset.iterator)) + .iterator + } + + + else { + val currentChunk = state.get + if (recordChunks.size == 1) { + currentChunk.setObjectField(ALL_RECORDS, allRecords) + state.update(currentChunk) + // logger.debug("UPDATE key " + key + ", allRecords " + allRecords.size + ", recordChunks " + recordChunks.size) + Iterator.empty + }else{ + currentChunk.setObjectField(ALL_RECORDS, recordChunks.last) + //logger.debug("UPDATE key " + key + ", allRecords " + allRecords.size + ", recordChunks " + recordChunks.size) + + state.update(currentChunk) + state.setTimeoutDuration(timeOutDuration) + + recordChunks + .slice(0, recordChunks.length - 1) + .flatMap(subset => executePipeline(controllerServiceLookupSink, streamContext, subset.iterator)) + .iterator + } + + } + + + } + + private def executePipeline(controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext, iterator: Iterator[Record]) + + = { + val controllerServiceLookup = controllerServiceLookupSink.value.getControllerServiceLookup() + + + // convert to logisland records + var processingRecords: util.Collection[Record] = iterator.toList + + val pipelineMetricPrefix = streamContext.getIdentifier + "." 
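+ // each processor of the stream's pipeline is applied in order to the current batch of records;
+ // per-processor processing time and record metrics are published under this pipeline metric prefix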
+ // loop over processor chain + streamContext.getProcessContexts.foreach(processorContext => { + val startTime = System.currentTimeMillis() + val processor = processorContext.getProcessor + + val processorTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + + processorContext.getIdentifier + ".processing_time_ms").time() + + // injects controller service lookup into processor context + if (processor.hasControllerService) { + processorContext.setControllerServiceLookup(controllerServiceLookup) + } + + // processor setup (don't forget that) + if(!processor.isInitialized) + processor.init(processorContext) + + // do the actual processing + processingRecords = processor.process(processorContext, processingRecords) + + // compute metrics + ProcessorMetrics.computeMetrics( + pipelineMetricPrefix + processorContext.getIdentifier + ".", + processingRecords, + processingRecords, + 0, + processingRecords.size, + System.currentTimeMillis() - startTime) + + processorTimerContext.stop() + }) + + + processingRecords.asScala.iterator + } + + /** + * create a streaming DataFrame that represents data received + * + * @param streamContext + * @return DataFrame currently loaded + */ + def save(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): StreamingQuery = { + + + implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] + + // make sure controller service lookup won't be serialized !! + streamContext.setControllerServiceLookup(null) + + // create serializer + val serializer = SerializerProvider.getSerializer( + streamContext.getPropertyValue(WRITE_TOPICS_SERIALIZER).asString, + streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) + + // create serializer + val keySerializer = SerializerProvider.getSerializer( + streamContext.getPropertyValue(WRITE_TOPICS_KEY_SERIALIZER).asString, null) + + // do the parallel processing + val df2 = df.mapPartitions(record => record.map(record => serializeRecords(serializer, keySerializer, record))) + + write(df2, controllerServiceLookupSink, streamContext) + .queryName(streamContext.getIdentifier) + // .outputMode("update") + .option("checkpointLocation", "checkpoints/" + streamContext.getIdentifier) + .start() + // .processAllAvailable() + + } + + + protected def serializeRecords(valueSerializer: RecordSerializer, keySerializer: RecordSerializer, record: Record) + + = { + + try { + val ret = valueSerializer match { + case s: JsonSerializer => + new StandardRecord() + .setField(FieldDictionary.RECORD_VALUE, FieldType.STRING, doSerializeAsString(valueSerializer, record)) + case _ => + new StandardRecord() + .setField(FieldDictionary.RECORD_VALUE, FieldType.BYTES, doSerialize(valueSerializer, record)) + } + val fieldKey = record.getField(FieldDictionary.RECORD_KEY); + if (fieldKey != null) { + ret.setField(FieldDictionary.RECORD_KEY, FieldType.BYTES, doSerialize(keySerializer, new StandardRecord().setField(fieldKey))) + } else { + ret.setField(FieldDictionary.RECORD_KEY, FieldType.NULL, null) + + } + ret + + } catch { + case t: Throwable => + logger.error(s"exception while serializing events ${ + t.getMessage + }") + null + } + + + } + + private def doSerializeAsString(serializer: RecordSerializer, record: Record): String + + = { + val baos: ByteArrayOutputStream = new ByteArrayOutputStream + serializer.serialize(baos, record) + val bytes = baos.toByteArray + baos.close() + new String(bytes) + + + } + + private def doSerialize(serializer: RecordSerializer, record: Record): 
Array[Byte]
+
+ = {
+ val baos: ByteArrayOutputStream = new ByteArrayOutputStream
+ serializer.serialize(baos, record)
+ val bytes = baos.toByteArray
+ baos.close()
+ bytes
+
+
+ }
+
+ private def doDeserialize(serializer: RecordSerializer, field: Field): Record
+
+ = {
+ val f = field.getRawValue
+ val s = if (f.isInstanceOf[String]) f.asInstanceOf[String].getBytes else f;
+ val bais = new ByteArrayInputStream(s.asInstanceOf[Array[Byte]])
+ try {
+ serializer.deserialize(bais)
+ } finally {
+ bais.close()
+ }
+ }
+
+ protected def deserializeRecords(serializer: RecordSerializer, keySerializer: RecordSerializer, r: Record)
+
+ = {
+ try {
+ val deserialized = doDeserialize(serializer, r.getField(FieldDictionary.RECORD_VALUE))
+ // copy root record field
+ if (r.hasField(FieldDictionary.RECORD_NAME))
+ deserialized.setField(r.getField(FieldDictionary.RECORD_NAME))
+
+ if (r.hasField(FieldDictionary.RECORD_KEY) && r.getField(FieldDictionary.RECORD_KEY).getRawValue != null) {
+ val deserializedKey = doDeserialize(keySerializer, r.getField(FieldDictionary.RECORD_KEY))
+ if (deserializedKey.hasField(FieldDictionary.RECORD_VALUE) && deserializedKey.getField(FieldDictionary.RECORD_VALUE).getRawValue != null) {
+ val f = deserializedKey.getField(FieldDictionary.RECORD_VALUE)
+ deserialized.setField(FieldDictionary.RECORD_KEY, f.getType, f.getRawValue)
+ } else {
+ logger.warn(s"Unable to deserialize key for record $r with serializer $keySerializer")
+ }
+ }
+
+ Some(deserialized)
+
+ } catch {
+ case t: Throwable =>
+ logger.error(s"exception while deserializing events ${
+ t.getMessage
+ }")
+ None
+ }
+ }
+
+
+}
diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaReporter.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaReporter.scala
new file mode 100644
index 000000000..accb2f1c6
--- /dev/null
+++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaReporter.scala
@@ -0,0 +1,224 @@
+/**
+ * Copyright (C) 2016 Hurence (support@hurence.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ */ +package com.hurence.logisland.util.kafka + +import java.util.Properties +import java.util.concurrent.TimeUnit + +import com.codahale.metrics._ +import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} +import org.json4s.JsonAST.JObject +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ +import org.slf4j.{Logger, LoggerFactory} + +import scala.collection.JavaConverters._ +import scala.language.existentials +import scala.util.{Failure, Success, Try} + +class KafkaReporter( + registry: MetricRegistry, + kafkaEndpoint: String, + kafkaTopic: String, + properties: Properties) + extends ScheduledReporter( + registry, + "kafka-reporter", + MetricFilter.ALL, + TimeUnit.SECONDS, + TimeUnit.MILLISECONDS) { + + val logger: Logger = LoggerFactory.getLogger(this.getClass) + + var producer: Option[KafkaProducer[String, String]] = None + + // Any user properties set in the metrics config file + // prodconf_foo=this.setting.key=value + // prodconf_bar=this.setting.key2=value2 + private def setUserProperties(props: Properties) { + for { + entry <- properties.entrySet().asScala + if (entry.getKey().asInstanceOf[String].startsWith("prodconf_")) + } { + val kv = entry.getValue().asInstanceOf[String].split('=') + if (kv.length != 2) { + logger.error(s"Ignoring bad prodconf_* setting: ${entry.getValue()}") + } else { + props.put(kv(0), kv(1)) + } + } + } + + override def start(period: Long, unit: TimeUnit): Unit = { + super.start(period, unit) + val status = for { + kp <- Try { + logger.info(s"Opening Kafka endpoint $kafkaEndpoint") + val props = new Properties() + + // Set these, but may be overridden in setUserProperties + props.put("client.id", (s"KafkaReporter-$kafkaEndpoint-$kafkaTopic").replace(':', '-')) + + // load any KafkaProducer conf settings passed in from metrics config + setUserProperties(props) + + // Anything here takes precedence over user settings + props.put("bootstrap.servers", kafkaEndpoint) + props.put("key.serializer", + "org.apache.kafka.common.serialization.StringSerializer") + props.put("value.serializer", + "org.apache.kafka.common.serialization.StringSerializer") + + logger.info(s"Kafka producer properties:\n$props") + + new KafkaProducer[String, String](props) + } + } yield { + kp + } + status match { + case Success(kp) => { + logger.info(s"Kafka producer connected to $kafkaEndpoint") + producer = Some(kp) + } + case Failure(err) => { + logger.error(s"Failure opening Kafka endpoint $kafkaEndpoint:\n$err") + } + } + } + + override def stop(): Unit = { + logger.info(s"Stopping Kafka reporter at $kafkaEndpoint") + super.stop() + } + + def report( + gauges: java.util.SortedMap[String, Gauge[_]], + counters: java.util.SortedMap[String, Counter], + histograms: java.util.SortedMap[String, Histogram], + meters: java.util.SortedMap[String, Meter], + timers: java.util.SortedMap[String, Timer]): Unit = { + + if (producer.isEmpty) { + logger.error(s"Failed Kafka client for $kafkaEndpoint: metric output ignored") + } else { + // dump metric output to the kafka topic + val prod = producer.get + for {entry <- gauges.entrySet().asScala} { + gaugeJSON(entry.getValue()).foreach { jv => prod.send(metricRec(entry.getKey(), jv)) } + } + for {entry <- counters.entrySet().asScala} { + counterJSON(entry.getValue()).foreach { jv => prod.send(metricRec(entry.getKey(), jv)) } + } + for {entry <- histograms.entrySet().asScala} { + histJSON(entry.getValue()).foreach { jv => prod.send(metricRec(entry.getKey(), jv)) } + } + for {entry <- meters.entrySet().asScala} { + 
meterJSON(entry.getValue()).foreach { jv => prod.send(metricRec(entry.getKey(), jv)) } + } + for {entry <- timers.entrySet().asScala} { + timerJSON(entry.getValue()).foreach { jv => prod.send(metricRec(entry.getKey(), jv)) } + } + } + } + + private def metricRec(key: String, value: String) = + new ProducerRecord[String, String](kafkaTopic, key, value) + + private def gaugeJSON(gauge: Gauge[_]): Option[String] = { + val tpe = ("type" -> "gauge") + gauge.getValue() match { + case v: Int => Some(compact(render(tpe ~ ("value" -> v)))) + case v: Long => Some(compact(render(tpe ~ ("value" -> v)))) + case v: Float => Some(compact(render(tpe ~ ("value" -> v)))) + case v: Double => Some(compact(render(tpe ~ ("value" -> v)))) + case v => { + logger.error(s"Ignoring unexpected Gauge value: $v") + None + } + } + } + + private def counterJSON(counter: Counter): Option[String] = { + val tpe = ("type" -> "counter") + Some(compact(render(tpe ~ ("value" -> counter.getCount())))) + } + + private def histJSON(hist: Histogram): Option[String] = { + for { + hsub <- samplingAST(hist, "histquantiles") + nsub <- Some(("n" -> hist.getCount())) + } yield { + compact(render(("type" -> "histogram") ~ ("value" -> (nsub ~ hsub)))) + } + } + + private def meterJSON(meter: Meter): Option[String] = { + for { + + msub <- meteredAST(meter) + nsub <- Some(("n" -> meter.getCount())) + } yield { + compact(render(("type" -> "meter") ~ ("value" -> (nsub ~ msub)))) + } + } + + private def timerJSON(timer: Timer): Option[String] = { + for { + hsub <- samplingAST(timer, "timerquantiles") + msub <- meteredAST(timer) + nsub <- Some(("n" -> timer.getCount())) + } yield { + compact(render(("type" -> "timer") ~ ("value" -> (nsub ~ hsub ~ msub)))) + } + } + + private def samplingAST(hist: Sampling, qsetting: String): Option[JObject] = { + val snapshot = hist.getSnapshot() + Try { + val hqs = Option(properties.getProperty(qsetting)).getOrElse( + "0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0") + val q = hqs.split(",").map(_.toDouble).toVector + val x = q.map { z => snapshot.getValue(z) } + (q, x) + } match { + case Failure(_) => { + val hqs = properties.getProperty(qsetting) + logger.error(s"Bad quantile setting: $hqs\nIgnoring histogram metric output") + None + } + case Success((q, x)) => { + val hsub = + ("q" -> q) ~ + ("x" -> x) ~ + ("min" -> snapshot.getMin()) ~ + ("max" -> snapshot.getMax()) ~ + ("mean" -> snapshot.getMean()) ~ + ("stdv" -> snapshot.getStdDev()) + Some(hsub) + } + } + } + + private def meteredAST(meter: Metered): Option[JObject] = { + val msub = + ("rate1" -> meter.getOneMinuteRate()) ~ + ("rate5" -> meter.getFiveMinuteRate()) ~ + ("rate15" -> meter.getFifteenMinuteRate()) ~ + ("rateMean" -> meter.getMeanRate()) + Some(msub) + } +} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaSink.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaSink.scala new file mode 100644 index 000000000..fa293413a --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaSink.scala @@ -0,0 +1,76 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * 
you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.util.kafka + +import java.io.ByteArrayOutputStream + +import com.hurence.logisland.record.{FieldDictionary, Record} +import com.hurence.logisland.serializer.RecordSerializer +import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} + +import scala.collection.JavaConversions._ + +class KafkaSink(createProducer: () => KafkaProducer[Array[Byte], Array[Byte]]) extends Serializable { + + lazy val producer = createProducer() + + + def send(topic: String, key: Array[Byte], value: Array[Byte]): Unit = + producer.send(new ProducerRecord(topic, value)) + + /** + * Send events to Kafka topics + * + * @param events + */ + def produce(topic: String, events: List[Record], serializer:RecordSerializer) = { + + // do nothing if topic name is 'none' + if (!topic.equals("none")) { + val messages = events.map(event => { + // messages are serialized with kryo first + val baos: ByteArrayOutputStream = new ByteArrayOutputStream + serializer.serialize(baos, event) + + // and then converted to KeyedMessage + val key = if (event.hasField(FieldDictionary.RECORD_ID)) + event.getField(FieldDictionary.RECORD_ID).asString() + else + "" + val message = new ProducerRecord(topic, key.getBytes(), baos.toByteArray) + baos.close() + + + producer.send(message) + }) + } + } +} + +object KafkaSink { + def apply(config: Map[String, Object]): KafkaSink = { + val f = () => { + val producer = new KafkaProducer[Array[Byte], Array[Byte]](config) + + /* sys.addShutdownHook { + producer.close() + } +*/ + producer + } + new KafkaSink(f) + } +} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MQTTStreamSource.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MQTTStreamSource.scala new file mode 100644 index 000000000..f5c390822 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MQTTStreamSource.scala @@ -0,0 +1,240 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.util.mqtt + +import java.nio.charset.Charset +import java.sql.Timestamp +import java.text.SimpleDateFormat +import java.util.Calendar +import java.util.concurrent.CountDownLatch + +import org.apache.bahir.utils.Logging +import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Source} +import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, SQLContext} +import org.eclipse.paho.client.mqttv3._ +import org.eclipse.paho.client.mqttv3.persist.{MemoryPersistence, MqttDefaultFilePersistence} + +import scala.collection.concurrent.TrieMap +import scala.collection.mutable.ArrayBuffer +import scala.util.{Failure, Success, Try} + + +object MQTTStreamConstants { + + val DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") + + val SCHEMA_DEFAULT = StructType(StructField("topic", StringType) + :: StructField("payload", BinaryType) + :: StructField("timestamp", TimestampType) :: Nil) +} + +/** + * A Text based mqtt stream source, it interprets the payload of each incoming message by converting + * the bytes to String using Charset.defaultCharset as charset. Each value is associated with a + * timestamp of arrival of the message on the source. It can be used to operate a window on the + * incoming stream. + * + * @param brokerUrl url MqttClient connects to. + * @param persistence an instance of MqttClientPersistence. By default it is used for storing + * incoming messages on disk. If memory is provided as option, then recovery on + * restart is not supported. + * @param topic topic MqttClient subscribes to. + * @param clientId clientId, this client is assoicated with. Provide the same value to recover + * a stopped client. + * @param messageParser parsing logic for processing incoming messages from Mqtt Server. + * @param sqlContext Spark provided, SqlContext. + * @param mqttConnectOptions an instance of MqttConnectOptions for this Source. + * @param qos the maximum quality of service to subscribe each topic at.Messages published at + * a lower quality of service will be received at the published QoS. Messages + * published at a higher quality of service will be received using the QoS specified + * on the subscribe. + */ +class MQTTTextStreamSource(brokerUrl: String, persistence: MqttClientPersistence, + topic: String, clientId: String, messageParser: (String, Array[Byte]) => (String, Array[Byte], Timestamp), + sqlContext: SQLContext, mqttConnectOptions: MqttConnectOptions, qos: Int) + extends Source with Logging { + + override def schema: StructType = MQTTStreamConstants.SCHEMA_DEFAULT + + private val store = new LocalMessageStore(persistence, sqlContext.sparkContext.getConf) + + private val messages = new TrieMap[Int, (String, Array[Byte], Timestamp)] + + private val initLock = new CountDownLatch(1) + + private var offset = 0 + + private var client: MqttClient = _ + + private def fetchLastProcessedOffset(): Int = { + Try(store.maxProcessedOffset) match { + case Success(x) => + log.info(s"Recovering from last stored offset $x") + x + case Failure(e) => 0 + } + } + + initialize() + private def initialize(): Unit = { + + client = new MqttClient(brokerUrl, clientId, persistence) + + val callback = new MqttCallbackExtended() { + + override def messageArrived(topic_ : String, message: MqttMessage): Unit = synchronized { + initLock.await() // Wait for initialization to complete. 
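+ // buffer the incoming message in memory under the next offset; getBatch() later moves consumed messages to the persistent store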
+ val temp = offset + 1 + messages.put(temp, messageParser(topic_, message.getPayload)) + offset = temp + log.trace(s"Message arrived, $topic_ $message") + } + + override def deliveryComplete(token: IMqttDeliveryToken): Unit = { + } + + override def connectionLost(cause: Throwable): Unit = { + log.warn("Connection to mqtt server lost.", cause) + } + + override def connectComplete(reconnect: Boolean, serverURI: String): Unit = { + log.info(s"Connect complete $serverURI. Is it a reconnect?: $reconnect") + } + } + client.setCallback(callback) + client.connect(mqttConnectOptions) + client.subscribe(topic, qos) + // It is not possible to initialize offset without `client.connect` + offset = fetchLastProcessedOffset() + initLock.countDown() // Release. + } + + /** Stop this source and free any resources it has allocated. */ + override def stop(): Unit = { + client.disconnect() + persistence.close() + client.close() + } + + /** Returns the maximum available offset for this source. */ + override def getOffset: Option[Offset] = { + if (offset == 0) { + None + } else { + Some(LongOffset(offset)) + } + } + + /** + * Returns the data that is between the offsets (`start`, `end`]. When `start` is `None` then + * the batch should begin with the first available record. This method must always return the + * same data for a particular `start` and `end` pair. + */ + override def getBatch(start: Option[Offset], end: Offset): DataFrame = synchronized { + val startIndex = start.getOrElse(LongOffset(0L)).asInstanceOf[LongOffset].offset.toInt + val endIndex = end.asInstanceOf[LongOffset].offset.toInt + val data: ArrayBuffer[(String, Array[Byte], Timestamp)] = ArrayBuffer.empty + // Move consumed messages to persistent store. + (startIndex + 1 to endIndex).foreach { id => + val element: (String, Array[Byte], Timestamp) = messages.getOrElse(id, store.retrieve(id)) + data += element + store.store(id, element) + messages.remove(id, element) + } + log.trace(s"Get Batch invoked, ${data.mkString}") + import sqlContext.implicits._ + data.toDF("topic", "payload", "timestamp") + } + +} + +class MQTTStreamSourceProvider extends StreamSourceProvider with DataSourceRegister with Logging { + + override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType], + providerName: String, parameters: Map[String, String]): (String, StructType) = { + ("mqtt", MQTTStreamConstants.SCHEMA_DEFAULT) + } + + override def createSource(sqlContext: SQLContext, metadataPath: String, + schema: Option[StructType], providerName: String, parameters: Map[String, String]): Source = { + + def e(s: String) = new IllegalArgumentException(s) + + val brokerUrl: String = parameters.getOrElse("brokerUrl", parameters.getOrElse("path", + throw e("Please provide a `brokerUrl` by specifying path or .options(\"brokerUrl\",...)"))) + + + val persistence: MqttClientPersistence = parameters.get("persistence") match { + case Some("memory") => new MemoryPersistence() + case _ => val localStorage: Option[String] = parameters.get("localStorage") + localStorage match { + case Some(x) => new MqttDefaultFilePersistence(x) + case None => new MqttDefaultFilePersistence() + } + } + + val messageParserWithTimeStamp = (x: Array[Byte]) => + (new String(x, Charset.defaultCharset()), Timestamp.valueOf( + MQTTStreamConstants.DATE_FORMAT.format(Calendar.getInstance().getTime))) + + + val messageNOPParser = (x: Array[Byte]) => (x,Timestamp.valueOf( + MQTTStreamConstants.DATE_FORMAT.format(Calendar.getInstance().getTime))) + + val messageKVParser = (topic:String, x: 
Array[Byte]) => (topic, x,Timestamp.valueOf( + MQTTStreamConstants.DATE_FORMAT.format(Calendar.getInstance().getTime))) + + // if default is subscribe everything, it leads to getting lot unwanted system messages. + val topic: String = parameters.getOrElse("topic", + throw e("Please specify a topic, by .options(\"topic\",...)")) + + val clientId: String = parameters.getOrElse("clientId", { + log.warn("If `clientId` is not set, a random value is picked up." + + "\nRecovering from failure is not supported in such a case.") + MqttClient.generateClientId()}) + + val username: Option[String] = parameters.get("username") + val password: Option[String] = parameters.get("password") + val connectionTimeout: Int = parameters.getOrElse("connectionTimeout", + MqttConnectOptions.CONNECTION_TIMEOUT_DEFAULT.toString).toInt + val keepAlive: Int = parameters.getOrElse("keepAlive", MqttConnectOptions + .KEEP_ALIVE_INTERVAL_DEFAULT.toString).toInt + val mqttVersion: Int = parameters.getOrElse("mqttVersion", MqttConnectOptions + .MQTT_VERSION_DEFAULT.toString).toInt + val cleanSession: Boolean = parameters.getOrElse("cleanSession", "false").toBoolean + val qos: Int = parameters.getOrElse("QoS", "1").toInt + + val mqttConnectOptions: MqttConnectOptions = new MqttConnectOptions() + mqttConnectOptions.setAutomaticReconnect(true) + mqttConnectOptions.setCleanSession(cleanSession) + mqttConnectOptions.setConnectionTimeout(connectionTimeout) + mqttConnectOptions.setKeepAliveInterval(keepAlive) + mqttConnectOptions.setMqttVersion(mqttVersion) + (username, password) match { + case (Some(u: String), Some(p: String)) => + mqttConnectOptions.setUserName(u) + mqttConnectOptions.setPassword(p.toCharArray) + case _ => + } + + new MQTTTextStreamSource(brokerUrl, persistence, topic, clientId, + messageKVParser, sqlContext, mqttConnectOptions, qos) + } + + override def shortName(): String = "mqtt" +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MessageStore.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MessageStore.scala new file mode 100644 index 000000000..6228e2901 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MessageStore.scala @@ -0,0 +1,109 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.hurence.logisland.util.mqtt + +import java.nio.ByteBuffer +import java.util + +import org.apache.bahir.utils.Logging +import org.apache.spark.SparkConf +import org.apache.spark.serializer.{JavaSerializer, Serializer, SerializerInstance} +import org.eclipse.paho.client.mqttv3.{MqttClientPersistence, MqttPersistable, MqttPersistenceException} + +import scala.reflect.ClassTag + + +/** A message store for MQTT stream source for SQL Streaming. */ +trait MessageStore { + + /** Store a single id and corresponding serialized message */ + def store[T: ClassTag](id: Int, message: T): Boolean + + /** Retrieve messages corresponding to certain offset range */ + def retrieve[T: ClassTag](start: Int, end: Int): Seq[T] + + /** Retrieve message corresponding to a given id. */ + def retrieve[T: ClassTag](id: Int): T + + /** Highest offset we have stored */ + def maxProcessedOffset: Int + +} + +private[mqtt] class MqttPersistableData(bytes: Array[Byte]) extends MqttPersistable { + + override def getHeaderLength: Int = bytes.length + + override def getHeaderOffset: Int = 0 + + override def getPayloadOffset: Int = 0 + + override def getPayloadBytes: Array[Byte] = null + + override def getHeaderBytes: Array[Byte] = bytes + + override def getPayloadLength: Int = 0 +} + +/** + * A message store to persist messages received. This is not intended to be thread safe. + * It uses `MqttDefaultFilePersistence` for storing messages on disk locally on the client. + */ +private[mqtt] class LocalMessageStore(val persistentStore: MqttClientPersistence, + val serializer: Serializer) extends MessageStore with Logging { + + val classLoader = Thread.currentThread.getContextClassLoader + + def this(persistentStore: MqttClientPersistence, conf: SparkConf) = + this(persistentStore, new JavaSerializer(conf)) + + val serializerInstance: SerializerInstance = serializer.newInstance() + + private def get(id: Int) = { + persistentStore.get(id.toString).getHeaderBytes + } + + import scala.collection.JavaConverters._ + + def maxProcessedOffset: Int = { + val keys: util.Enumeration[_] = persistentStore.keys() + keys.asScala.map(x => x.toString.toInt).max + } + + /** Store a single id and corresponding serialized message */ + override def store[T: ClassTag](id: Int, message: T): Boolean = { + val bytes: Array[Byte] = serializerInstance.serialize(message).array() + try { + persistentStore.put(id.toString, new MqttPersistableData(bytes)) + true + } catch { + case e: MqttPersistenceException => log.warn(s"Failed to store message Id: $id", e) + false + } + } + + /** Retrieve messages corresponding to certain offset range */ + override def retrieve[T: ClassTag](start: Int, end: Int): Seq[T] = { + (start until end).map(x => retrieve(x)) + } + + /** Retrieve message corresponding to a given id. 
*/ + override def retrieve[T: ClassTag](id: Int): T = { + serializerInstance.deserialize(ByteBuffer.wrap(get(id)), classLoader) + } + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/ControllerServiceLookupSink.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/ControllerServiceLookupSink.scala new file mode 100644 index 000000000..702900c60 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/ControllerServiceLookupSink.scala @@ -0,0 +1,52 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.util.spark + +import java.util +import java.util.Objects._ + +import com.hurence.logisland.component.PropertyDescriptor +import com.hurence.logisland.config.ControllerServiceConfiguration +import com.hurence.logisland.controller._ + + +class ControllerServiceLookupSink(createControllerServiceLookup: () => ControllerServiceLookup) extends Serializable { + + lazy val controllerServiceLookup = createControllerServiceLookup() + + + def getControllerServiceLookup(): ControllerServiceLookup = controllerServiceLookup + + def getControllerService(serviceIdentifier: String): ControllerService = + controllerServiceLookup.getControllerService(serviceIdentifier) + + def addControllerService(serviceIdentifier: String, controllerService: ControllerService, properties: Map[PropertyDescriptor, String]) { + requireNonNull(controllerService) + } + + +} + +object ControllerServiceLookupSink { + def apply(configs: util.Collection[ControllerServiceConfiguration]): ControllerServiceLookupSink = { + val f = () => { + new StandardControllerServiceLookup(configs) + + + } + new ControllerServiceLookupSink(f) + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/SparkUtils.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/SparkUtils.scala new file mode 100644 index 000000000..b6691a424 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/SparkUtils.scala @@ -0,0 +1,272 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (bailet.thomas@gmail.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + Copyright 2016 Hurence + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.hurence.logisland.util.spark + +import java.text.SimpleDateFormat +import java.util +import java.util.Date + +import com.hurence.logisland.record._ +import com.typesafe.scalalogging.slf4j.LazyLogging +import org.apache.avro.Schema +import org.apache.avro.Schema.Type +import org.apache.log4j.{Level, Logger} +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.apache.spark.{SparkConf, SparkContext} + +/** + * Created by tom on 11/06/15. 
+ */ + +object SparkUtils extends LazyLogging { + + + def initContext(appName: String, + blockInterval: String = "", + maxRatePerPartition: String = "", + master: String = ""): SparkContext = { + + // job configuration + val conf = new SparkConf() + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + if (maxRatePerPartition.nonEmpty) { + conf.set("spark.streaming.kafka.maxRatePerPartition", maxRatePerPartition) + } + if (blockInterval.nonEmpty) { + conf.set("spark.streaming.blockInterval", blockInterval) + } + conf.set("spark.streaming.backpressure.enabled", "true") + conf.set("spark.streaming.unpersist", "false") + conf.set("spark.ui.port", "4050") + conf.setAppName(appName) + + if (master.nonEmpty) { + conf.setMaster(master) + } + + val sc = new SparkContext(conf) + + logger.info(s"spark context initialized with master:$master, appName:$appName, " + + s"blockInterval:$blockInterval, maxRatePerPartition:$maxRatePerPartition") + + sc + } + + + /** + * Get a file and a schema and convert this to a dataframe + * + * @param schema + * @param filePath + * @param tableName + */ + def registerDataFrame( + schema: String, + filePath: String, + tableName: String, + sc: SparkContext, + sqlContext: SQLContext, + separator: String = "\u0001"): DataFrame = { + // Generate the schema based on the string of schema + val parsedSchema = StructType(schema.split(" ").map(fieldName => StructField(fieldName, StringType, true))) + + // Convert records of the RDD (people) to Rows. + val schemaLength = schema.split(" ").length + val rawRDD = sc.textFile(filePath) + .map(_.split(separator)) + .filter(_.length == schemaLength) + .map(tokens => Row.fromSeq(tokens)) + + // Apply the schema to the RDD. + val dataFrame = sqlContext.createDataFrame(rawRDD, parsedSchema) + + // Register the DataFrames as a table. 
+    dataFrame.createOrReplaceTempView(tableName)
+    dataFrame
+  }
+
+
+  def registerUdfs(sqlContext: SQLContext) = {
+
+
+    sqlContext.udf.register("timestamp", (date: String) => {
+      try {
+        val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S")
+
+        sdf.parse(date).getTime
+      } catch {
+        case e: Exception => 0
+      }
+    })
+
+  }
+
+  /**
+    * convert a Record to a SQL Row
+    *
+    * @param record the Record to convert
+    * @return the Spark SQL Row
+    */
+  def convertToRow(record: Record, schema: StructType): Row = {
+
+    Row.fromSeq(schema.map(structField => {
+      val fieldName = structField.name
+
+      if (record.hasField(fieldName)) {
+        structField.dataType match {
+          case DataTypes.StringType =>
+            if (record.getField(fieldName).getType == FieldType.ARRAY)
+              record.getField(fieldName).getRawValue.asInstanceOf[util.ArrayList[String]].toArray.mkString
+            else
+              record.getField(fieldName).asString()
+          case DataTypes.IntegerType => record.getField(fieldName).asInteger()
+          case DataTypes.LongType => record.getField(fieldName).asLong()
+          case DataTypes.FloatType => record.getField(fieldName).asFloat()
+          case DataTypes.DoubleType => record.getField(fieldName).asDouble()
+          case _ => record.getField(fieldName).asString()
+        }
+      } else {
+        null
+      }
+
+
+    }))
+  }
+
+  /**
+    * convert a SQL Row to a Record
+    *
+    * @param row the Row to convert
+    * @return the Record
+    */
+  def convertToRecord(row: Row, inRecordType: String = "logisland_record"): Record = {
+
+    var recordType = inRecordType
+    var recordTime = new Date().getTime
+    val fields = row.schema.map(structField => {
+      val fieldName = structField.name
+
+      structField.dataType match {
+        case DataTypes.StringType =>
+          if (fieldName == FieldDictionary.RECORD_TYPE) {
+            recordType = row.getAs[String](fieldName)
+          }
+          new Field(fieldName, FieldType.STRING, row.getAs[String](fieldName))
+        case DataTypes.IntegerType => new Field(fieldName, FieldType.INT, row.getAs[Int](fieldName))
+        case DataTypes.LongType =>
+          if (fieldName == FieldDictionary.RECORD_TIME) {
+            recordTime = row.getAs[Long](fieldName)
+          }
+          new Field(fieldName, FieldType.LONG, row.getAs[Long](fieldName))
+        case DataTypes.FloatType => new Field(fieldName, FieldType.FLOAT, row.getAs[Float](fieldName))
+        case DataTypes.DoubleType => new Field(fieldName, FieldType.DOUBLE, row.getAs[Double](fieldName))
+        case _ => new Field(fieldName, FieldType.STRING, row.getAs[String](fieldName))
+      }
+
+    })
+
+    // construct new Record with type and time from the row
+    val outputRecord = new StandardRecord()
+      .setType(recordType)
+      .setTime(new Date(recordTime))
+    fields.foreach(field => outputRecord.setField(field))
+    outputRecord
+  }
+
+  /**
+    * create a dataframe schema from a Record
+    *
+    * @param record the Record to infer the schema from
+    * @return the schema
+    */
+  def convertFieldsNameToSchema(record: Record): StructType = {
+    StructType(
+      record.getAllFieldsSorted.toArray(Array[Field]()).map(f => {
+        f.getType match {
+          case FieldType.INT => StructField(f.getName, DataTypes.IntegerType, nullable = true)
+          case FieldType.LONG => StructField(f.getName, DataTypes.LongType, nullable = true)
+          case FieldType.FLOAT => StructField(f.getName, DataTypes.FloatType, nullable = true)
+          case FieldType.DOUBLE => StructField(f.getName, DataTypes.DoubleType, nullable = true)
+          case FieldType.STRING => StructField(f.getName, DataTypes.StringType, nullable = true)
+          case _ => StructField(f.getName, DataTypes.StringType, nullable = true)
+        }
+      })
+    )
+  }
+
+  /**
+    * create a dataframe schema from an Avro one
+    *
+    * @param avroSchema the Avro Schema
+    * @return the schema
+    */
+  def convertAvroSchemaToDataframeSchema(avroSchema: Schema): StructType = {
+    val types = avroSchema.getFields.toArray(Array[Schema.Field]())
+      .map(s => {
+        (s.name(),
+          s.schema()
+            .getTypes
+            .toArray(Array[Schema]())
+            .filter(t => t.getType != Type.NULL)
+            .toList
+            .head)
+      })
+
+    StructType(types.map(f => {
+      f._2.getType match {
+        case Type.INT => StructField(f._1, DataTypes.IntegerType, nullable = true)
+        case Type.LONG => StructField(f._1, DataTypes.LongType, nullable = true)
+        case Type.FLOAT => StructField(f._1, DataTypes.FloatType, nullable = true)
+        case Type.DOUBLE => StructField(f._1, DataTypes.DoubleType, nullable = true)
+        case Type.STRING => StructField(f._1, DataTypes.StringType, nullable = true)
+        case _ => StructField(f._1, DataTypes.StringType, nullable = true)
+      }
+    })
+    )
+  }
+}
diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/org/apache/spark/metrics/sink/KafkaSink.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/org/apache/spark/metrics/sink/KafkaSink.scala
new file mode 100644
index 000000000..ea0adbd78
--- /dev/null
+++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/org/apache/spark/metrics/sink/KafkaSink.scala
@@ -0,0 +1,91 @@
+/**
+ * Copyright (C) 2016 Hurence (support@hurence.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.spark.metrics.sink + +import java.util.{Locale, Properties} +import java.util.concurrent.TimeUnit + +import org.slf4j.Logger +import org.slf4j.LoggerFactory +import com.codahale.metrics.MetricRegistry +import com.hurence.logisland.util.kafka.KafkaReporter +import org.apache.spark.SecurityManager + +/** + * A Kafka metric sink for Apache Spark + + +Configure your spark metrics.properties file + +Edit /path/to/spark/conf/metrics.properites to look like this: + +master.source.jvm.class=org.apache.spark.metrics.source.JvmSource +worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource +driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource +executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource + + *.sink.kafka.class=org.apache.spark.metrics.sink.KafkaSink + *.sink.kafka.broker=127.0.0.1:9092 + *.sink.kafka.topic=test + *.sink.kafka.period=10 + *.sink.kafka.unit=seconds + +# histquantiles and timerquantiles have following defaults: +#*.sink.kafka.histquantiles=0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0 +#*.sink.kafka.timerquantiles=0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0 + +# These carry configure settings to the KafkaProducer +# *.sink.kafka.prodconf_xxx, where xxx can be anything, just has to +# be unique per setting: + *.sink.kafka.prodconf_a=retries=0 + *.sink.kafka.prodconf_b=acks=all + *.sink.kafka.prodconf_c=request.timeout.ms=5 + *.sink.kafka.prodconf_d=max.block.ms=5 + + + */ +class KafkaSink(val properties: Properties, val registry: MetricRegistry, + securityMgr: SecurityManager) extends org.apache.spark.metrics.sink.Sink { + + val logger: Logger = LoggerFactory.getLogger(this.getClass) + + private def popt(prop: String): Option[String] = + Option(properties.getProperty(prop)) + + // These are non-negotiable + val broker = popt("broker").get + val topic = popt("topic").get + + lazy val reporter = new KafkaReporter(registry, broker, topic, properties) + + def start(): Unit = { + logger.info(s"Starting Kafka metric reporter at $broker, topic $topic") + val period = popt("period").getOrElse("10").toLong + val tstr = popt("unit").getOrElse("seconds").toUpperCase(Locale.ROOT) + val tunit = TimeUnit.valueOf(tstr) + reporter.start(period, tunit) + } + + def stop(): Unit = { + logger.info(s"Stopping Kafka metric reporter at $broker, topic $topic") + reporter.stop() + } + + def report(): Unit = { + logger.info(s"Reporting metrics to Kafka reporter at $broker, topic $topic") + reporter.report() + } +} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/KafkaConnectTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/KafkaConnectTest.java new file mode 100644 index 000000000..2ad335080 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/KafkaConnectTest.java @@ -0,0 +1,84 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.connect; + +import com.hurence.logisland.component.ComponentFactory; +import com.hurence.logisland.config.ConfigReader; +import com.hurence.logisland.config.LogislandConfiguration; +import com.hurence.logisland.engine.EngineContext; +import com.hurence.logisland.util.runner.TestRunner; +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Optional; +import java.util.Scanner; + + +/** + * End to end test. + */ +public class KafkaConnectTest { + private static Logger logger = LoggerFactory.getLogger(KafkaConnectTest.class); + + private static final String JOB_CONF_FILE = "/conf/kafka-connect-stream.yml"; + + @Test + @Ignore + public void remoteTest() { + + + logger.info("starting StreamProcessingRunner"); + + Optional engineInstance = Optional.empty(); + try { + + String configFile = KafkaConnectTest.class.getResource(JOB_CONF_FILE).getPath(); + + // load the YAML config + LogislandConfiguration sessionConf = ConfigReader.loadConfig(configFile); + + // instantiate engine and all the processor from the config + engineInstance = ComponentFactory.getEngineContext(sessionConf.getEngine()); + assert engineInstance.isPresent(); + assert engineInstance.get().isValid(); + + logger.info("starting Logisland session version {}", sessionConf.getVersion()); + logger.info(sessionConf.getDocumentation()); + } catch (Exception e) { + logger.error("unable to launch runner : {}", e); + } + + try { + // start the engine + EngineContext engineContext = engineInstance.get(); + engineInstance.get().getEngine().start(engineContext); + new Scanner(System.in).nextLine(); + } catch (Exception e) { + Assert.fail("something went bad while running the job : " + e); + + } + + + + + + + } + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/converter/LogIslandRecordConverterTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/converter/LogIslandRecordConverterTest.java new file mode 100644 index 000000000..1618799b1 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/converter/LogIslandRecordConverterTest.java @@ -0,0 +1,129 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.connect.converter; + +import com.hurence.logisland.record.Field; +import com.hurence.logisland.record.FieldDictionary; +import com.hurence.logisland.record.Record; +import com.hurence.logisland.serializer.BytesArraySerializer; +import com.hurence.logisland.serializer.KryoSerializer; +import com.hurence.logisland.serializer.RecordSerializer; +import com.hurence.logisland.serializer.SerializerProvider; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; +import org.junit.Test; + +import java.io.ByteArrayInputStream; +import java.util.*; + +import static org.junit.Assert.*; + +public class LogIslandRecordConverterTest { + + private LogIslandRecordConverter setupInstance(Class serializerClass, boolean isKey) { + final LogIslandRecordConverter instance = new LogIslandRecordConverter(); + instance.configure( + Collections.singletonMap(LogIslandRecordConverter.PROPERTY_RECORD_SERIALIZER, serializerClass.getCanonicalName()), + isKey); + return instance; + } + + private void assertFieldEquals(Record record, String fieldName, Object expected) { + Field field = record.getField(fieldName); + if (expected == null) { + assertNull(field); + } else { + assertNotNull(field); + assertEquals(expected, field.getRawValue()); + } + } + + private void assertFieldEquals(Record record, String fieldName, byte[] expected) { + Field field = record.getField(fieldName); + if (expected == null) { + assertNull(field); + } else { + assertNotNull(field); + assertArrayEquals(expected, (byte[]) field.getRawValue()); + } + } + + + @Test + public void testBytesSchema() { + byte[] data = new byte[16]; + new Random().nextBytes(data); + RecordSerializer serializer = new BytesArraySerializer(); + LogIslandRecordConverter instance = setupInstance(serializer.getClass(), false); + byte[] serialized = instance.fromConnectData("", Schema.BYTES_SCHEMA, data); + Record record = serializer.deserialize(new ByteArrayInputStream(serialized)); + assertNotNull(record); + assertFieldEquals(record, FieldDictionary.RECORD_VALUE, data); + } + + @Test + public void testComplexSchema() { + //our schema + + final Schema complexSchema = SchemaBuilder + .struct() + .field("f1", SchemaBuilder.bool()) + .field("f2", SchemaBuilder.string()) + .field("f3", SchemaBuilder.int8()) + .field("f4", SchemaBuilder.int16()) + .field("f5", SchemaBuilder.string().optional()) + .field("f6", SchemaBuilder.float32()) + .field("arr", SchemaBuilder.array(SchemaBuilder.int32())) + .field("map", SchemaBuilder.map(SchemaBuilder.string(), SchemaBuilder.string())) + .field("struct", SchemaBuilder.struct() + .field("child", SchemaBuilder.string()).build()) + .build(); + + //setup converters + LogIslandRecordConverter instance = setupInstance(KryoSerializer.class, false); + RecordSerializer serializer = SerializerProvider.getSerializer(KryoSerializer.class.getName(), null); + Struct complex = new Struct(complexSchema) + .put("f1", true) + .put("f2", "test") + .put("f3", (byte) 0) + .put("f4", (short) 1) + .put("f5", null) + .put("f6", 3.1415f) + .put("arr", new ArrayList<>(Arrays.asList(0, 1, 2))) + .put("map", new HashMap<>(Collections.singletonMap("key", "value"))) + .put("struct", + new Struct(complexSchema.field("struct").schema()) + .put("child", "child")); + + Record record = serializer.deserialize(new 
ByteArrayInputStream(instance.fromConnectData(null, complexSchema, complex))); + System.out.println(record); + //assertions + assertNotNull(record); + Record extracted = record.getField(FieldDictionary.RECORD_VALUE).asRecord(); + assertNotNull(extracted); + assertFieldEquals(extracted, "f1", true); + assertFieldEquals(extracted, "f2", "test"); + assertFieldEquals(extracted, "f3", (byte) 0); + assertFieldEquals(extracted, "f4", (short) 1); + assertFieldEquals(extracted, "f5", null); + assertFieldEquals(extracted, "f6", (float) 3.1415); + assertFieldEquals(extracted, "arr", new ArrayList<>(Arrays.asList(0, 1, 2))); + assertFieldEquals(extracted, "map", new HashMap<>(Collections.singletonMap("key", "value"))); + //assertFieldEquals(((Map)extracted.getField("struct").getRawValue()).get("child"), "child", "child"); + + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/FakeConnector.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/FakeConnector.java new file mode 100644 index 000000000..0e15152f9 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/FakeConnector.java @@ -0,0 +1,116 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.connect.fake; + +import org.apache.commons.lang3.RandomStringUtils; +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.connect.connector.Task; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.source.SourceConnector; +import org.apache.kafka.connect.source.SourceRecord; +import org.apache.kafka.connect.source.SourceTask; + +import java.util.*; +import java.util.concurrent.SynchronousQueue; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +public class FakeConnector extends SourceConnector { + + + public static class FakeTask extends SourceTask { + + private SynchronousQueue queue = new SynchronousQueue<>(); + private final Timer timer = new Timer(); + + + @Override + public void start(Map props) { + + + } + + @Override + public List poll() throws InterruptedException { + Random random = new Random(); + + return IntStream.range(0, 1000).mapToObj(i -> { + int p = random.nextInt(10); + Schema schema = SchemaBuilder.struct() + .field("partition", SchemaBuilder.int32()) + .field("val", SchemaBuilder.string()) + .build(); + return new SourceRecord( + Collections.singletonMap("partition", p), + Collections.singletonMap("offset", System.currentTimeMillis()), + "", + null, + schema, + new Struct(schema) + .put("partition", p) + .put("val", RandomStringUtils.randomAscii(30))); + } + ).collect(Collectors.toList()); + } + + + @Override + public void stop() { + timer.cancel(); + } + + @Override + public String version() { + return "1.0"; + } + + } + + @Override + public String version() { + return "1.0"; + } + + @Override + public void start(Map props) { + } + + @Override + public Class taskClass() { + return FakeTask.class; + } + + @Override + public List> taskConfigs(int maxTasks) { + List> ret = new ArrayList<>(); + for (int i = 0; i < maxTasks; i++) { + ret.add(Collections.emptyMap()); + } + return ret; + } + + @Override + public void stop() { + + } + + @Override + public ConfigDef config() { + return new ConfigDef(); + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSink.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSink.java new file mode 100644 index 000000000..c0d557489 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSink.java @@ -0,0 +1,57 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.connect.fake; + +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.connect.connector.Task; +import org.apache.kafka.connect.sink.SinkConnector; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +public class TestSink extends SinkConnector { + + @Override + public String version() { + return null; + } + + @Override + public void start(Map props) { + + } + + @Override + public Class taskClass() { + return TestSinkTask.class; + } + + @Override + public List> taskConfigs(int maxTasks) { + return Collections.singletonList(Collections.emptyMap()); + } + + @Override + public void stop() { + + } + + @Override + public ConfigDef config() { + return new ConfigDef(); + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSinkTask.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSinkTask.java new file mode 100644 index 000000000..e874dc1f8 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSinkTask.java @@ -0,0 +1,55 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.connect.fake; + +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.connect.sink.SinkRecord; +import org.apache.kafka.connect.sink.SinkTask; + +import java.util.Collection; +import java.util.Map; + +public class TestSinkTask extends SinkTask { + + @Override + public void start(Map props) { + System.out.println("Task started"); + } + + @Override + public void put(Collection records) { + + System.out.println("Adding " + records.size() + " records"); + records.stream().findFirst().ifPresent(System.out::println); + } + + @Override + public void flush(Map offsets) { + System.out.println("Flushed offset: " +offsets); + } + + @Override + public void stop() { + System.out.println("Task stopped"); + + } + + @Override + public String version() { + return ""; + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/AbstractStreamProcessingIntegrationTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/AbstractStreamProcessingIntegrationTest.java new file mode 100644 index 000000000..fb01c96bf --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/AbstractStreamProcessingIntegrationTest.java @@ -0,0 +1,246 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.engine; + +import com.hurence.logisland.record.Record; +import com.hurence.logisland.serializer.KryoSerializer; +import com.hurence.logisland.stream.StreamProperties; +import kafka.admin.RackAwareMode; +import kafka.server.KafkaConfig; +import kafka.server.KafkaServer; +import kafka.utils.*; +import kafka.zk.AdminZkClient; +import kafka.zk.EmbeddedZookeeper; +import kafka.zk.KafkaZkClient; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.utils.Time; +import org.junit.After; +import org.junit.Before; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.InetAddress; +import java.net.ServerSocket; +import java.nio.file.Files; +import java.util.*; + +import static org.junit.Assert.assertTrue; + +/** + * Abstract class for integration testing + */ +public abstract class AbstractStreamProcessingIntegrationTest { + + + protected static final String ZKHOST = "127.0.0.1"; + protected static final String BROKERHOST = "127.0.0.1"; + protected static final int BROKERPORT = choosePorts(2)[0]; + protected static final String INPUT_TOPIC = "mock_in"; + protected static final String OUTPUT_TOPIC = "mock_out"; + + private static Logger logger = LoggerFactory.getLogger(AbstractStreamProcessingIntegrationTest.class); + + private static KafkaProducer producer; + private static KafkaConsumer consumer; + private static ProcessingEngine engine; + private static EngineContext engineContext; + protected EmbeddedZookeeper zkServer; + private KafkaServer kafkaServer; + private KafkaZkClient kafkaZkClient; + private AdminZkClient adminZkClient; + + /** + * Choose a number of random available ports + */ + public static int[] choosePorts(int count) { + try { + ServerSocket[] sockets = new ServerSocket[count]; + int[] ports = new int[count]; + for (int i = 0; i < count; i++) { + sockets[i] = new ServerSocket(0, 0, InetAddress.getByName("0.0.0.0")); + ports[i] = sockets[i].getLocalPort(); + } + for (int i = 0; i < count; i++) + sockets[i].close(); + return ports; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Before + public void setUp() throws InterruptedException, IOException { + + // setup Zookeeper + zkServer = new EmbeddedZookeeper(); + String zkConnect = ZKHOST + ":" + zkServer.port(); + + // setup Broker + Properties brokerProps = new Properties(); + brokerProps.setProperty("zookeeper.connect", zkConnect); + brokerProps.setProperty("broker.id", "0"); + brokerProps.setProperty("log.dirs", Files.createTempDirectory("kafka-").toAbsolutePath().toString()); + brokerProps.setProperty("listeners", "PLAINTEXT://" + BROKERHOST + ":" + BROKERPORT); + KafkaConfig config = new KafkaConfig(brokerProps); + Time mock = new MockTime(); + kafkaServer = TestUtils.createServer(config, mock); + kafkaZkClient = kafkaServer.zkClient(); + adminZkClient = new AdminZkClient(kafkaZkClient); + + // create topics + if (!kafkaZkClient.topicExists(StreamProperties.DEFAULT_ERRORS_TOPIC().getValue())) + adminZkClient.createTopic( + StreamProperties.DEFAULT_ERRORS_TOPIC().getValue(), + 1, + 1, + new Properties(), + RackAwareMode.Disabled$.MODULE$); + if 
(!kafkaZkClient.topicExists(StreamProperties.DEFAULT_RECORDS_TOPIC().getValue())) + adminZkClient.createTopic(StreamProperties.DEFAULT_RECORDS_TOPIC().getValue(), 1, 1, new Properties(), RackAwareMode.Disabled$.MODULE$); + if (!kafkaZkClient.topicExists(StreamProperties.DEFAULT_RAW_TOPIC().getValue())) + adminZkClient.createTopic(StreamProperties.DEFAULT_RAW_TOPIC().getValue(), 1, 1, new Properties(), RackAwareMode.Disabled$.MODULE$); + if (!kafkaZkClient.topicExists(StreamProperties.DEFAULT_METRICS_TOPIC().getValue())) + adminZkClient.createTopic(StreamProperties.DEFAULT_METRICS_TOPIC().getValue(), 1, 1, new Properties(), RackAwareMode.Disabled$.MODULE$); + + + // deleting zookeeper information to make sure the consumer starts from the beginning + adminZkClient.deleteTopic("/consumers/group0"); + +/* + File checkpointDir = new File("checkpoints"); + if (checkpointDir.isDirectory()) + FileUtils.forceDelete(checkpointDir); +*/ + + Optional instance = getEngineContext(); + assertTrue(instance.isPresent()); + assertTrue(instance.get().isValid()); + engine = instance.get().getEngine(); + engineContext = instance.get(); + + + + System.setProperty("hadoop.home.dir", "/"); + + Runnable testRunnable = () -> { + engine.start(engineContext); + }; + + Thread t = new Thread(testRunnable); + logger.info("starting engine thread {}", t.getId()); + t.start(); + + } + + @After + public void tearDown() throws NoSuchFieldException, IllegalAccessException, InterruptedException { + + engine.shutdown(engineContext); + Thread.sleep(2000); + + if (kafkaServer != null) { + kafkaServer.shutdown(); + // Remove any persistent data + CoreUtils.delete(kafkaServer.config().logDirs()); + } + + if (kafkaZkClient.topicExists(StreamProperties.DEFAULT_ERRORS_TOPIC().getValue())) + adminZkClient.deleteTopic(StreamProperties.DEFAULT_ERRORS_TOPIC().getValue()); + if (kafkaZkClient.topicExists(StreamProperties.DEFAULT_RECORDS_TOPIC().getValue())) + adminZkClient.deleteTopic(StreamProperties.DEFAULT_RECORDS_TOPIC().getValue()); + if (kafkaZkClient.topicExists(StreamProperties.DEFAULT_RAW_TOPIC().getValue())) + adminZkClient.deleteTopic(StreamProperties.DEFAULT_RAW_TOPIC().getValue()); + if (kafkaZkClient.topicExists(StreamProperties.DEFAULT_METRICS_TOPIC().getValue())) + adminZkClient.deleteTopic(StreamProperties.DEFAULT_METRICS_TOPIC().getValue()); + + if (zkServer != null) { + zkServer.shutdown(); + } + } + + + abstract Optional getEngineContext(); + + + protected static void sendRecord(String topic, Record record) throws IOException { + + // setup producer + Properties producerProps = new Properties(); + producerProps.setProperty("bootstrap.servers", BROKERHOST + ":" + BROKERPORT); + producerProps.setProperty("key.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); + producerProps.setProperty("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); + producer = new KafkaProducer(producerProps); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final KryoSerializer kryoSerializer = new KryoSerializer(true); + kryoSerializer.serialize(baos, record); + ProducerRecord data = new ProducerRecord<>(topic, null, baos.toByteArray()); + producer.send(data); + baos.close(); + + logger.info("sent record : " + record + " to topic " + topic); + producer.close(); + } + + protected static List readRecords(String topic) { + + + // setup consumer + Properties consumerProps = new Properties(); + consumerProps.setProperty("bootstrap.servers", BROKERHOST + ":" + BROKERPORT); + 
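+        // fixed group and client ids identify the test consumer; the "earliest" offset reset below ensures records produced before the subscription are also read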
consumerProps.setProperty("group.id", "group0"); + consumerProps.setProperty("client.id", "consumer0"); + consumerProps.setProperty("key.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer"); + consumerProps.setProperty("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer"); + consumerProps.put("auto.offset.reset", "earliest"); // to make sure the consumer starts from the beginning of the topic + consumer = new KafkaConsumer<>(consumerProps); + consumer.subscribe(Arrays.asList(topic)); + + + List outputRecords = new ArrayList<>(); + + // starting consumer + ConsumerRecords records = consumer.poll(1000); + + // verify the integrity of the retrieved event + for (ConsumerRecord record : records) { + final KryoSerializer deserializer = new KryoSerializer(true); + + ByteArrayInputStream bais = new ByteArrayInputStream(record.value()); + Record deserializedRecord = deserializer.deserialize(bais); + logger.info(deserializedRecord.toString()); + outputRecords.add(deserializedRecord); + try { + bais.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + consumer.close(); + + return outputRecords; + } + + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/ProgrammaticStreamProcessingIntegrationTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/ProgrammaticStreamProcessingIntegrationTest.java new file mode 100644 index 000000000..fa1a74fcb --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/ProgrammaticStreamProcessingIntegrationTest.java @@ -0,0 +1,170 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.engine; + +import com.hurence.logisland.component.ComponentFactory; +import com.hurence.logisland.component.ComponentType; +import com.hurence.logisland.config.EngineConfiguration; +import com.hurence.logisland.config.ProcessorConfiguration; +import com.hurence.logisland.config.StreamConfiguration; +import com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine; +import com.hurence.logisland.stream.StreamProperties; +import com.hurence.logisland.util.runner.MockProcessor; +import com.hurence.logisland.record.FieldType; +import com.hurence.logisland.record.Record; +import com.hurence.logisland.record.StandardRecord; +import com.hurence.logisland.stream.spark.AbstractKafkaRecordStream; +import com.hurence.logisland.stream.spark.KafkaRecordStreamParallelProcessing; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.*; + +import static org.junit.Assert.assertTrue; + +/** + * Empty Java class for source jar generation (need to publish on OSS sonatype) + */ +public class ProgrammaticStreamProcessingIntegrationTest extends AbstractStreamProcessingIntegrationTest { + + + public static final String MAGIC_STRING = "the world is so big"; + + + private static Logger logger = LoggerFactory.getLogger(ProgrammaticStreamProcessingIntegrationTest.class); + + + Optional getEngineContext() { + Map properties = new HashMap<>(); + properties.put(KafkaStreamProcessingEngine.SPARK_APP_NAME().getName(), "testApp"); + properties.put(KafkaStreamProcessingEngine.SPARK_STREAMING_BATCH_DURATION().getName(), "500"); + properties.put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[4]"); + properties.put(KafkaStreamProcessingEngine.SPARK_STREAMING_TIMEOUT().getName(), "12000"); + + + EngineConfiguration conf = new EngineConfiguration(); + conf.setComponent(KafkaStreamProcessingEngine.class.getName()); + conf.setType(ComponentType.ENGINE.toString()); + conf.setConfiguration(properties); + conf.addPipelineConfigurations(createStreamConfig()); + + return ComponentFactory.getEngineContext(conf); + } + + + private StreamConfiguration createStreamConfig() { + Map properties = new HashMap<>(); + properties.put(StreamProperties.KAFKA_METADATA_BROKER_LIST().getName(), BROKERHOST + ":" + BROKERPORT); + properties.put(StreamProperties.KAFKA_ZOOKEEPER_QUORUM().getName(), ZKHOST + ":" + zkServer.port()); + properties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR().getName(), "1"); + properties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_PARTITIONS().getName(), "1"); + properties.put(StreamProperties.INPUT_SERIALIZER().getName(), StreamProperties.KRYO_SERIALIZER().getValue()); + properties.put(StreamProperties.OUTPUT_SERIALIZER().getName(), StreamProperties.KRYO_SERIALIZER().getValue()); + properties.put(StreamProperties.KAFKA_MANUAL_OFFSET_RESET().getName(), StreamProperties.LATEST_OFFSET().getValue()); + + properties.put(StreamProperties.INPUT_TOPICS().getName(), INPUT_TOPIC); + properties.put(StreamProperties.OUTPUT_TOPICS().getName(), OUTPUT_TOPIC); + + StreamConfiguration conf = new StreamConfiguration(); + conf.setComponent(KafkaRecordStreamParallelProcessing.class.getName()); + conf.setType(ComponentType.STREAM.toString()); + conf.setConfiguration(properties); + conf.setStream("KafkaStream"); + conf.addProcessorConfiguration(createProcessorConfiguration()); + + return conf; + } + + private ProcessorConfiguration createProcessorConfiguration() { + Map 
properties = new HashMap<>(); + properties.put(MockProcessor.FAKE_MESSAGE.getName(), MAGIC_STRING); + + ProcessorConfiguration conf = new ProcessorConfiguration(); + conf.setComponent(MockProcessor.class.getName()); + conf.setType(ComponentType.PROCESSOR.toString()); + conf.setConfiguration(properties); + conf.setProcessor("mock"); + + return conf; + } + + + @Test + @Ignore + public void validateIntegration() throws NoSuchFieldException, IllegalAccessException, InterruptedException, IOException { + + final List records = new ArrayList<>(); + + Runnable testRunnable = () -> { + + + // send message + Record record = new StandardRecord("cisco"); + record.setId("firewall_record1"); + record.setField("method", FieldType.STRING, "GET"); + record.setField("ip_source", FieldType.STRING, "123.34.45.123"); + record.setField("ip_target", FieldType.STRING, "255.255.255.255"); + record.setField("url_scheme", FieldType.STRING, "http"); + record.setField("url_host", FieldType.STRING, "origin-www.20minutes.fr"); + record.setField("url_port", FieldType.STRING, "80"); + record.setField("url_path", FieldType.STRING, "/r15lgc-100KB.js"); + record.setField("request_size", FieldType.INT, 1399); + record.setField("response_size", FieldType.INT, 452); + record.setField("is_outside_office_hours", FieldType.BOOLEAN, false); + record.setField("is_host_blacklisted", FieldType.BOOLEAN, false); + record.setField("tags", FieldType.ARRAY, new ArrayList<>(Arrays.asList("spam", "filter", "mail"))); + + + try { + Thread.sleep(8000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + try { + sendRecord(INPUT_TOPIC, record); + } catch (IOException e) { + e.printStackTrace(); + } + + + try { + Thread.sleep(2000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + records.addAll(readRecords(OUTPUT_TOPIC)); + }; + + Thread t = new Thread(testRunnable); + logger.info("starting validation thread {}", t.getId()); + t.start(); + + + try{ + Thread.sleep(15000); + assertTrue(records.size() == 1); + assertTrue(records.get(0).size() == 13); + assertTrue(records.get(0).getField("message").asString().equals(MAGIC_STRING)); + }catch (Exception e){ + logger.error("issue durring validation {}", e.getMessage()); + } + + + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RecordStreamProcessingDebuggerTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RecordStreamProcessingDebuggerTest.java new file mode 100644 index 000000000..3e565afdd --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RecordStreamProcessingDebuggerTest.java @@ -0,0 +1,282 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.engine; + +import com.hurence.logisland.component.ComponentFactory; +import com.hurence.logisland.component.ComponentType; +import com.hurence.logisland.config.EngineConfiguration; +import com.hurence.logisland.config.ProcessorConfiguration; +import com.hurence.logisland.config.StreamConfiguration; +import com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine; +import com.hurence.logisland.stream.StreamProperties; +import com.hurence.logisland.stream.spark.KafkaRecordStreamDebugger; +import com.hurence.logisland.stream.spark.KafkaRecordStreamHDFSBurner; +import com.hurence.logisland.stream.spark.KafkaRecordStreamParallelProcessing; +import com.hurence.logisland.stream.spark.KafkaRecordStreamSQLAggregator; +import com.hurence.logisland.util.runner.MockProcessor; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; + + +public class RecordStreamProcessingDebuggerTest { + private static Logger logger = LoggerFactory.getLogger(RecordStreamProcessingDebuggerTest.class); + + private static final String APACHE_LOG_FIELDS = "src_ip,identd,user,record_time,http_method,http_query,http_version,http_status,bytes_out"; + private static final String APACHE_LOG_REGEX = "(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+\\[([\\w:\\/]+\\s[+\\-]\\d{4})\\]\\s+\"(\\S+)\\s+(\\S+)\\s*(\\S*)\"\\s+(\\S+)\\s+(\\S+)"; + + + @Test + @Ignore + public void remoteTest() { + + logger.info("starting StreamProcessingRunner"); + + // ProcessorConfiguration processorConf = getSplitTextProcessorConfiguration(); + StreamConfiguration chainConf = getSQLStreamConfiguration(); + EngineConfiguration engineConf = getStandardEngineConfiguration(); + engineConf.addPipelineConfigurations(chainConf); + // chainConf.addProcessorConfiguration(processorConf); + + + try { + + // instanciate engine and all the processor from the config + Optional engineInstance = ComponentFactory.getEngineContext(engineConf); + + assert engineInstance.isPresent(); + assert engineInstance.get().isValid(); + + // start the engine + EngineContext engineContext = engineInstance.get(); + engineInstance.get().getEngine().start(engineContext); + + + } catch (Exception e) { + logger.error("unable to launch runner : {}", e); + } + + + } + + private EngineConfiguration getStandardEngineConfiguration() { + Map engineProperties = new HashMap<>(); + engineProperties.put(KafkaStreamProcessingEngine.SPARK_APP_NAME().getName(), "testApp"); + engineProperties.put(KafkaStreamProcessingEngine.SPARK_STREAMING_BATCH_DURATION().getName(), "5000"); + engineProperties.put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[4]"); + engineProperties.put(KafkaStreamProcessingEngine.SPARK_EXECUTOR_CORES().getName(), "4"); + engineProperties.put(KafkaStreamProcessingEngine.SPARK_STREAMING_TIMEOUT().getName(), "-1"); + + EngineConfiguration engineConf = new EngineConfiguration(); + engineConf.setComponent(KafkaStreamProcessingEngine.class.getName()); + engineConf.setType(ComponentType.ENGINE.toString()); + engineConf.setConfiguration(engineProperties); + return engineConf; + } + + private StreamConfiguration getBurnerStreamConfiguration() { + Map streamProperties = new HashMap<>(); + /*chainProperties.put(AbstractKafkaRecordStream.KAFKA_METADATA_BROKER_LIST().getName(), + 
"sd-84190:6667,sd-84191:6667,sd-84192:6667,sd-84196:6667"); + chainProperties.put(AbstractKafkaRecordStream.KAFKA_ZOOKEEPER_QUORUM().getName(), + "sd-76387:2181,sd-84186:2181,sd-84189:2181");*/ + streamProperties.put(StreamProperties.KAFKA_METADATA_BROKER_LIST().getName(), + "sandbox:9092"); + streamProperties.put(StreamProperties.KAFKA_ZOOKEEPER_QUORUM().getName(), + "sandbox:2181"); + streamProperties.put(StreamProperties.INPUT_TOPICS().getName(), "logisland_events"); + streamProperties.put(StreamProperties.OUTPUT_TOPICS().getName(), "none"); + streamProperties.put(StreamProperties.INPUT_SERIALIZER().getName(), StreamProperties.KRYO_SERIALIZER().getValue()); + streamProperties.put(StreamProperties.OUTPUT_SERIALIZER().getName(), StreamProperties.NO_SERIALIZER().getValue()); + streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR().getName(), "1"); + streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_PARTITIONS().getName(), "2"); + + streamProperties.put(StreamProperties.OUTPUT_FOLDER_PATH().getName(), "data/logisland_events"); + streamProperties.put(StreamProperties.OUTPUT_FORMAT().getName(), "parquet"); + streamProperties.put(StreamProperties.RECORD_TYPE().getName(), "record"); + + StreamConfiguration chainConf = new StreamConfiguration(); + chainConf.setComponent(KafkaRecordStreamHDFSBurner.class.getName()); + chainConf.setType(ComponentType.STREAM.toString()); + chainConf.setConfiguration(streamProperties); + chainConf.setStream("KafkaStream"); + return chainConf; + } + + + private StreamConfiguration getParallelStreamConfiguration() { + Map streamProperties = new HashMap<>(); + streamProperties.put(StreamProperties.KAFKA_METADATA_BROKER_LIST().getName(), + "sandbox:9092"); + streamProperties.put(StreamProperties.KAFKA_ZOOKEEPER_QUORUM().getName(), + "sandbox:2181"); + streamProperties.put(StreamProperties.OUTPUT_TOPICS().getName(), "logisland_events"); + streamProperties.put(StreamProperties.INPUT_TOPICS().getName(), "logisland_raw"); + streamProperties.put(StreamProperties.ERROR_TOPICS().getName(), "logisland_errors"); + streamProperties.put(StreamProperties.INPUT_SERIALIZER().getName(), + StreamProperties.NO_SERIALIZER().getValue()); + streamProperties.put(StreamProperties.OUTPUT_SERIALIZER().getName(), + StreamProperties.JSON_SERIALIZER().getValue()); + streamProperties.put(StreamProperties.ERROR_SERIALIZER().getName(), + StreamProperties.JSON_SERIALIZER().getValue()); + + streamProperties.put(StreamProperties.AVRO_OUTPUT_SCHEMA().getName(), + "{ \"version\":1,\n" + + " \"type\": \"record\",\n" + + " \"name\": \"com.hurence.logisland.record.apache_log\",\n" + + " \"fields\": [\n" + + " { \"name\": \"record_raw_value\", \"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"record_errors\", \"type\": [ {\"type\": \"array\", \"items\": \"string\"},\"null\"] },\n" + + " { \"name\": \"record_id\", \"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"record_time\", \"type\": [\"long\",\"null\"] },\n" + + " { \"name\": \"record_type\", \"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"src_ip\", \"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"http_method\", \"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"bytes_out\", \"type\": [\"long\",\"null\"] },\n" + + " { \"name\": \"http_query\", \"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"http_version\",\"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"http_status\", \"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"identd\", \"type\": 
[\"string\",\"null\"] },\n" + + " { \"name\": \"user\", \"type\": [\"string\",\"null\"] } ]}"); + + streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR().getName(), "1"); + streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_PARTITIONS().getName(), "2"); + + + StreamConfiguration chainConf = new StreamConfiguration(); + chainConf.setComponent(KafkaRecordStreamParallelProcessing.class.getName()); + chainConf.setType(ComponentType.STREAM.toString()); + chainConf.setConfiguration(streamProperties); + chainConf.setStream("KafkaStream"); + return chainConf; + } + + + private StreamConfiguration getDebuggerStreamConfiguration() { + Map streamProperties = new HashMap<>(); + streamProperties.put(StreamProperties.KAFKA_METADATA_BROKER_LIST().getName(), "sandbox:9092"); + streamProperties.put(StreamProperties.KAFKA_ZOOKEEPER_QUORUM().getName(), "sandbox:2181"); + streamProperties.put(StreamProperties.INPUT_TOPICS().getName(), "logisland_raw"); + streamProperties.put(StreamProperties.OUTPUT_TOPICS().getName(), "logisland_events"); + streamProperties.put(StreamProperties.INPUT_SERIALIZER().getName(), StreamProperties.NO_SERIALIZER().getValue()); + streamProperties.put(StreamProperties.OUTPUT_SERIALIZER().getName(), StreamProperties.JSON_SERIALIZER().getValue()); + streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR().getName(), "1"); + streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_PARTITIONS().getName(), "4"); + + + StreamConfiguration chainConf = new StreamConfiguration(); + chainConf.setComponent(KafkaRecordStreamDebugger.class.getName()); + chainConf.setType(ComponentType.STREAM.toString()); + chainConf.setConfiguration(streamProperties); + chainConf.setStream("KafkaSQLStream"); + return chainConf; + } + + + private StreamConfiguration getSQLStreamConfiguration() { + Map streamProperties = new HashMap<>(); + streamProperties.put(StreamProperties.OUTPUT_RECORD_TYPE().getName(), "product_metric"); + streamProperties.put(StreamProperties.KAFKA_METADATA_BROKER_LIST().getName(), + "sd-84190:6667,sd-84191:6667,sd-84192:6667,sd-84186:6667"); + streamProperties.put(StreamProperties.KAFKA_ZOOKEEPER_QUORUM().getName(), + "sd-76387:2181,sd-84186:2181,sd-84189:2181"); + streamProperties.put(StreamProperties.INPUT_TOPICS().getName(), "ffact_products"); + streamProperties.put(StreamProperties.OUTPUT_TOPICS().getName(), "ffact_metrics"); + streamProperties.put(StreamProperties.INPUT_SERIALIZER().getName(), StreamProperties.JSON_SERIALIZER().getValue()); + + streamProperties.put(StreamProperties.OUTPUT_SERIALIZER().getName(), StreamProperties.JSON_SERIALIZER().getValue()); + streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR().getName(), "1"); + streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_PARTITIONS().getName(), "1"); + + streamProperties.put(StreamProperties.MAX_RESULTS_COUNT().getName(), "10"); + streamProperties.put(StreamProperties.SQL_QUERY().getName(), "SELECT count(*)/first(theoretical_cadence) AS product_trs, count(*) as product_count, factory, line, first(product_type) as product_type, first(theoretical_cadence) as theoretical_cadence, max(record_time) as record_time\n" + + " FROM ffact_products\n" + + " GROUP BY factory, line\n" + + " LIMIT 20"); + + + streamProperties.put(StreamProperties.AVRO_INPUT_SCHEMA().getName(), + "{ \"version\": 1,\n" + + " \"type\": \"record\",\n" + + " \"name\": \"com.hurence.logisland.ffact.product\",\n" + + " \"fields\": [\n" + + " { \"name\": \"record_errors\", 
\"type\": [ {\"type\": \"array\", \"items\": \"string\"},\"null\"] },\n" + + " { \"name\": \"record_raw_key\", \"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"record_raw_value\", \"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"record_id\", \"type\": [\"string\"] },\n" + + " { \"name\": \"record_time\", \"type\": [\"long\"] },\n" + + " { \"name\": \"record_type\", \"type\": [\"string\"] },\n" + + " { \"name\": \"label\", \"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"product_type\", \"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"operator\", \"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"factory\", \"type\": [\"string\",\"null\"] },\n" + + " { \"name\": \"latitude\", \"type\": [\"float\",\"null\"] },\n" + + " { \"name\": \"longitude\", \"type\": [\"float\",\"null\"] },\n" + + " { \"name\": \"theoretical_cadence\",\"type\": [\"float\",\"null\"] },\n" + + " { \"name\": \"line\", \"type\": [\"string\",\"null\"] } \n" + + " ]}"); + + StreamConfiguration chainConf = new StreamConfiguration(); + chainConf.setComponent(KafkaRecordStreamSQLAggregator.class.getName()); + chainConf.setType("stream"); + chainConf.setConfiguration(streamProperties); + chainConf.setStream("KafkaSQLStream"); + return chainConf; + } + + private ProcessorConfiguration getSplitTextProcessorConfiguration() { + Map processorProperties = new HashMap<>(); + processorProperties.put("value.regex", APACHE_LOG_REGEX); + processorProperties.put("value.fields", APACHE_LOG_FIELDS); + processorProperties.put("key.regex", "(\\S*):(\\S*):(\\S*):(\\S*):(\\S*)"); + processorProperties.put("key.field", "search_index,sub_project_code,record_type,host_name,uuid"); + + ProcessorConfiguration processorConf = new ProcessorConfiguration(); + processorConf.setComponent("com.hurence.logisland.processor.SplitText"); + processorConf.setType("parser"); + processorConf.setConfiguration(processorProperties); + processorConf.setProcessor("parser"); + return processorConf; + } + + + private ProcessorConfiguration getMockProcessorConfiguration() { + + + ProcessorConfiguration processorConf = new ProcessorConfiguration(); + processorConf.setComponent(MockProcessor.class.getName()); + processorConf.setType(ComponentType.PROCESSOR.toString()); + processorConf.setProcessor("debugger"); + return processorConf; + } + + private ProcessorConfiguration getDebugProcessorConfiguration() { + Map processorProperties = new HashMap<>(); + processorProperties.put("event.serializer", "json"); + + ProcessorConfiguration processorConf = new ProcessorConfiguration(); + processorConf.setComponent("com.hurence.logisland.processor.DebugStream"); + processorConf.setType(ComponentType.PROCESSOR.toString()); + processorConf.setConfiguration(processorProperties); + processorConf.setProcessor("debugger"); + return processorConf; + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RemoteApiEngineTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RemoteApiEngineTest.java new file mode 100644 index 000000000..59291b45e --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RemoteApiEngineTest.java @@ -0,0 +1,85 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.engine; + +import com.hurence.logisland.component.ComponentFactory; +import com.hurence.logisland.config.ConfigReader; +import com.hurence.logisland.config.LogislandConfiguration; +import com.hurence.logisland.util.spark.SparkUtils; +import okhttp3.mockwebserver.MockResponse; +import okhttp3.mockwebserver.MockWebServer; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Optional; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; + + +public class RemoteApiEngineTest { + private static Logger logger = LoggerFactory.getLogger(RemoteApiEngineTest.class); + + private static final String JOB_CONF_FILE = "/conf/remote-engine.yml"; + + @Test + @Ignore + public void remoteTest() { + + logger.info("starting StreamProcessingRunner"); + + Optional engineInstance = Optional.empty(); + try { + + String configFile = RemoteApiEngineTest.class.getResource(JOB_CONF_FILE).getPath(); + + // load the YAML config + LogislandConfiguration sessionConf = ConfigReader.loadConfig(configFile); + + // instanciate engine and all the processor from the config + engineInstance = ComponentFactory.getEngineContext(sessionConf.getEngine()); + assert engineInstance.isPresent(); + assert engineInstance.get().isValid(); + + logger.info("starting Logisland session version {}", sessionConf.getVersion()); + logger.info(sessionConf.getDocumentation()); + } catch (Exception e) { + logger.error("unable to launch runner : {}", e); + } + + try { + // start the engine + final EngineContext engineContext = engineInstance.get(); + engineInstance.get().getEngine().start(engineContext); + + + engineContext.getEngine().awaitTermination(engineContext); + + } catch (Exception e) { + logger.error("something went bad while running the job : {}", e); + System.exit(-1); + } + + + + + + + } + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/SparkEngineConfTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/SparkEngineConfTest.java new file mode 100644 index 000000000..e1e75a0ad --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/SparkEngineConfTest.java @@ -0,0 +1,182 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.engine; + +import com.hurence.logisland.component.ComponentFactory; +import com.hurence.logisland.component.ComponentType; +import com.hurence.logisland.config.EngineConfiguration; +import com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; + +public class SparkEngineConfTest { + private static Logger logger = LoggerFactory.getLogger(SparkEngineConfTest.class); + + /** + * testing all value correct (see https://spark.apache.org/docs/latest/submitting-applications.html#master-urls 2.4.1 at time of this test) + * make sure it is compatible as well with first version 2.x https://spark.apache.org/docs/2.0.0/submitting-applications.html#master-urls + */ + @Test + public void sparkMasterConfigTest() { + EngineConfiguration engineConf = getStandardEngineConfiguration(); + + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[4]"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[2,1]"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[2,123]"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[*]"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[*,32]"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[33,32]"); + testConfIsValid(engineConf); + //spark://HOST:PORT + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://045.478.874.4785217"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://aze0484.44-44:089"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://aze0484.44-44"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://htrh"); + testConfIsValid(engineConf); + //spark://HOST1:PORT1,HOST2:PORT2 + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://cn1:2181,cn2:2181,cn3:2181"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://cn1:2181,cn2:2181"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://cn1:2181"); + testConfIsValid(engineConf); + 
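+        // The remaining cases follow the master-URL grammar from the Spark submitting-applications
+        // docs linked above: host-only spark:// URLs and comma-separated standalone HA lists,
+        // mesos:// endpoints (optionally behind a zk:// quorum), plain "yarn" (the legacy
+        // yarn-client / yarn-cluster values are expected to be rejected), and k8s:// URLs that may
+        // wrap an http(s) API server address.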
engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://cn1"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://cn1,cn2"); + testConfIsValid(engineConf); + //mesos://HOST:PORT + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "mesos://zk://cn1:2181,cn2:2181,cn3:2181/mesos"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "mesos://zk://cn1:2181,cn2:2181/mesos"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "mesos://zk://cn1:2181/mesos"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "mesos://207.184.161.138:7077"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "mesos://207.184.161.138"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "mesos://gregh:"); + testConfIsNotValid(engineConf); + //yarn + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "yarn"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "yarn-client"); + testConfIsNotValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "yarn-cluster"); + testConfIsNotValid(engineConf); + //k8s://HOST:PORT + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "k8s://hrgjtdyj:4589"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "k8s://http://1245.444.444.444:4589"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "k8s://https://WHATEVER:41587"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "k8s://WHATEVER"); + testConfIsValid(engineConf); + } + + @Test + public void somePropertiesConfigTest() { + EngineConfiguration engineConf = getStandardEngineConfiguration(); + + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_APP_NAME().getName(), "FSV-OracleDataCollectionWithSnapshot-2outof4-PROD-Next1"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.JAVA_MESOS_LIBRARY_PATH().getName(), "/opt/mesos-1.6.0/build/src/.libs/libmesos.so"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_DRIVER_MEMORY().getName(), "2G"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_DRIVER_CORES().getName(), "1"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_EXECUTOR_CORES().getName(), "5"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_EXECUTOR_INSTANCES().getName(), "1"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_EXECUTOR_MEMORY().getName(), "4G"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MESOS_CORE_MAX().getName(), "8"); + 
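+        // Each put(...) / testConfIsValid(...) pair in this test overrides a single property on the
+        // same EngineConfiguration and re-validates it through ComponentFactory.getEngineContext(...);
+        // only the property validators run, nothing is submitted to Spark. In a job file these
+        // settings live under engine.configuration, e.g. spark.driver.memory: 2G or
+        // spark.executor.cores: 5 (see the YAML examples added later in this patch).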
testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_TASK_MAX_FAILURES().getName(), "8"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_SERIALIZER().getName(), "org.apache.spark.serializer.KryoSerializer"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_BATCH_DURATION().getName(), "20000"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_BACKPRESSURE_ENABLED().getName(), "false"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_UNPERSIST().getName(), "false"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_BLOCK_INTERVAL().getName(), "500"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_KAFKA_MAX_RATE_PER_PARTITION().getName(), "10000"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_TIMEOUT().getName(), "-1"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_KAFKA_MAXRETRIES().getName(), "3"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_UI_RETAINED_BATCHES().getName(), "200"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_RECEIVER_WAL_ENABLE().getName(), "false"); + testConfIsValid(engineConf); + engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_UI_PORT().getName(), "4050"); + testConfIsValid(engineConf); + } + + private void testConfIsValid(EngineConfiguration engineConf) { + Optional engineInstance = ComponentFactory.getEngineContext(engineConf); + Assert.assertTrue(engineInstance.isPresent()); + Assert.assertTrue(engineInstance.get().isValid()); + engineInstance.get(); + } + + private void testConfIsNotValid(EngineConfiguration engineConf) { + Optional engineInstance = ComponentFactory.getEngineContext(engineConf); + Assert.assertTrue(engineInstance.isPresent()); + Assert.assertFalse(engineInstance.get().isValid()); + engineInstance.get(); + } + + private EngineConfiguration getStandardEngineConfiguration() { + Map engineProperties = new HashMap<>(); + engineProperties.put(KafkaStreamProcessingEngine.SPARK_APP_NAME().getName(), "testApp"); + engineProperties.put(KafkaStreamProcessingEngine.SPARK_STREAMING_BATCH_DURATION().getName(), "5000"); + engineProperties.put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[4]"); + engineProperties.put(KafkaStreamProcessingEngine.SPARK_EXECUTOR_CORES().getName(), "4"); + engineProperties.put(KafkaStreamProcessingEngine.SPARK_STREAMING_TIMEOUT().getName(), "-1"); + + EngineConfiguration engineConf = new EngineConfiguration(); + engineConf.setComponent(KafkaStreamProcessingEngine.class.getName()); + engineConf.setType(ComponentType.ENGINE.toString()); + engineConf.setConfiguration(engineProperties); + return engineConf; + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/StreamDebuggerTest.java 
b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/StreamDebuggerTest.java new file mode 100644 index 000000000..a781e63aa --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/StreamDebuggerTest.java @@ -0,0 +1,79 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.engine; + +import com.hurence.logisland.component.ComponentFactory; +import com.hurence.logisland.config.ConfigReader; +import com.hurence.logisland.config.LogislandConfiguration; +import com.hurence.logisland.util.spark.SparkUtils; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Optional; + + +public class StreamDebuggerTest { + private static Logger logger = LoggerFactory.getLogger(StreamDebuggerTest.class); + + private static final String JOB_CONF_FILE = "/conf/structured-stream.yml"; + + @Test + @Ignore + public void remoteTest() { + + + logger.info("starting StreamProcessingRunner"); + + Optional engineInstance = Optional.empty(); + try { + + String configFile = StreamDebuggerTest.class.getResource(JOB_CONF_FILE).getPath(); + + // load the YAML config + LogislandConfiguration sessionConf = ConfigReader.loadConfig(configFile); + + // instanciate engine and all the processor from the config + engineInstance = ComponentFactory.getEngineContext(sessionConf.getEngine()); + assert engineInstance.isPresent(); + assert engineInstance.get().isValid(); + + logger.info("starting Logisland session version {}", sessionConf.getVersion()); + logger.info(sessionConf.getDocumentation()); + } catch (Exception e) { + logger.error("unable to launch runner : {}", e); + } + + try { + // start the engine + EngineContext engineContext = engineInstance.get(); + engineInstance.get().getEngine().start(engineContext); + + engineContext.getEngine().awaitTermination(engineContext); + } catch (Exception e) { + logger.error("something went bad while running the job : {}", e); + System.exit(-1); + } + + + + + + + } + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/RemoteApiClientTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/RemoteApiClientTest.java new file mode 100644 index 000000000..0f1c820aa --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/RemoteApiClientTest.java @@ -0,0 +1,90 @@ +/** + * Copyright (C) 2016 Hurence 
(support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.engine.spark.remote; + +import okhttp3.Credentials; +import okhttp3.mockwebserver.MockResponse; +import okhttp3.mockwebserver.MockWebServer; +import okhttp3.mockwebserver.RecordedRequest; +import org.junit.Assert; +import org.junit.Test; + +import javax.ws.rs.core.HttpHeaders; +import java.time.Duration; +import java.util.concurrent.TimeUnit; + +public class RemoteApiClientTest { + + private final String dataflowName = "dummy"; + + private RemoteApiClient createInstance(MockWebServer server, String user, String password) { + return new RemoteApiClient(new RemoteApiClient.ConnectionSettings( server.url("/").toString(), + Duration.ofSeconds(2), Duration.ofSeconds(2), user, password)); + } + + @Test + public void testAllUnsecured() throws Exception { + + try (MockWebServer mockWebServer = new MockWebServer()) { + mockWebServer.enqueue(new MockResponse().setResponseCode(404)); + mockWebServer.enqueue(new MockResponse().setBodyDelay(3, TimeUnit.SECONDS)); + final String dummy = "\"name\":\"myName\", \"component\":\"myComponent\""; + mockWebServer.enqueue(new MockResponse().setBody("{" + dummy + ",\"lastModified\":\"1983-06-04T10:01:02Z\"," + + "\"streams\":[{" + dummy + "}]}")); + RemoteApiClient client = createInstance(mockWebServer, null, null); + Assert.assertFalse(client.fetchDataflow(dataflowName, new RemoteApiClient.State()).isPresent()); + Assert.assertFalse(client.fetchDataflow(dataflowName, new RemoteApiClient.State()).isPresent()); + Assert.assertTrue(client.fetchDataflow(dataflowName, new RemoteApiClient.State()).isPresent()); + + } + + + } + + @Test + public void testValidationFails() throws Exception { + try (MockWebServer mockWebServer = new MockWebServer()) { + mockWebServer.enqueue(new MockResponse().setBody("{\"name\":\"divPo\", \"lastModified\":\"1983-06-04T10:01:02Z\",\"services\":[{}],\"streams\":[{}]}")); + RemoteApiClient client = createInstance(mockWebServer, null, null); + Assert.assertFalse(client.fetchDataflow(dataflowName, new RemoteApiClient.State()).isPresent()); + } + + + } + + @Test + public void testAuthentication() throws Exception { + try (MockWebServer mockWebServer = new MockWebServer()) { + RemoteApiClient client = createInstance(mockWebServer, "test", "test"); + mockWebServer.enqueue(new MockResponse().setBody("{}")); + client.fetchDataflow(dataflowName, new RemoteApiClient.State()); + RecordedRequest request = mockWebServer.takeRequest(); + String auth = request.getHeader(HttpHeaders.AUTHORIZATION); + Assert.assertEquals(Credentials.basic("test", "test"), auth); + } + } + + @Test + public void testUri() throws Exception { + try (MockWebServer mockWebServer = new MockWebServer()) { + RemoteApiClient client = createInstance(mockWebServer, null, null); + mockWebServer.enqueue(new MockResponse().setBody("{}")); + client.fetchDataflow(dataflowName, new RemoteApiClient.State()); + RecordedRequest request = mockWebServer.takeRequest(); + 
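+            // The client is expected to fetch the dataflow from /dataflows/<dataflowName> relative
+            // to the base URL passed in ConnectionSettings; this is the same endpoint a deployment
+            // would expose behind remote.api.baseUrl in the remote-engine.yml example further down.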
Assert.assertEquals("/dataflows/"+dataflowName, request.getPath()); + } + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockProcessor.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockProcessor.java new file mode 100644 index 000000000..5d125dc7b --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockProcessor.java @@ -0,0 +1,37 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.engine.spark.remote.mock; + +import com.hurence.logisland.component.PropertyDescriptor; +import com.hurence.logisland.processor.AbstractProcessor; +import com.hurence.logisland.processor.ProcessContext; +import com.hurence.logisland.record.Record; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +public class MockProcessor extends AbstractProcessor { + @Override + public List getSupportedPropertyDescriptors() { + return new ArrayList<>(); + } + + @Override + public Collection process(ProcessContext context, Collection records) { + return records; + } +} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockServiceController.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockServiceController.java new file mode 100644 index 000000000..17931be0e --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockServiceController.java @@ -0,0 +1,29 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.engine.spark.remote.mock; + +import com.hurence.logisland.component.PropertyDescriptor; +import com.hurence.logisland.controller.AbstractControllerService; + +import java.util.ArrayList; +import java.util.List; + +public class MockServiceController extends AbstractControllerService { + @Override + public List getSupportedPropertyDescriptors() { + return new ArrayList<>(); + } +} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockStream.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockStream.java new file mode 100644 index 000000000..cff86c2bc --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockStream.java @@ -0,0 +1,29 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.engine.spark.remote.mock; + +import com.hurence.logisland.component.PropertyDescriptor; +import com.hurence.logisland.stream.AbstractRecordStream; + +import java.util.ArrayList; +import java.util.List; + +public class MockStream extends AbstractRecordStream { + @Override + public List getSupportedPropertyDescriptors() { + return new ArrayList<>(); + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/StructuredStreamTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/StructuredStreamTest.java new file mode 100644 index 000000000..b7fede78c --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/StructuredStreamTest.java @@ -0,0 +1,83 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.stream.spark.structured; + +import com.hurence.logisland.component.ComponentFactory; +import com.hurence.logisland.config.ConfigReader; +import com.hurence.logisland.config.LogislandConfiguration; +import com.hurence.logisland.engine.EngineContext; +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Optional; +import java.util.Scanner; + + +/** + * End to end test. + */ +public class StructuredStreamTest { + private static Logger logger = LoggerFactory.getLogger(StructuredStreamTest.class); + + private static final String JOB_CONF_FILE = "/conf/timeseries-structured-stream.yml"; + + @Test + @Ignore + public void remoteTest() { + + + logger.info("starting StreamProcessingRunner"); + + Optional engineInstance = Optional.empty(); + try { + + String configFile = StructuredStreamTest.class.getResource(JOB_CONF_FILE).getPath(); + + // load the YAML config + LogislandConfiguration sessionConf = ConfigReader.loadConfig(configFile); + + // instantiate engine and all the processor from the config + engineInstance = ComponentFactory.getEngineContext(sessionConf.getEngine()); + assert engineInstance.isPresent(); + assert engineInstance.get().isValid(); + + logger.info("starting Logisland session version {}", sessionConf.getVersion()); + logger.info(sessionConf.getDocumentation()); + } catch (Exception e) { + logger.error("unable to launch runner : {}", e); + } + + try { + // start the engine + EngineContext engineContext = engineInstance.get(); + engineInstance.get().getEngine().start(engineContext); + new Scanner(System.in).nextLine(); + } catch (Exception e) { + Assert.fail("something went bad while running the job : " + e.toString()); + + } + + + + + + + } + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderServiceTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderServiceTest.java new file mode 100644 index 000000000..ab19ea8d1 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderServiceTest.java @@ -0,0 +1,59 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.stream.spark.structured.provider; + +import com.hurence.logisland.annotation.documentation.CapabilityDescription; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * + * @author bailett + */ +@CapabilityDescription("Provide a way to read a local file as input in StructuredStream streams") +public class LocalFileStructuredStreamProviderServiceTest { + + private Logger logger = LoggerFactory.getLogger(LocalFileStructuredStreamProviderServiceTest.class); + + private String JOB_CONF_FILE = "/conf/timeseries-structured-stream.yml"; + + + @Test + @Ignore + public void testLocalFileStructuredStreamProviderService() { + ProviderServiceAsReaderRunner runner = new ProviderServiceAsReaderRunner(null); + runner.run(); + } + +} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/ProviderServiceAsReaderRunner.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/ProviderServiceAsReaderRunner.java new file mode 100644 index 000000000..1d6ae83c1 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/ProviderServiceAsReaderRunner.java @@ -0,0 +1,121 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.stream.spark.structured.provider; + +import com.hurence.logisland.component.ComponentFactory; +import com.hurence.logisland.config.ControllerServiceConfiguration; +import com.hurence.logisland.config.EngineConfiguration; +import com.hurence.logisland.config.ProcessorConfiguration; +import com.hurence.logisland.config.StreamConfiguration; +import com.hurence.logisland.engine.EngineContext; +import com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine; +//import com.hurence.logisland.processor.DebugStream; +import com.hurence.logisland.stream.StreamProperties; +import com.hurence.logisland.stream.spark.structured.StructuredStream; +import com.hurence.logisland.util.runner.MockControllerServiceLookup; +import org.junit.Assert; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.*; + +public class ProviderServiceAsReaderRunner { + + private static Logger logger = LoggerFactory.getLogger(ProviderServiceAsReaderRunner.class); + + private final StructuredStreamProviderService provider; + private final MockControllerServiceLookup serviceLookup; + + + public ProviderServiceAsReaderRunner(StructuredStreamProviderService provider) { + this.provider = provider; + this.serviceLookup = new MockControllerServiceLookup(); + } + + public void run() { + EngineContext engineContext = ComponentFactory.getEngineContext(getEngineConfiguration()).get(); + Assert.assertTrue(engineContext.isValid()); + try { + engineContext.getEngine().start(engineContext); + engineContext.getEngine().awaitTermination(engineContext); + } catch (Exception ex) { + engineContext.getEngine().shutdown(engineContext); + } + } + + private EngineConfiguration getEngineConfiguration() { + EngineConfiguration engineConfiguration = new EngineConfiguration(); + engineConfiguration.setType("engine"); + engineConfiguration.setDocumentation("Plain java engine"); + engineConfiguration.setComponent(KafkaStreamProcessingEngine.class.getCanonicalName()); + Map props = new HashMap<>(); + props.put(StreamProperties.READ_TOPICS_SERIALIZER().getName(), "none"); + props.put(StreamProperties.READ_STREAM_SERVICE_PROVIDER().getName(), "local_file_service"); + props.put(StreamProperties.WRITE_TOPICS_SERIALIZER().getName(), StreamProperties.JSON_SERIALIZER().getValue()); + props.put(StreamProperties.WRITE_STREAM_SERVICE_PROVIDER().getName(), "console_service"); + StreamConfiguration streamConfiguration = testStructuredStreamStream(props); +// streamConfiguration.addProcessorConfiguration(debugProcessorConfiguration(Collections.emptyMap())); + engineConfiguration.addPipelineConfigurations(streamConfiguration); + //set up services + Map propsFileProvider = new HashMap<>(); + propsFileProvider.put("local.input.path", getClass().getResource("/input").getFile()); + List services = new ArrayList<>(); + services.add(testLocalFileProvider(propsFileProvider)); + + Map propsConsoleProvider = new HashMap<>(); + propsConsoleProvider.put("truncate", "false"); + services.add(testConsoleProvider(propsConsoleProvider)); + engineConfiguration.setControllerServiceConfigurations(services); + return engineConfiguration; + } + + private StreamConfiguration testStructuredStreamStream(Map props) { + StreamConfiguration streamConfiguration = new StreamConfiguration(); + streamConfiguration.setStream("testStructuredStream"); + streamConfiguration.setComponent(StructuredStream.class.getCanonicalName()); + streamConfiguration.setType("stream"); + streamConfiguration.setConfiguration(props); + return 
streamConfiguration; + } + + private ControllerServiceConfiguration testLocalFileProvider(Map props) { + ControllerServiceConfiguration serviceConfiguration = new ControllerServiceConfiguration(); + serviceConfiguration.setControllerService("local_file_service"); + serviceConfiguration.setComponent(LocalFileStructuredStreamProviderService.class.getCanonicalName()); + serviceConfiguration.setType("provider"); + serviceConfiguration.setConfiguration(props); + return serviceConfiguration; + } + + private ControllerServiceConfiguration testConsoleProvider(Map props) { + ControllerServiceConfiguration serviceConfiguration = new ControllerServiceConfiguration(); + serviceConfiguration.setControllerService("console_service"); + serviceConfiguration.setComponent(ConsoleStructuredStreamProviderService.class.getCanonicalName()); + serviceConfiguration.setType("provider"); + serviceConfiguration.setConfiguration(props); + return serviceConfiguration; + } + + private ProcessorConfiguration debugProcessorConfiguration(Map props) { + ProcessorConfiguration ret = new ProcessorConfiguration(); + ret.setProcessor(UUID.randomUUID().toString()); +// ret.setComponent(DebugStream.class.getCanonicalName()); + ret.setType("processor"); + return ret; + } + + +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/kafka-connect-stream.yml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/kafka-connect-stream.yml new file mode 100644 index 000000000..30d043e3a --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/kafka-connect-stream.yml @@ -0,0 +1,138 @@ +version: 1.2.0 +documentation: LogIsland future factory job + +engine: + component: com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine + type: engine + documentation: Index some apache logs with logisland + configuration: + spark.app.name: ConnectTest + spark.master: local[*] + spark.driver.memory: 512M + spark.driver.cores: 1 + spark.executor.memory: 512M + spark.executor.instances: 4 + spark.executor.cores: 2 + spark.yarn.queue: default + spark.yarn.maxAppAttempts: 4 + spark.yarn.am.attemptFailuresValidityInterval: 1h + spark.yarn.max.executor.failures: 20 + spark.yarn.executor.failuresValidityInterval: 1h + spark.task.maxFailures: 8 + spark.serializer: org.apache.spark.serializer.KryoSerializer + spark.streaming.batchDuration: 2000 + spark.streaming.backpressure.enabled: false + spark.streaming.blockInterval: 500 + spark.streaming.kafka.maxRatePerPartition: 10000 + spark.streaming.timeout: -1 + spark.streaming.unpersist: false + spark.streaming.kafka.maxRetries: 3 + spark.streaming.ui.retainedBatches: 200 + spark.streaming.receiver.writeAheadLog.enable: false + spark.ui.port: 4040 + + controllerServiceConfigurations: + + - controllerService: kc_source_service + component: com.hurence.logisland.stream.spark.provider.KafkaConnectStructuredSourceProviderService + configuration: + kc.data.value.converter: com.hurence.logisland.connect.converter.LogIslandRecordConverter + kc.data.value.converter.properties: | + record.serializer=com.hurence.logisland.serializer.KryoSerializer + kc.data.key.converter.properties: | + schemas.enable=false + kc.data.key.converter: org.apache.kafka.connect.storage.StringConverter + kc.worker.tasks.max: 3 + kc.partitions.max: 12 + 
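+      # The kc.* entries configure the embedded Kafka Connect worker for this source service:
+      # values are converted into Logisland records (Kryo-serialized) by LogIslandRecordConverter,
+      # keys stay plain strings via StringConverter, and kc.worker.tasks.max caps the number of
+      # connector tasks (kc.partitions.max presumably bounds the partitioning of the resulting
+      # stream). The entries right after this comment select the connector class, its offset
+      # backing store and its free-form properties.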
kc.connector.class: com.hurence.logisland.connect.fake.FakeConnector + kc.connector.offset.backing.store: memory + kc.connector.properties: | + foo=bar + dummy=a long string + + - controllerService: kc_sink_service + component: com.hurence.logisland.stream.spark.provider.KafkaConnectStructuredSinkProviderService + configuration: + kc.data.value.converter: com.hurence.logisland.connect.converter.LogIslandRecordConverter + kc.data.value.converter.properties: | + record.serializer=com.hurence.logisland.serializer.KryoSerializer + kc.data.key.converter.properties: | + schemas.enable=false + kc.data.key.converter: org.apache.kafka.connect.storage.StringConverter + kc.worker.tasks.max: 2 + kc.partitions.max: 4 + kc.connector.class: com.hurence.logisland.connect.fake.TestSink + + + + + - controllerService: kafka_out_service + component: com.hurence.logisland.stream.spark.structured.provider.KafkaStructuredStreamProviderService + configuration: + kafka.output.topics: logisland_raw + kafka.error.topics: logisland_errors + kafka.input.topics.serializer: com.hurence.logisland.serializer.KryoSerializer + kafka.output.topics.serializer: com.hurence.logisland.serializer.KryoSerializer + kafka.error.topics.serializer: com.hurence.logisland.serializer.JsonSerializer + kafka.metadata.broker.list: sandbox:9092 + kafka.zookeeper.quorum: sandbox:2181 + kafka.topic.autoCreate: true + kafka.topic.default.partitions: 4 + kafka.topic.default.replicationFactor: 1 + + streamConfigurations: + ################ indexing stream ############### + - stream: indexing_stream + component: com.hurence.logisland.stream.spark.KafkaRecordStreamParallelProcessing + type: stream + documentation: a processor that converts raw excel file content into structured log records + configuration: + kafka.input.topics: logisland_raw + kafka.output.topics: none + kafka.error.topics: logisland_errors + kafka.input.topics.serializer: com.hurence.logisland.serializer.KryoSerializer + kafka.output.topics.serializer: com.hurence.logisland.serializer.KryoSerializer + kafka.error.topics.serializer: com.hurence.logisland.serializer.JsonSerializer + kafka.metadata.broker.list: sandbox:9092 + kafka.zookeeper.quorum: sandbox:2181 + kafka.topic.autoCreate: true + kafka.topic.default.partitions: 4 + kafka.topic.default.replicationFactor: 1 + processorConfigurations: + # do something useful here + - processor: stream_debugger + component: com.hurence.logisland.processor.DebugStream + type: processor + documentation: debug records + configuration: + event.serializer: json + + + ######### parsing stream ############## + - stream: parsing_stream_source + component: com.hurence.logisland.stream.spark.structured.StructuredStream + configuration: + read.topics: /a/in + read.topics.serializer: com.hurence.logisland.serializer.KryoSerializer + read.topics.key.serializer: com.hurence.logisland.serializer.StringSerializer + read.stream.service.provider: kc_source_service + write.topics: logisland_raw + write.topics.serializer: com.hurence.logisland.serializer.KryoSerializer + write.topics.key.serializer: com.hurence.logisland.serializer.StringSerializer + write.stream.service.provider: kc_sink_service + processorConfigurations: + + - processor: flatten + component: com.hurence.logisland.processor.FlatMap + type: processor + documentation: "extract from root record" + configuration: + keep.root.record: false + copy.root.record.fields: true + - processor: add_fields + component: com.hurence.logisland.processor.AddFields + type: processor + documentation: "rename 
fields for dynamic indexation in chronix : add *_s suffix" + configuration: + conflict.resolution.policy: overwrite_existing + record_key: ${partition} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/opencv.yml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/opencv.yml new file mode 100644 index 000000000..c0def1a37 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/opencv.yml @@ -0,0 +1,62 @@ +version: 1.2.0 +documentation: LogIsland computer vision sample + +engine: + component: com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine + configuration: + spark.app.name: OpenCV + spark.master: local[2] + spark.streaming.batchDuration: 200 + spark.streaming.kafka.maxRatePerPartition: 10000 + spark.streaming.timeout: -1 + + controllerServiceConfigurations: + + - controllerService: kafka_service + component: com.hurence.logisland.stream.spark.structured.provider.KafkaStructuredStreamProviderService + configuration: + kafka.input.topics: logisland_raw + kafka.output.topics: logisland_images + kafka.error.topics: logisland_errors + kafka.input.topics.serializer: com.hurence.logisland.serializer.BytesArraySerializer + kafka.output.topics.serializer: com.hurence.logisland.serializer.BytesArraySerializer + kafka.error.topics.serializer: com.hurence.logisland.serializer.JsonSerializer + kafka.metadata.broker.list: kafka:9092 + kafka.zookeeper.quorum: zookeeper:2181 + kafka.topic.autoCreate: true + kafka.topic.default.partitions: 2 + kafka.topic.default.replicationFactor: 1 + + streamConfigurations: + + - stream: parsing_stream + component: com.hurence.logisland.stream.spark.structured.StructuredStream + configuration: + read.topics.serializer: com.hurence.logisland.serializer.BytesArraySerializer + read.stream.service.provider: kafka_service + write.topics.serializer: com.hurence.logisland.serializer.BytesArraySerializer + write.stream.service.provider: kafka_service + processorConfigurations: + + - processor: contour_extraction + component: com.hurence.logisland.cv.processor.RunScript + configuration: + input.field: record_value + output.field: record_value + output.mode: overwrite + image.format: png + script.ns: com.hurence.logisland + script.function: ld_detect_edges + script.code: > + (ns com.hurence.logisland + (:refer-clojure :exclude [sort min merge reduce max compare repeat]) + (:require [opencv4.utils :refer :all]) + (:require [opencv4.core :refer :all]) + (:import [com.hurence.logisland.record Record])) + + (defn ld_detect_edges [mat] + (-> mat + (resize-by 0.5) + (cvt-color! COLOR_RGB2GRAY) + (canny! 
300.0 100.0 3 true) + (bitwise-not!))) \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/remote-engine.yml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/remote-engine.yml new file mode 100644 index 000000000..9da496f55 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/remote-engine.yml @@ -0,0 +1,38 @@ +version: 1.2.0 +documentation: LogIsland remote controlled. + +engine: + component: com.hurence.logisland.engine.spark.RemoteApiStreamProcessingEngine + type: engine + documentation: Do some remote pipelines. + configuration: + spark.app.name: RemoteConnect + spark.master: local[2] + spark.driver.memory: 512M + spark.driver.cores: 1 + spark.executor.memory: 512M + spark.executor.instances: 2 + spark.executor.cores: 2 + spark.yarn.queue: default + spark.yarn.maxAppAttempts: 4 + spark.yarn.am.attemptFailuresValidityInterval: 1h + spark.yarn.max.executor.failures: 20 + spark.yarn.executor.failuresValidityInterval: 1h + spark.task.maxFailures: 8 + spark.serializer: org.apache.spark.serializer.KryoSerializer + spark.streaming.batchDuration: 2000 + spark.streaming.backpressure.enabled: false + spark.streaming.blockInterval: 500 + spark.streaming.kafka.maxRatePerPartition: 10000 + spark.streaming.timeout: -1 + spark.streaming.unpersist: false + spark.streaming.kafka.maxRetries: 3 + spark.streaming.ui.retainedBatches: 200 + spark.streaming.receiver.writeAheadLog.enable: false + spark.ui.port: 4040 + remote.api.baseUrl: http://localhost:3000 + remote.api.polling.rate: 5000 + remote.api.push.rate: 10000 + + + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/structured-stream.yml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/structured-stream.yml new file mode 100644 index 000000000..4db7dabe6 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/structured-stream.yml @@ -0,0 +1,76 @@ +version: 1.2.0 +documentation: LogIsland future factory job + +engine: + component: com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine + type: engine + documentation: Index some apache logs with logisland + configuration: + spark.app.name: FutureFactory + spark.master: local[4] + spark.driver.memory: 1G + spark.driver.cores: 1 + spark.executor.memory: 1G + spark.executor.instances: 4 + spark.executor.cores: 2 + spark.yarn.queue: default + spark.yarn.maxAppAttempts: 4 + spark.yarn.am.attemptFailuresValidityInterval: 1h + spark.yarn.max.executor.failures: 20 + spark.yarn.executor.failuresValidityInterval: 1h + spark.task.maxFailures: 8 + spark.serializer: org.apache.spark.serializer.KryoSerializer + #spark.serializer: org.apache.spark.serializer.JavaSerializer + spark.streaming.batchDuration: 2000 + spark.streaming.backpressure.enabled: false + spark.streaming.blockInterval: 500 + spark.streaming.kafka.maxRatePerPartition: 10000 + spark.streaming.timeout: -1 + spark.streaming.unpersist: false + spark.streaming.kafka.maxRetries: 3 + spark.streaming.ui.retainedBatches: 200 + 
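+    # The spark.streaming.* keys above are tuning knobs: batchDuration and blockInterval are given
+    # in milliseconds, kafka.maxRatePerPartition caps records per second per partition, and
+    # spark.streaming.timeout: -1 appears to disable the timeout so the job runs until it is
+    # stopped externally.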
spark.streaming.receiver.writeAheadLog.enable: false + spark.ui.port: 4040 + + controllerServiceConfigurations: + + - controllerService: mqtt_service + component: com.hurence.logisland.stream.spark.structured.provider.MQTTStructuredStreamProviderService + configuration: + # mqtt.broker.url: tcp://51.15.164.141:1883 + mqtt.broker.url: tcp://localhost:1883 + mqtt.persistence: memory + mqtt.client.id: logisland + mqtt.qos: 0 + mqtt.topic: Account123/# + mqtt.username: User123 + mqtt.password: Kapu12345678+ + mqtt.clean.session: true + mqtt.connection.timeout: 30 + mqtt.keep.alive: 60 + mqtt.version: 3 + + - controllerService: console_service + component: com.hurence.logisland.stream.spark.structured.provider.ConsoleStructuredStreamProviderService + + streamConfigurations: + + # indexing stream + - stream: indexing_stream + component: com.hurence.logisland.stream.spark.structured.StructuredStream + configuration: + read.topics.serializer: com.hurence.logisland.serializer.KuraProtobufSerializer + read.topics.client.service: mqtt_service + write.topics.serializer: none + write.topics.client.service: console_service + processorConfigurations: + + - processor: flatten + component: com.hurence.logisland.processor.FlatMap + type: processor + documentation: "extract metrics from root record" + configuration: + keep.root.record: false + copy.root.record.fields: true + leaf.record.type: record_metric + concat.fields: record_name diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/timeseries-structured-stream.yml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/timeseries-structured-stream.yml new file mode 100644 index 000000000..b257f36e9 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/timeseries-structured-stream.yml @@ -0,0 +1,99 @@ +version: 1.2.0 +documentation: LogIsland future factory job + +engine: + component: com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine + configuration: + spark.app.name: TimeseriesParsing + spark.master: local[2] + spark.streaming.batchDuration: 200 + spark.streaming.kafka.maxRatePerPartition: 10000 + controllerServiceConfigurations: + + + - controllerService: kafka_service + component: com.hurence.logisland.stream.spark.structured.provider.KafkaStructuredStreamProviderService + configuration: + kafka.input.topics: logisland_raw + kafka.output.topics: logisland_measures + kafka.error.topics: logisland_errors + kafka.input.topics.serializer: com.hurence.logisland.serializer.JsonSerializer + kafka.output.topics.serializer: com.hurence.logisland.serializer.JsonSerializer + kafka.error.topics.serializer: com.hurence.logisland.serializer.JsonSerializer + kafka.metadata.broker.list: localhost:9092 + kafka.zookeeper.quorum: localhost:2181 + kafka.topic.autoCreate: true + kafka.topic.default.partitions: 4 + kafka.topic.default.replicationFactor: 1 + + - controllerService: kafka_service_out + component: com.hurence.logisland.stream.spark.structured.provider.KafkaStructuredStreamProviderService + configuration: + kafka.input.topics: logisland_measures + kafka.output.topics: logisland_metrics + kafka.error.topics: logisland_errors + kafka.input.topics.serializer: com.hurence.logisland.serializer.JsonSerializer + kafka.output.topics.serializer: 
com.hurence.logisland.serializer.JsonSerializer + kafka.error.topics.serializer: com.hurence.logisland.serializer.JsonSerializer + kafka.metadata.broker.list: localhost:9092 + kafka.zookeeper.quorum: localhost:2181 + kafka.topic.autoCreate: true + kafka.topic.default.partitions: 4 + kafka.topic.default.replicationFactor: 1 + + streamConfigurations: + + # This stream take all raw events as lines comming from local files + # these lines are split into logisland records and sent into a kafka topic + - stream: parsing_stream + component: com.hurence.logisland.stream.spark.structured.StructuredStream + configuration: + read.topics.serializer: none + read.stream.service.provider: kafka_service + write.topics.serializer: com.hurence.logisland.serializer.JsonSerializer + write.stream.service.provider: kafka_service + processorConfigurations: + + - processor: historian_parser + component: com.hurence.logisland.processor.SplitText + configuration: + record.type: timeserie + value.regex: (\S+\s+\S+);(\S+);(\S+);(\S+) + value.fields: record_time,tagname,record_value,quality + + - processor: create_aliases + component: com.hurence.logisland.processor.NormalizeFields + configuration: + conflict.resolution.policy: keep_both_fields + record_name: tagname + + - processor: fields_types_converter + component: com.hurence.logisland.processor.ConvertFieldsType + configuration: + record_value: double + quality: float + + # This stream will perform a statefull groupBy operation on tagname + - stream: compaction_stream + component: com.hurence.logisland.stream.spark.structured.StructuredStream + configuration: + read.topics.key.serializer: com.hurence.logisland.serializer.StringSerializer + read.topics.serializer: com.hurence.logisland.serializer.JsonSerializer + read.stream.service.provider: kafka_service_out + write.topics.serializer: com.hurence.logisland.serializer.JsonSerializer + write.stream.service.provider: kafka_service_out + groupby: tagname + chunk.size: 50 + state.timeout.ms: 30000 + + processorConfigurations: + +# - processor: debug_1 +# component: com.hurence.logisland.processor.DebugStream + # Make one chronix chunk from all records + - processor: timeseries_converter + component: com.hurence.logisland.processor.ConvertToTimeseries + configuration: + groupby: tagname + metric: avg;max;min;trend;sax:7,0.01,10 + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/log4j.properties b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/log4j.properties new file mode 100644 index 000000000..754f1c2a8 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/log4j.properties @@ -0,0 +1,65 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +log4j.rootCategory=INFO, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Set the default spark-shell log level to WARN. When running the spark-shell, the +# log level for this class is used to overwrite the root logger's log level, so that +# the user can have different defaults for the shell and regular Spark apps. +log4j.logger.org.apache.spark.repl.Main=WARN + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.spark_project.jetty=WARN +log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR + +log4j.logger.org.apache.spark=WARN +log4j.logger.org.apache.spark.scheduler=WARN +log4j.logger.org.apache.spark.history=WARN +log4j.logger.org.apache.spark.streaming=WARN +log4j.logger.org.spark-project.jetty=WARN +log4j.logger.org.eclipse.jetty.server=OFF +log4j.logger.org.apache.spark.deploy.yarn=DEBUG +log4j.logger.io.netty=WARN +log4j.logger.org.apache.hadoop.ipc.Client=WARN +log4j.logger.org.apache.hadoop=WARN +log4j.logger.org.apache.hadoop.ipc.ProtobufRpcEngine=WARN +log4j.logger.parquet.hadoop=WARN +log4j.logger.org.apache.kafka=ERROR +log4j.logger.kafka=WARN +log4j.logger.org.elasticsearch=WARN +log4j.logger.com.hurence=DEBUG +log4j.logger.org.apache.zookeeper=ERROR +log4j.logger.org.I0Itec.zkclient=ERROR +log4j.logger.org.apache.spark.sql.execution.streaming.state.StateStore=WARN +log4j.logger.org.apache.spark.ContextCleaner=WARN +log4j.additivity.kafka.server=false +log4j.additivity.kafka.consumer.ZookeeperConsumerConnector=false + + + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL +log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/logback.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/logback.xml new file mode 100644 index 000000000..1c1e3e91f --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/logback.xml @@ -0,0 +1,58 @@ + + + + + + + + %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/pom.xml new file mode 100644 index 
000000000..cd7474cd2 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/pom.xml @@ -0,0 +1,25 @@ + + + 4.0.0 + + com.hurence.logisland + logisland-engines + 1.2.0 + + pom + + + 2.12 + 2.12.10 + 2.6.6 + + + logisland-engine-spark_2_4plus_kafka_2_4plus + + + logisland-engine-spark_2_4plus_common + logisland-engine-spark_2_4_kafka_2_4 + + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml index 7f7c45021..3927640e6 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml @@ -22,7 +22,6 @@ http://www.w3.org/2001/XMLSchema-instance "> 2.3.3 0.10.2.1 2.11.8 - 2.11 2.6.6 diff --git a/logisland-core/logisland-engines/pom.xml b/logisland-core/logisland-engines/pom.xml index 10b200ad7..d6865c420 100644 --- a/logisland-core/logisland-engines/pom.xml +++ b/logisland-core/logisland-engines/pom.xml @@ -15,6 +15,7 @@ + logisland-engine-spark_2_4plus_kafka_2_4plus logisland-engine-spark_2_X logisland-engine-spark_1_6 logisland-engine-vanilla From 4a36bdbd8399e1c2cc39f94aba89ecb178607836 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Mon, 10 Feb 2020 15:39:20 +0100 Subject: [PATCH 05/43] Attempt to add -checkpointLocation option --- .../pom.xml | 9 +- .../spark/AbstractKafkaRecordStream.scala | 404 +++++++++--------- .../spark/KafkaRecordStreamDebugger.scala | 284 ++++++------ .../spark/KafkaRecordStreamHDFSBurner.scala | 330 +++++++------- .../KafkaRecordStreamParallelProcessing.scala | 348 +++++++-------- .../KafkaRecordStreamSQLAggregator.scala | 231 +++++----- ...KafkaStructuredStreamProviderService.scala | 9 +- .../StructuredStreamProviderService.scala | 8 +- .../logisland/util/spark/SparkUtils.scala | 1 + .../spark/structured/StructuredStream.scala | 31 -- .../logisland/runner/GlobalOptions.java | 13 + .../runner/StreamProcessingRunner.java | 23 + 12 files changed, 850 insertions(+), 841 deletions(-) create mode 100644 logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/GlobalOptions.java diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/pom.xml index f49996d4d..911e5047e 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/pom.xml +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/pom.xml @@ -28,9 +28,10 @@ http://www.w3.org/2001/XMLSchema-instance "> + com.typesafe.scala-logging - scala-logging-slf4j_${scala.binary.version} + scala-logging-slf4j_2.11 2.1.2 provided @@ -146,7 +147,13 @@ http://www.w3.org/2001/XMLSchema-instance "> com.hurence.logisland logisland-plugin-support ${project.version} + provided + + + com.hurence.logisland + logisland-bootstrap + ${project.version} provided diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/AbstractKafkaRecordStream.scala 
b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/AbstractKafkaRecordStream.scala index 8f2da59a7..1ba79f9f1 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/AbstractKafkaRecordStream.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/AbstractKafkaRecordStream.scala @@ -55,7 +55,7 @@ import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySe import org.apache.spark.broadcast.Broadcast import org.apache.spark.groupon.metrics.UserMetricsSystem import org.apache.spark.rdd.RDD -import org.apache.spark.streaming.kafka.KafkaUtils; +//import org.apache.spark.streaming.kafka.KafkaUtils; //import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe //import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent //import org.apache.spark.streaming.kafka010.{CanCommitOffsets, KafkaUtils, OffsetRange} @@ -78,207 +78,207 @@ abstract class AbstractKafkaRecordStream extends AbstractRecordStream with Spark protected var controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink] = null protected var needMetricsReset = false - override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { - val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] - descriptors.add(ERROR_TOPICS) - descriptors.add(INPUT_TOPICS) - descriptors.add(OUTPUT_TOPICS) - descriptors.add(AVRO_INPUT_SCHEMA) - descriptors.add(AVRO_OUTPUT_SCHEMA) - descriptors.add(INPUT_SERIALIZER) - descriptors.add(OUTPUT_SERIALIZER) - descriptors.add(ERROR_SERIALIZER) - descriptors.add(KAFKA_TOPIC_AUTOCREATE) - descriptors.add(KAFKA_TOPIC_DEFAULT_PARTITIONS) - descriptors.add(KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR) - descriptors.add(KAFKA_METADATA_BROKER_LIST) - descriptors.add(KAFKA_ZOOKEEPER_QUORUM) - descriptors.add(KAFKA_MANUAL_OFFSET_RESET) - descriptors.add(KAFKA_BATCH_SIZE) - descriptors.add(KAFKA_LINGER_MS) - descriptors.add(KAFKA_ACKS) - descriptors.add(WINDOW_DURATION) - descriptors.add(SLIDE_DURATION) - Collections.unmodifiableList(descriptors) - } - - - override def setup(appName: String, ssc: StreamingContext, streamContext: StreamContext, engineContext: EngineContext) = { - this.appName = appName - this.ssc = ssc - this.streamContext = streamContext - this.engineContext = engineContext - - } - - override def getStreamContext(): StreamingContext = this.ssc - - override def start() = { - if (ssc == null) - throw new IllegalStateException("stream not initialized") - - try { - - // Define the Kafka parameters, broker list must be specified - val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString.split(",").toSet - val outputTopics = streamContext.getPropertyValue(OUTPUT_TOPICS).asString.split(",").toSet - val errorTopics = streamContext.getPropertyValue(ERROR_TOPICS).asString.split(",").toSet - val metricsTopics = DEFAULT_METRICS_TOPIC.getValue.split(",").toSet - - val topicAutocreate = streamContext.getPropertyValue(KAFKA_TOPIC_AUTOCREATE).asBoolean().booleanValue() - val topicDefaultPartitions = streamContext.getPropertyValue(KAFKA_TOPIC_DEFAULT_PARTITIONS).asInteger().intValue() - val topicDefaultReplicationFactor = 
streamContext.getPropertyValue(KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR).asInteger().intValue() - val brokerList = streamContext.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString - val zkQuorum = streamContext.getPropertyValue(KAFKA_ZOOKEEPER_QUORUM).asString - - val kafkaBatchSize = streamContext.getPropertyValue(KAFKA_BATCH_SIZE).asString - val kafkaLingerMs = streamContext.getPropertyValue(KAFKA_LINGER_MS).asString - val kafkaAcks = streamContext.getPropertyValue(KAFKA_ACKS).asString - val kafkaOffset = streamContext.getPropertyValue(KAFKA_MANUAL_OFFSET_RESET).asString - - - val kafkaSinkParams = Map( - ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokerList, - ProducerConfig.CLIENT_ID_CONFIG -> appName, - ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getCanonicalName, - ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, - ProducerConfig.ACKS_CONFIG -> kafkaAcks, - ProducerConfig.RETRIES_CONFIG -> "3", - ProducerConfig.LINGER_MS_CONFIG -> kafkaLingerMs, - ProducerConfig.BATCH_SIZE_CONFIG -> kafkaBatchSize, - ProducerConfig.RETRY_BACKOFF_MS_CONFIG -> "1000", - ProducerConfig.RECONNECT_BACKOFF_MS_CONFIG -> "1000") - - kafkaSink = ssc.sparkContext.broadcast(KafkaSink(kafkaSinkParams)) - controllerServiceLookupSink = ssc.sparkContext.broadcast( - ControllerServiceLookupSink(engineContext.getControllerServiceConfigurations) - ) - - // TODO deprecate topic creation here (must be done through the agent) -// if (topicAutocreate) { -//// val zkUtils = ZkUtils.apply(zkQuorum, 10000, 10000, JaasUtils.isZkSecurityEnabled) -//// createTopicsIfNeeded(zkUtils, inputTopics, topicDefaultPartitions, topicDefaultReplicationFactor) -//// createTopicsIfNeeded(zkUtils, outputTopics, topicDefaultPartitions, topicDefaultReplicationFactor) -//// createTopicsIfNeeded(zkUtils, errorTopics, topicDefaultPartitions, topicDefaultReplicationFactor) -//// createTopicsIfNeeded(zkUtils, metricsTopics, 1, 1) -// zkQuorum -// val zooKeeperClient : ZooKeeperClient = new ZooKeeperClient(zkQuorum, -// 1000, -// 1000: Int, -// 10: Int, -// , -// metricGroup: String, -// metricType: String) ) -// val kafkaZkClient : KafkaZkClient = new KafkaZkClient -// val adminZkClient : AdminZkClient = new AdminZkClient[kafkaZkClient] -// } - - - val kafkaParams = Map[String, Object]( - ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokerList, - ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer], - ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer], - ConsumerConfig.GROUP_ID_CONFIG -> appName, - ConsumerConfig.RECONNECT_BACKOFF_MS_CONFIG -> "50", - ConsumerConfig.RETRY_BACKOFF_MS_CONFIG -> "100", - ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> kafkaOffset, - ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false", - ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG -> "30000" - /*, - ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG -> "5000"*/ - ) - - - logger.info(s"starting Kafka direct stream on topics $inputTopics from $kafkaOffset offsets") - @transient val kafkaStream = KafkaUtils.createDirectStream[Array[Byte], Array[Byte]]( - ssc, - PreferConsistent, - Subscribe[Array[Byte], Array[Byte]](inputTopics, kafkaParams) - ) - - // do the parallel processing - - val stream = if (streamContext.getPropertyValue(WINDOW_DURATION).isSet) { - if (streamContext.getPropertyValue(SLIDE_DURATION).isSet) - kafkaStream.window( - Seconds(streamContext.getPropertyValue(WINDOW_DURATION).asLong()), - 
Seconds(streamContext.getPropertyValue(SLIDE_DURATION).asLong()) - ) - else - kafkaStream.window(Seconds(streamContext.getPropertyValue(WINDOW_DURATION).asLong())) - - } else kafkaStream - - - stream - .foreachRDD(rdd => { - - this.streamContext.getProcessContexts().clear(); - this.streamContext.getProcessContexts().addAll( - PipelineConfigurationBroadcastWrapper.getInstance().get(this.streamContext.getIdentifier)) - - if (!rdd.isEmpty()) { - - - val offsetRanges = process(rdd) - // some time later, after outputs have completed - if (offsetRanges.nonEmpty) { - // kafkaStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges.get) - - - kafkaStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges.get, new OffsetCommitCallback() { - def onComplete(m: java.util.Map[TopicPartition, OffsetAndMetadata], e: Exception) { - if (null != e) { - logger.error("error commiting offsets", e) - } - } - }) - - - needMetricsReset = true - } - else if (needMetricsReset) { - try { - - for (partitionId <- 0 to rdd.getNumPartitions) { - val pipelineMetricPrefix = streamContext.getIdentifier + "." + - "partition" + partitionId + "." - val pipelineTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + "Pipeline.processing_time_ms").time() - - streamContext.getProcessContexts.foreach(processorContext => { - UserMetricsSystem.timer(pipelineMetricPrefix + processorContext.getIdentifier + ".processing_time_ms") - .time() - .stop() - - ProcessorMetrics.resetMetrics(pipelineMetricPrefix + processorContext.getIdentifier + ".") - }) - pipelineTimerContext.stop() - } - } catch { - case ex: Throwable => - logger.error(s"exception : ${ex.toString}") - None - } finally { - needMetricsReset = false - } - } - } - - }) - } catch { - case ex: Throwable => - ex.printStackTrace() - logger.error("something bad happened, please check Kafka or Zookeeper health : {}", ex) - } - } - - - /** - * to be overriden by subclasses - * - * @param rdd - */ - def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] +// override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { +// val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] +// descriptors.add(ERROR_TOPICS) +// descriptors.add(INPUT_TOPICS) +// descriptors.add(OUTPUT_TOPICS) +// descriptors.add(AVRO_INPUT_SCHEMA) +// descriptors.add(AVRO_OUTPUT_SCHEMA) +// descriptors.add(INPUT_SERIALIZER) +// descriptors.add(OUTPUT_SERIALIZER) +// descriptors.add(ERROR_SERIALIZER) +// descriptors.add(KAFKA_TOPIC_AUTOCREATE) +// descriptors.add(KAFKA_TOPIC_DEFAULT_PARTITIONS) +// descriptors.add(KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR) +// descriptors.add(KAFKA_METADATA_BROKER_LIST) +// descriptors.add(KAFKA_ZOOKEEPER_QUORUM) +// descriptors.add(KAFKA_MANUAL_OFFSET_RESET) +// descriptors.add(KAFKA_BATCH_SIZE) +// descriptors.add(KAFKA_LINGER_MS) +// descriptors.add(KAFKA_ACKS) +// descriptors.add(WINDOW_DURATION) +// descriptors.add(SLIDE_DURATION) +// Collections.unmodifiableList(descriptors) +// } +// +// +// override def setup(appName: String, ssc: StreamingContext, streamContext: StreamContext, engineContext: EngineContext) = { +// this.appName = appName +// this.ssc = ssc +// this.streamContext = streamContext +// this.engineContext = engineContext +// +// } +// +// override def getStreamContext(): StreamingContext = this.ssc +// +// override def start() = { +// if (ssc == null) +// throw new IllegalStateException("stream not initialized") +// +// try { +// +// // Define the Kafka parameters, 
broker list must be specified +// val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString.split(",").toSet +// val outputTopics = streamContext.getPropertyValue(OUTPUT_TOPICS).asString.split(",").toSet +// val errorTopics = streamContext.getPropertyValue(ERROR_TOPICS).asString.split(",").toSet +// val metricsTopics = DEFAULT_METRICS_TOPIC.getValue.split(",").toSet +// +// val topicAutocreate = streamContext.getPropertyValue(KAFKA_TOPIC_AUTOCREATE).asBoolean().booleanValue() +// val topicDefaultPartitions = streamContext.getPropertyValue(KAFKA_TOPIC_DEFAULT_PARTITIONS).asInteger().intValue() +// val topicDefaultReplicationFactor = streamContext.getPropertyValue(KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR).asInteger().intValue() +// val brokerList = streamContext.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString +// val zkQuorum = streamContext.getPropertyValue(KAFKA_ZOOKEEPER_QUORUM).asString +// +// val kafkaBatchSize = streamContext.getPropertyValue(KAFKA_BATCH_SIZE).asString +// val kafkaLingerMs = streamContext.getPropertyValue(KAFKA_LINGER_MS).asString +// val kafkaAcks = streamContext.getPropertyValue(KAFKA_ACKS).asString +// val kafkaOffset = streamContext.getPropertyValue(KAFKA_MANUAL_OFFSET_RESET).asString +// +// +// val kafkaSinkParams = Map( +// ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokerList, +// ProducerConfig.CLIENT_ID_CONFIG -> appName, +// ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getCanonicalName, +// ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, +// ProducerConfig.ACKS_CONFIG -> kafkaAcks, +// ProducerConfig.RETRIES_CONFIG -> "3", +// ProducerConfig.LINGER_MS_CONFIG -> kafkaLingerMs, +// ProducerConfig.BATCH_SIZE_CONFIG -> kafkaBatchSize, +// ProducerConfig.RETRY_BACKOFF_MS_CONFIG -> "1000", +// ProducerConfig.RECONNECT_BACKOFF_MS_CONFIG -> "1000") +// +// kafkaSink = ssc.sparkContext.broadcast(KafkaSink(kafkaSinkParams)) +// controllerServiceLookupSink = ssc.sparkContext.broadcast( +// ControllerServiceLookupSink(engineContext.getControllerServiceConfigurations) +// ) +// +// // TODO deprecate topic creation here (must be done through the agent) +//// if (topicAutocreate) { +////// val zkUtils = ZkUtils.apply(zkQuorum, 10000, 10000, JaasUtils.isZkSecurityEnabled) +////// createTopicsIfNeeded(zkUtils, inputTopics, topicDefaultPartitions, topicDefaultReplicationFactor) +////// createTopicsIfNeeded(zkUtils, outputTopics, topicDefaultPartitions, topicDefaultReplicationFactor) +////// createTopicsIfNeeded(zkUtils, errorTopics, topicDefaultPartitions, topicDefaultReplicationFactor) +////// createTopicsIfNeeded(zkUtils, metricsTopics, 1, 1) +//// zkQuorum +//// val zooKeeperClient : ZooKeeperClient = new ZooKeeperClient(zkQuorum, +//// 1000, +//// 1000: Int, +//// 10: Int, +//// , +//// metricGroup: String, +//// metricType: String) ) +//// val kafkaZkClient : KafkaZkClient = new KafkaZkClient +//// val adminZkClient : AdminZkClient = new AdminZkClient[kafkaZkClient] +//// } +// +// +// val kafkaParams = Map[String, Object]( +// ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokerList, +// ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer], +// ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer], +// ConsumerConfig.GROUP_ID_CONFIG -> appName, +// ConsumerConfig.RECONNECT_BACKOFF_MS_CONFIG -> "50", +// ConsumerConfig.RETRY_BACKOFF_MS_CONFIG -> "100", +// ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> kafkaOffset, +// 
ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false", +// ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG -> "30000" +// /*, +// ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG -> "5000"*/ +// ) +// +// +// logger.info(s"starting Kafka direct stream on topics $inputTopics from $kafkaOffset offsets") +// @transient val kafkaStream = KafkaUtils.createDirectStream[Array[Byte], Array[Byte]]( +// ssc, +// PreferConsistent, +// Subscribe[Array[Byte], Array[Byte]](inputTopics, kafkaParams) +// ) +// +// // do the parallel processing +// +// val stream = if (streamContext.getPropertyValue(WINDOW_DURATION).isSet) { +// if (streamContext.getPropertyValue(SLIDE_DURATION).isSet) +// kafkaStream.window( +// Seconds(streamContext.getPropertyValue(WINDOW_DURATION).asLong()), +// Seconds(streamContext.getPropertyValue(SLIDE_DURATION).asLong()) +// ) +// else +// kafkaStream.window(Seconds(streamContext.getPropertyValue(WINDOW_DURATION).asLong())) +// +// } else kafkaStream +// +// +// stream +// .foreachRDD(rdd => { +// +// this.streamContext.getProcessContexts().clear(); +// this.streamContext.getProcessContexts().addAll( +// PipelineConfigurationBroadcastWrapper.getInstance().get(this.streamContext.getIdentifier)) +// +// if (!rdd.isEmpty()) { +// +// +// val offsetRanges = process(rdd) +// // some time later, after outputs have completed +// if (offsetRanges.nonEmpty) { +// // kafkaStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges.get) +// +// +// kafkaStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges.get, new OffsetCommitCallback() { +// def onComplete(m: java.util.Map[TopicPartition, OffsetAndMetadata], e: Exception) { +// if (null != e) { +// logger.error("error commiting offsets", e) +// } +// } +// }) +// +// +// needMetricsReset = true +// } +// else if (needMetricsReset) { +// try { +// +// for (partitionId <- 0 to rdd.getNumPartitions) { +// val pipelineMetricPrefix = streamContext.getIdentifier + "." + +// "partition" + partitionId + "." 
+// val pipelineTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + "Pipeline.processing_time_ms").time() +// +// streamContext.getProcessContexts.foreach(processorContext => { +// UserMetricsSystem.timer(pipelineMetricPrefix + processorContext.getIdentifier + ".processing_time_ms") +// .time() +// .stop() +// +// ProcessorMetrics.resetMetrics(pipelineMetricPrefix + processorContext.getIdentifier + ".") +// }) +// pipelineTimerContext.stop() +// } +// } catch { +// case ex: Throwable => +// logger.error(s"exception : ${ex.toString}") +// None +// } finally { +// needMetricsReset = false +// } +// } +// } +// +// }) +// } catch { +// case ex: Throwable => +// ex.printStackTrace() +// logger.error("something bad happened, please check Kafka or Zookeeper health : {}", ex) +// } +// } +// +// +// /** +// * to be overriden by subclasses +// * +// * @param rdd +// */ +// def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] /** diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamDebugger.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamDebugger.scala index 3af102811..4165e4e8b 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamDebugger.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamDebugger.scala @@ -40,152 +40,152 @@ import org.apache.avro.Schema import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD -import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange} +//import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange} import org.slf4j.LoggerFactory import scala.collection.JavaConversions._ import com.hurence.logisland.stream.StreamProperties._ -class KafkaRecordStreamDebugger extends AbstractKafkaRecordStream { - val logger = LoggerFactory.getLogger(this.getClass.getName) - - - /** - * launch the chain of processing for each partition of the RDD in parallel - * - * @param rdd - */ - override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { - if (!rdd.isEmpty()) { - // Cast the rdd to an interface that lets us get an array of OffsetRange - val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges - - val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString - val outputTopics = streamContext.getPropertyValue(OUTPUT_TOPICS).asString - val errorTopics = streamContext.getPropertyValue(ERROR_TOPICS).asString - val brokerList = streamContext.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString - - - rdd.foreachPartition(partition => { - if (partition.nonEmpty) { - /** - * index to get the correct offset range for the rdd partition we're working on - * This is safe because we haven't shuffled or otherwise disrupted partitioning, - * and the original input rdd partitions were 1:1 with kafka partitions - */ - val partitionId = TaskContext.get.partitionId() - val offsetRange = offsetRanges(TaskContext.get.partitionId) - - 
/** - * create serializers - */ - val deserializer = getSerializer( - streamContext.getPropertyValue(INPUT_SERIALIZER).asString, - streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) - val serializer = getSerializer( - streamContext.getPropertyValue(OUTPUT_SERIALIZER).asString, - streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) - val errorSerializer = getSerializer( - streamContext.getPropertyValue(ERROR_SERIALIZER).asString, - streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) - - /** - * process events by chaining output records - */ - var firstPass = true - var incomingEvents: util.Collection[Record] = Collections.emptyList() - var outgoingEvents: util.Collection[Record] = Collections.emptyList() - val processingMetrics: util.Collection[Record] = new util.ArrayList[Record]() - logger.info("start processing") - - streamContext.getProcessContexts.foreach(processorContext => { - val startTime = System.currentTimeMillis() - val processor = processorContext.getProcessor - - - if (firstPass) { - /** - * convert incoming Kafka messages into Records - * if there's no serializer we assume that we need to compute a Record from K/V - */ - incomingEvents = if ( - streamContext.getPropertyValue(INPUT_SERIALIZER).asString - == NO_SERIALIZER.getValue) { - // parser - partition.map(rawMessage => { - val key = if (rawMessage.key() != null) new String(rawMessage.key()) else "" - val value = if (rawMessage.value() != null) new String(rawMessage.value()) else "" - RecordUtils.getKeyValueRecord(key, value) - }).toList - } else { - // processor - deserializeRecords(partition, deserializer) - } - - firstPass = false - } else { - incomingEvents = outgoingEvents - } - - /** - * process incoming events - */ - outgoingEvents = processor.process(processorContext, incomingEvents) - - - }) - - - /** - * Do we make records compliant with a given Avro schema ? 
- */ - if (streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).isSet) { - try { - val strSchema = streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString() - val schema = RecordSchemaUtil.compileSchema(strSchema) - - - outgoingEvents = outgoingEvents.map(record => RecordSchemaUtil.convertToValidRecord(record, schema)) - } catch { - case t: Throwable => - logger.warn("something wrong while converting records " + - "to valid accordingly to provide Avro schema " + t.getMessage) - } - - } - - - logger.info("sending to kafka") - - /** - * push outgoing events and errors to Kafka - */ - kafkaSink.value.produce( - streamContext.getPropertyValue(OUTPUT_TOPICS).asString, - outgoingEvents.toList, - serializer - ) - - kafkaSink.value.produce( - streamContext.getPropertyValue(ERROR_TOPICS).asString, - outgoingEvents.filter(r => r.hasField(FieldDictionary.RECORD_ERRORS)).toList, - errorSerializer - ) - - logger.info("saving offsets") - - /** - * save latest offset to Zookeeper - */ - // zkSink.value.saveOffsetRangesToZookeeper(appName, offsetRange) - logger.info("processed " + outgoingEvents.size() + " messages") - } - }) - - return Some(offsetRanges) - } - None - } -} +//class KafkaRecordStreamDebugger extends AbstractKafkaRecordStream { +// val logger = LoggerFactory.getLogger(this.getClass.getName) +// +// +// /** +// * launch the chain of processing for each partition of the RDD in parallel +// * +// * @param rdd +// */ +// override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { +// if (!rdd.isEmpty()) { +// // Cast the rdd to an interface that lets us get an array of OffsetRange +// val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges +// +// val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString +// val outputTopics = streamContext.getPropertyValue(OUTPUT_TOPICS).asString +// val errorTopics = streamContext.getPropertyValue(ERROR_TOPICS).asString +// val brokerList = streamContext.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString +// +// +// rdd.foreachPartition(partition => { +// if (partition.nonEmpty) { +// /** +// * index to get the correct offset range for the rdd partition we're working on +// * This is safe because we haven't shuffled or otherwise disrupted partitioning, +// * and the original input rdd partitions were 1:1 with kafka partitions +// */ +// val partitionId = TaskContext.get.partitionId() +// val offsetRange = offsetRanges(TaskContext.get.partitionId) +// +// /** +// * create serializers +// */ +// val deserializer = getSerializer( +// streamContext.getPropertyValue(INPUT_SERIALIZER).asString, +// streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) +// val serializer = getSerializer( +// streamContext.getPropertyValue(OUTPUT_SERIALIZER).asString, +// streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) +// val errorSerializer = getSerializer( +// streamContext.getPropertyValue(ERROR_SERIALIZER).asString, +// streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) +// +// /** +// * process events by chaining output records +// */ +// var firstPass = true +// var incomingEvents: util.Collection[Record] = Collections.emptyList() +// var outgoingEvents: util.Collection[Record] = Collections.emptyList() +// val processingMetrics: util.Collection[Record] = new util.ArrayList[Record]() +// logger.info("start processing") +// +// streamContext.getProcessContexts.foreach(processorContext => { +// val startTime = System.currentTimeMillis() +// val processor = 
processorContext.getProcessor +// +// +// if (firstPass) { +// /** +// * convert incoming Kafka messages into Records +// * if there's no serializer we assume that we need to compute a Record from K/V +// */ +// incomingEvents = if ( +// streamContext.getPropertyValue(INPUT_SERIALIZER).asString +// == NO_SERIALIZER.getValue) { +// // parser +// partition.map(rawMessage => { +// val key = if (rawMessage.key() != null) new String(rawMessage.key()) else "" +// val value = if (rawMessage.value() != null) new String(rawMessage.value()) else "" +// RecordUtils.getKeyValueRecord(key, value) +// }).toList +// } else { +// // processor +// deserializeRecords(partition, deserializer) +// } +// +// firstPass = false +// } else { +// incomingEvents = outgoingEvents +// } +// +// /** +// * process incoming events +// */ +// outgoingEvents = processor.process(processorContext, incomingEvents) +// +// +// }) +// +// +// /** +// * Do we make records compliant with a given Avro schema ? +// */ +// if (streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).isSet) { +// try { +// val strSchema = streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString() +// val schema = RecordSchemaUtil.compileSchema(strSchema) +// +// +// outgoingEvents = outgoingEvents.map(record => RecordSchemaUtil.convertToValidRecord(record, schema)) +// } catch { +// case t: Throwable => +// logger.warn("something wrong while converting records " + +// "to valid accordingly to provide Avro schema " + t.getMessage) +// } +// +// } +// +// +// logger.info("sending to kafka") +// +// /** +// * push outgoing events and errors to Kafka +// */ +// kafkaSink.value.produce( +// streamContext.getPropertyValue(OUTPUT_TOPICS).asString, +// outgoingEvents.toList, +// serializer +// ) +// +// kafkaSink.value.produce( +// streamContext.getPropertyValue(ERROR_TOPICS).asString, +// outgoingEvents.filter(r => r.hasField(FieldDictionary.RECORD_ERRORS)).toList, +// errorSerializer +// ) +// +// logger.info("saving offsets") +// +// /** +// * save latest offset to Zookeeper +// */ +// // zkSink.value.saveOffsetRangesToZookeeper(appName, offsetRange) +// logger.info("processed " + outgoingEvents.size() + " messages") +// } +// }) +// +// return Some(offsetRanges) +// } +// None +// } +//} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamHDFSBurner.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamHDFSBurner.scala index 3cf4fdc3a..832639e1e 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamHDFSBurner.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamHDFSBurner.scala @@ -57,173 +57,173 @@ import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.spark.rdd.RDD import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession} -import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange} +//import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange} import org.slf4j.LoggerFactory -class KafkaRecordStreamHDFSBurner 
extends AbstractKafkaRecordStream { - - - private val logger = LoggerFactory.getLogger(classOf[KafkaRecordStreamHDFSBurner]) - - - override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { - val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] - - descriptors.addAll(super.getSupportedPropertyDescriptors()) - - descriptors.add(OUTPUT_FOLDER_PATH) - descriptors.add(OUTPUT_FORMAT) - descriptors.add(RECORD_TYPE) - descriptors.add(NUM_PARTITIONS) - descriptors.add(EXCLUDE_ERRORS) - descriptors.add(DATE_FORMAT) - descriptors.add(INPUT_FORMAT) - Collections.unmodifiableList(descriptors) - } - - private def sanitizeSchema(dataType: DataType): DataType = { - dataType match { - case structType: StructType => - DataTypes.createStructType(structType.fields.map(f => - DataTypes.createStructField(f.name.replaceAll("[:,-]", "_"), sanitizeSchema(f.dataType), f.nullable, f.metadata) - )) - case arrayType: ArrayType => - DataTypes.createArrayType(sanitizeSchema(arrayType.elementType), arrayType.containsNull) - case mapType: MapType => - DataTypes.createMapType(sanitizeSchema(mapType.keyType), sanitizeSchema(mapType.valueType), mapType.valueContainsNull) - case other => other - } - - - } - - override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { - if (!rdd.isEmpty()) { - // Cast the rdd to an interface that lets us get an array of OffsetRange - val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges - - // Get the singleton instance of SQLContext - val sqlContext = SparkSession - .builder() - .appName(appName) - .config(ssc.sparkContext.getConf) - .getOrCreate() - - - // this is used to implicitly convert an RDD to a DataFrame. - - val deserializer = getSerializer( - streamContext.getPropertyValue(INPUT_SERIALIZER).asString, - streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) - - - val records = rdd.mapPartitions(p => deserializeRecords(p, deserializer).iterator) - - - if (!records.isEmpty()) { - - - val sdf = new SimpleDateFormat(streamContext.getPropertyValue(DATE_FORMAT).asString) - - - val numPartitions = streamContext.getPropertyValue(NUM_PARTITIONS).asInteger() - val outputFormat = streamContext.getPropertyValue(OUTPUT_FORMAT).asString() - val doExcludeErrors = streamContext.getPropertyValue(EXCLUDE_ERRORS).asBoolean() - val recordType = streamContext.getPropertyValue(RECORD_TYPE).asString() - val outPath = streamContext.getPropertyValue(OUTPUT_FOLDER_PATH).asString() - - val records = rdd.mapPartitions(p => deserializeRecords(p, deserializer).iterator) - .filter(r => - r.hasField(FieldDictionary.RECORD_TYPE) && - r.getField(FieldDictionary.RECORD_TYPE).asString() == recordType) - .map(r => { - try { - if (r.hasField(FieldDictionary.RECORD_DAYTIME)) - r - else - r.setField(FieldDictionary.RECORD_DAYTIME, FieldType.STRING, sdf.format(r.getTime)) - } - catch { - case ex: Throwable => r - } - }) - - - if (!records.isEmpty()) { - var df: DataFrame = null; - val inputFormat = streamContext.getPropertyValue(INPUT_FORMAT).asString() - if (inputFormat.isEmpty) { - - val schema = SparkUtils.convertFieldsNameToSchema(records.take(1)(0)) - val rows = if (doExcludeErrors) { - records - .filter(r => !r.hasField(FieldDictionary.RECORD_ERRORS)) - .map(r => SparkUtils.convertToRow(r, schema)) - } else { - records.map(r => SparkUtils.convertToRow(r, schema)) - } - - - logger.info(schema.toString()) - df = sqlContext.createDataFrame(rows, schema) - } else { - if ("json".equals(inputFormat)) { - 
import sqlContext.implicits._ - val rdf = records.map(record => (record.getType, record.getField(FieldDictionary.RECORD_DAYTIME).asString)) - .toDF(FieldDictionary.RECORD_TYPE, FieldDictionary.RECORD_DAYTIME) - val json = sqlContext.read.json(records.map(record => record.getField(FieldDictionary.RECORD_VALUE).asString())) - val merged = rdf.rdd.zip(json.rdd) - .map { - case (rowLeft, rowRight) => Row.fromSeq(rowLeft.toSeq ++ rowRight.toSeq) - } - df = sqlContext.createDataFrame(merged, StructType(rdf.schema.fields ++ sanitizeSchema(json.schema).asInstanceOf[StructType].fields)) - } else { - throw new IllegalArgumentException(s"Input format $inputFormat is not supported") - } - } - - outputFormat match { - case FILE_FORMAT_PARQUET => - df.repartition(numPartitions) - .write - .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) - .mode(SaveMode.Append) - .parquet(outPath) - case FILE_FORMAT_JSON => - df.repartition(numPartitions) - .write - .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) - .mode(SaveMode.Append) - .json(outPath) - case FILE_FORMAT_ORC => - df.repartition(numPartitions) - .write - .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) - .mode(SaveMode.Append) - .orc(outPath) - case FILE_FORMAT_TXT => - df.repartition(numPartitions) - .write - .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) - .mode(SaveMode.Append) - .text(outPath) - case _ => - throw new IllegalArgumentException(s"$outputFormat not supported yet") - } - - /** - * save latest offset to Zookeeper - */ - // offsetRanges.foreach(offsetRange => zkSink.value.saveOffsetRangesToZookeeper(appName, offsetRange)) - } - - } - - return Some(offsetRanges) - } - None - } -} +//class KafkaRecordStreamHDFSBurner extends AbstractKafkaRecordStream { +// +// +// private val logger = LoggerFactory.getLogger(classOf[KafkaRecordStreamHDFSBurner]) +// +// +// override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { +// val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] +// +// descriptors.addAll(super.getSupportedPropertyDescriptors()) +// +// descriptors.add(OUTPUT_FOLDER_PATH) +// descriptors.add(OUTPUT_FORMAT) +// descriptors.add(RECORD_TYPE) +// descriptors.add(NUM_PARTITIONS) +// descriptors.add(EXCLUDE_ERRORS) +// descriptors.add(DATE_FORMAT) +// descriptors.add(INPUT_FORMAT) +// Collections.unmodifiableList(descriptors) +// } +// +// private def sanitizeSchema(dataType: DataType): DataType = { +// dataType match { +// case structType: StructType => +// DataTypes.createStructType(structType.fields.map(f => +// DataTypes.createStructField(f.name.replaceAll("[:,-]", "_"), sanitizeSchema(f.dataType), f.nullable, f.metadata) +// )) +// case arrayType: ArrayType => +// DataTypes.createArrayType(sanitizeSchema(arrayType.elementType), arrayType.containsNull) +// case mapType: MapType => +// DataTypes.createMapType(sanitizeSchema(mapType.keyType), sanitizeSchema(mapType.valueType), mapType.valueContainsNull) +// case other => other +// } +// +// +// } +// +// override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { +// if (!rdd.isEmpty()) { +// // Cast the rdd to an interface that lets us get an array of OffsetRange +// val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges +// +// // Get the singleton instance of SQLContext +// val sqlContext = SparkSession +// .builder() +// .appName(appName) +// 
.config(ssc.sparkContext.getConf) +// .getOrCreate() +// +// +// // this is used to implicitly convert an RDD to a DataFrame. +// +// val deserializer = getSerializer( +// streamContext.getPropertyValue(INPUT_SERIALIZER).asString, +// streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) +// +// +// val records = rdd.mapPartitions(p => deserializeRecords(p, deserializer).iterator) +// +// +// if (!records.isEmpty()) { +// +// +// val sdf = new SimpleDateFormat(streamContext.getPropertyValue(DATE_FORMAT).asString) +// +// +// val numPartitions = streamContext.getPropertyValue(NUM_PARTITIONS).asInteger() +// val outputFormat = streamContext.getPropertyValue(OUTPUT_FORMAT).asString() +// val doExcludeErrors = streamContext.getPropertyValue(EXCLUDE_ERRORS).asBoolean() +// val recordType = streamContext.getPropertyValue(RECORD_TYPE).asString() +// val outPath = streamContext.getPropertyValue(OUTPUT_FOLDER_PATH).asString() +// +// val records = rdd.mapPartitions(p => deserializeRecords(p, deserializer).iterator) +// .filter(r => +// r.hasField(FieldDictionary.RECORD_TYPE) && +// r.getField(FieldDictionary.RECORD_TYPE).asString() == recordType) +// .map(r => { +// try { +// if (r.hasField(FieldDictionary.RECORD_DAYTIME)) +// r +// else +// r.setField(FieldDictionary.RECORD_DAYTIME, FieldType.STRING, sdf.format(r.getTime)) +// } +// catch { +// case ex: Throwable => r +// } +// }) +// +// +// if (!records.isEmpty()) { +// var df: DataFrame = null; +// val inputFormat = streamContext.getPropertyValue(INPUT_FORMAT).asString() +// if (inputFormat.isEmpty) { +// +// val schema = SparkUtils.convertFieldsNameToSchema(records.take(1)(0)) +// val rows = if (doExcludeErrors) { +// records +// .filter(r => !r.hasField(FieldDictionary.RECORD_ERRORS)) +// .map(r => SparkUtils.convertToRow(r, schema)) +// } else { +// records.map(r => SparkUtils.convertToRow(r, schema)) +// } +// +// +// logger.info(schema.toString()) +// df = sqlContext.createDataFrame(rows, schema) +// } else { +// if ("json".equals(inputFormat)) { +// import sqlContext.implicits._ +// val rdf = records.map(record => (record.getType, record.getField(FieldDictionary.RECORD_DAYTIME).asString)) +// .toDF(FieldDictionary.RECORD_TYPE, FieldDictionary.RECORD_DAYTIME) +// val json = sqlContext.read.json(records.map(record => record.getField(FieldDictionary.RECORD_VALUE).asString())) +// val merged = rdf.rdd.zip(json.rdd) +// .map { +// case (rowLeft, rowRight) => Row.fromSeq(rowLeft.toSeq ++ rowRight.toSeq) +// } +// df = sqlContext.createDataFrame(merged, StructType(rdf.schema.fields ++ sanitizeSchema(json.schema).asInstanceOf[StructType].fields)) +// } else { +// throw new IllegalArgumentException(s"Input format $inputFormat is not supported") +// } +// } +// +// outputFormat match { +// case FILE_FORMAT_PARQUET => +// df.repartition(numPartitions) +// .write +// .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) +// .mode(SaveMode.Append) +// .parquet(outPath) +// case FILE_FORMAT_JSON => +// df.repartition(numPartitions) +// .write +// .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) +// .mode(SaveMode.Append) +// .json(outPath) +// case FILE_FORMAT_ORC => +// df.repartition(numPartitions) +// .write +// .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) +// .mode(SaveMode.Append) +// .orc(outPath) +// case FILE_FORMAT_TXT => +// df.repartition(numPartitions) +// .write +// .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) +// 
.mode(SaveMode.Append) +// .text(outPath) +// case _ => +// throw new IllegalArgumentException(s"$outputFormat not supported yet") +// } +// +// /** +// * save latest offset to Zookeeper +// */ +// // offsetRanges.foreach(offsetRange => zkSink.value.saveOffsetRangesToZookeeper(appName, offsetRange)) +// } +// +// } +// +// return Some(offsetRanges) +// } +// None +// } +//} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamParallelProcessing.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamParallelProcessing.scala index 4fef443f3..7cede3283 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamParallelProcessing.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamParallelProcessing.scala @@ -43,184 +43,184 @@ import org.apache.kafka.common.errors.OffsetOutOfRangeException import org.apache.spark.TaskContext import org.apache.spark.groupon.metrics.{SparkMeter, UserMetricsSystem} import org.apache.spark.rdd.RDD -import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, OffsetRange} +//import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, OffsetRange} import org.slf4j.LoggerFactory import scala.collection.JavaConversions._ import com.hurence.logisland.stream.StreamProperties._ -class KafkaRecordStreamParallelProcessing extends AbstractKafkaRecordStream { - val logger = LoggerFactory.getLogger(this.getClass) - - override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { - val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] - - descriptors.addAll(super.getSupportedPropertyDescriptors()) - Collections.unmodifiableList(descriptors) - } - - /** - * launch the chain of processing for each partition of the RDD in parallel - * - * @param rdd - */ - override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { - if (!rdd.isEmpty()) { - // Cast the rdd to an interface that lets us get an array of OffsetRange - val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges - - rdd.foreachPartition(partition => { - try { - if (partition.nonEmpty) { - /** - * index to get the correct offset range for the rdd partition we're working on - * This is safe because we haven't shuffled or otherwise disrupted partitioning, - * and the original input rdd partitions were 1:1 with kafka partitions - */ - val partitionId = TaskContext.get.partitionId() - val offsetRange = offsetRanges(TaskContext.get.partitionId) - - val pipelineMetricPrefix = streamContext.getIdentifier + "." + - "partition" + partitionId + "." 
- val pipelineTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + "Pipeline.processing_time_ms" ).time() - - - /** - * create serializers - */ - val deserializer = getSerializer( - streamContext.getPropertyValue(INPUT_SERIALIZER).asString, - streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) - val serializer = getSerializer( - streamContext.getPropertyValue(OUTPUT_SERIALIZER).asString, - streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) - val errorSerializer = getSerializer( - streamContext.getPropertyValue(ERROR_SERIALIZER).asString, - streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) - - /** - * process events by chaining output records - */ - var firstPass = true - var incomingEvents: util.Collection[Record] = Collections.emptyList() - var outgoingEvents: util.Collection[Record] = Collections.emptyList() - - streamContext.getProcessContexts.foreach(processorContext => { - val startTime = System.currentTimeMillis() - val processor = processorContext.getProcessor - - val processorTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + - processorContext.getIdentifier + ".processing_time_ms").time() - /** - * convert incoming Kafka messages into Records - * if there's no serializer we assume that we need to compute a Record from K/V - */ - if (firstPass) { - incomingEvents = if ( - streamContext.getPropertyValue(INPUT_SERIALIZER).asString - == NO_SERIALIZER.getValue) { - // parser - partition.map(rawMessage => { - val key = if (rawMessage.key() != null) new String(rawMessage.key()) else "" - val value = if (rawMessage.value() != null) new String(rawMessage.value()) else "" - RecordUtils.getKeyValueRecord(key, value) - }).toList - } else { - // processor - deserializeRecords(partition, deserializer) - } - - firstPass = false - } else { - incomingEvents = outgoingEvents - } - - /** - * process incoming events - */ - if (processor.hasControllerService) { - val controllerServiceLookup = controllerServiceLookupSink.value.getControllerServiceLookup() - processorContext.setControllerServiceLookup(controllerServiceLookup) - } - - if (!processor.isInitialized) { - processor.init(processorContext) - } - - outgoingEvents = processor.process(processorContext, incomingEvents) - - /** - * compute metrics - */ - ProcessorMetrics.computeMetrics( - pipelineMetricPrefix + processorContext.getIdentifier + ".", - incomingEvents, - outgoingEvents, - offsetRange.fromOffset, - offsetRange.untilOffset, - System.currentTimeMillis() - startTime) - - processorTimerContext.stop() - }) - - - /** - * Do we make records compliant with a given Avro schema ? 
- */ - if (streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).isSet) { - try { - val strSchema = streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString() - val schema = RecordSchemaUtil.compileSchema(strSchema) - - outgoingEvents = outgoingEvents.map(record => RecordSchemaUtil.convertToValidRecord(record, schema)) - } catch { - case t: Throwable => - logger.warn("something wrong while converting records " + - "to valid accordingly to provide Avro schema " + t.getMessage) - } - - } - - /** - * push outgoing events and errors to Kafka - */ - kafkaSink.value.produce( - streamContext.getPropertyValue(OUTPUT_TOPICS).asString, - outgoingEvents.toList, - serializer - ) - - kafkaSink.value.produce( - streamContext.getPropertyValue(ERROR_TOPICS).asString, - outgoingEvents.filter(r => r.hasField(FieldDictionary.RECORD_ERRORS)).toList, - errorSerializer - ) - - pipelineTimerContext.stop() - } - } - catch { - case ex: OffsetOutOfRangeException => - val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString - val brokerList = streamContext.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString - /* val latestOffsetsString = zkSink.value.loadOffsetRangesFromZookeeper( - brokerList, - appName, - inputTopics.split(",").toSet) - .map(t => s"${t._1.topic}_${t._1.partition}:${t._2}") - .mkString(", ") - val offestsString = offsetRanges - .map(o => s"${o.topic}_${o.partition}:${o.fromOffset}/${o.untilOffset}") - .mkString(", ") - logger.error(s"unable to process partition. current Offsets $offestsString latest offsets $latestOffsetsString")*/ - logger.error(s"exception : ${ex.toString}") - - } - }) - Some(offsetRanges) - } - else None - } -} +//class KafkaRecordStreamParallelProcessing extends AbstractKafkaRecordStream { +// val logger = LoggerFactory.getLogger(this.getClass) +// +// override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { +// val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] +// +// descriptors.addAll(super.getSupportedPropertyDescriptors()) +// Collections.unmodifiableList(descriptors) +// } +// +// /** +// * launch the chain of processing for each partition of the RDD in parallel +// * +// * @param rdd +// */ +// override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { +// if (!rdd.isEmpty()) { +// // Cast the rdd to an interface that lets us get an array of OffsetRange +// val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges +// +// rdd.foreachPartition(partition => { +// try { +// if (partition.nonEmpty) { +// /** +// * index to get the correct offset range for the rdd partition we're working on +// * This is safe because we haven't shuffled or otherwise disrupted partitioning, +// * and the original input rdd partitions were 1:1 with kafka partitions +// */ +// val partitionId = TaskContext.get.partitionId() +// val offsetRange = offsetRanges(TaskContext.get.partitionId) +// +// val pipelineMetricPrefix = streamContext.getIdentifier + "." + +// "partition" + partitionId + "." 
+// val pipelineTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + "Pipeline.processing_time_ms" ).time() +// +// +// /** +// * create serializers +// */ +// val deserializer = getSerializer( +// streamContext.getPropertyValue(INPUT_SERIALIZER).asString, +// streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) +// val serializer = getSerializer( +// streamContext.getPropertyValue(OUTPUT_SERIALIZER).asString, +// streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) +// val errorSerializer = getSerializer( +// streamContext.getPropertyValue(ERROR_SERIALIZER).asString, +// streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) +// +// /** +// * process events by chaining output records +// */ +// var firstPass = true +// var incomingEvents: util.Collection[Record] = Collections.emptyList() +// var outgoingEvents: util.Collection[Record] = Collections.emptyList() +// +// streamContext.getProcessContexts.foreach(processorContext => { +// val startTime = System.currentTimeMillis() +// val processor = processorContext.getProcessor +// +// val processorTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + +// processorContext.getIdentifier + ".processing_time_ms").time() +// /** +// * convert incoming Kafka messages into Records +// * if there's no serializer we assume that we need to compute a Record from K/V +// */ +// if (firstPass) { +// incomingEvents = if ( +// streamContext.getPropertyValue(INPUT_SERIALIZER).asString +// == NO_SERIALIZER.getValue) { +// // parser +// partition.map(rawMessage => { +// val key = if (rawMessage.key() != null) new String(rawMessage.key()) else "" +// val value = if (rawMessage.value() != null) new String(rawMessage.value()) else "" +// RecordUtils.getKeyValueRecord(key, value) +// }).toList +// } else { +// // processor +// deserializeRecords(partition, deserializer) +// } +// +// firstPass = false +// } else { +// incomingEvents = outgoingEvents +// } +// +// /** +// * process incoming events +// */ +// if (processor.hasControllerService) { +// val controllerServiceLookup = controllerServiceLookupSink.value.getControllerServiceLookup() +// processorContext.setControllerServiceLookup(controllerServiceLookup) +// } +// +// if (!processor.isInitialized) { +// processor.init(processorContext) +// } +// +// outgoingEvents = processor.process(processorContext, incomingEvents) +// +// /** +// * compute metrics +// */ +// ProcessorMetrics.computeMetrics( +// pipelineMetricPrefix + processorContext.getIdentifier + ".", +// incomingEvents, +// outgoingEvents, +// offsetRange.fromOffset, +// offsetRange.untilOffset, +// System.currentTimeMillis() - startTime) +// +// processorTimerContext.stop() +// }) +// +// +// /** +// * Do we make records compliant with a given Avro schema ? 
+// */ +// if (streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).isSet) { +// try { +// val strSchema = streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString() +// val schema = RecordSchemaUtil.compileSchema(strSchema) +// +// outgoingEvents = outgoingEvents.map(record => RecordSchemaUtil.convertToValidRecord(record, schema)) +// } catch { +// case t: Throwable => +// logger.warn("something wrong while converting records " + +// "to valid accordingly to provide Avro schema " + t.getMessage) +// } +// +// } +// +// /** +// * push outgoing events and errors to Kafka +// */ +// kafkaSink.value.produce( +// streamContext.getPropertyValue(OUTPUT_TOPICS).asString, +// outgoingEvents.toList, +// serializer +// ) +// +// kafkaSink.value.produce( +// streamContext.getPropertyValue(ERROR_TOPICS).asString, +// outgoingEvents.filter(r => r.hasField(FieldDictionary.RECORD_ERRORS)).toList, +// errorSerializer +// ) +// +// pipelineTimerContext.stop() +// } +// } +// catch { +// case ex: OffsetOutOfRangeException => +// val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString +// val brokerList = streamContext.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString +// /* val latestOffsetsString = zkSink.value.loadOffsetRangesFromZookeeper( +// brokerList, +// appName, +// inputTopics.split(",").toSet) +// .map(t => s"${t._1.topic}_${t._1.partition}:${t._2}") +// .mkString(", ") +// val offestsString = offsetRanges +// .map(o => s"${o.topic}_${o.partition}:${o.fromOffset}/${o.untilOffset}") +// .mkString(", ") +// logger.error(s"unable to process partition. current Offsets $offestsString latest offsets $latestOffsetsString")*/ +// logger.error(s"exception : ${ex.toString}") +// +// } +// }) +// Some(offsetRanges) +// } +// else None +// } +//} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamSQLAggregator.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamSQLAggregator.scala index bdc723ede..64ce57caa 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamSQLAggregator.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamSQLAggregator.scala @@ -1,18 +1,3 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ /** * Copyright (C) 2016 Hurence (bailet.thomas@gmail.com) * @@ -42,119 +27,119 @@ import org.apache.avro.Schema import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession -import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange} +//import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange} import org.slf4j.LoggerFactory import scala.collection.JavaConversions._ import com.hurence.logisland.stream.StreamProperties._ -@Tags(Array("stream", "SQL", "query", "record")) -@CapabilityDescription("This is a stream capable of SQL query interpretations.") -class KafkaRecordStreamSQLAggregator extends AbstractKafkaRecordStream { - - private val logger = LoggerFactory.getLogger(classOf[KafkaRecordStreamSQLAggregator]) - - - override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { - val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] - descriptors.addAll(super.getSupportedPropertyDescriptors()) - descriptors.add(MAX_RESULTS_COUNT) - descriptors.add(SQL_QUERY) - descriptors.add(OUTPUT_RECORD_TYPE) - Collections.unmodifiableList(descriptors) - } - - override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { - if (!rdd.isEmpty()) { - // Cast the rdd to an interface that lets us get an array of OffsetRange - // val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges - - val sqlContext = SparkSession - .builder() - .appName(appName) - .config(ssc.sparkContext.getConf) - .getOrCreate() - - // this is used to implicitly convert an RDD to a DataFrame. - @transient lazy val deserializer = getSerializer( - streamContext.getPropertyValue(INPUT_SERIALIZER).asString, - streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) - - val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString - - //here how to handle elements that are not successfully deserialized ??? - //currently we lose them ! - //I think we should create an ErrorRecord containing key, value. 
- val records: RDD[Record] = rdd.mapPartitions(p => deserializeRecords(p, deserializer).iterator) - - /** - * get a Dataframe schema (either from an Avro schema or from the first record) - */ - val schema = try { - val parser = new Schema.Parser - val schema = parser.parse(streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) - SparkUtils.convertAvroSchemaToDataframeSchema(schema) - } - catch { - case e: Exception => - logger.error("unable to add schema :{}", e.getMessage) - SparkUtils.convertFieldsNameToSchema(records.take(1)(0)) - } - - if (!records.isEmpty()) { - - val rows = records.filter(r => !r.hasField(FieldDictionary.RECORD_ERRORS)) - .map(r => SparkUtils.convertToRow(r, schema)) - - - sqlContext.createDataFrame(rows, schema).createOrReplaceTempView(inputTopics) - - - - - val query = streamContext.getPropertyValue(SQL_QUERY).asString() - val outputRecordType = streamContext.getPropertyValue(OUTPUT_RECORD_TYPE).asString() - - sqlContext.sql(query).rdd - .foreachPartition(rows => { - val outgoingEvents = rows.map(row => SparkUtils.convertToRecord(row, outputRecordType)).toList - /** - * create serializers - */ - val serializer = getSerializer( - streamContext.getPropertyValue(OUTPUT_SERIALIZER).asString, - streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) - val errorSerializer = getSerializer( - streamContext.getPropertyValue(ERROR_SERIALIZER).asString, - streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) - - - - - /** - * push outgoing events and errors to Kafka - */ - kafkaSink.value.produce( - streamContext.getPropertyValue(OUTPUT_TOPICS).asString, - outgoingEvents, - serializer - ) - - kafkaSink.value.produce( - streamContext.getPropertyValue(ERROR_TOPICS).asString, - outgoingEvents.filter(r => r.hasField(FieldDictionary.RECORD_ERRORS)), - errorSerializer - ) - - }) - - - } - return None //Some(offsetRanges) - } - None - } -} +//@Tags(Array("stream", "SQL", "query", "record")) +//@CapabilityDescription("This is a stream capable of SQL query interpretations.") +//class KafkaRecordStreamSQLAggregator extends AbstractKafkaRecordStream { +// +// private val logger = LoggerFactory.getLogger(classOf[KafkaRecordStreamSQLAggregator]) +// +// +// override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { +// val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] +// descriptors.addAll(super.getSupportedPropertyDescriptors()) +// descriptors.add(MAX_RESULTS_COUNT) +// descriptors.add(SQL_QUERY) +// descriptors.add(OUTPUT_RECORD_TYPE) +// Collections.unmodifiableList(descriptors) +// } +// +// override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { +// if (!rdd.isEmpty()) { +// // Cast the rdd to an interface that lets us get an array of OffsetRange +// // val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges +// +// val sqlContext = SparkSession +// .builder() +// .appName(appName) +// .config(ssc.sparkContext.getConf) +// .getOrCreate() +// +// // this is used to implicitly convert an RDD to a DataFrame. +// @transient lazy val deserializer = getSerializer( +// streamContext.getPropertyValue(INPUT_SERIALIZER).asString, +// streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) +// +// val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString +// +// //here how to handle elements that are not successfully deserialized ??? +// //currently we lose them ! +// //I think we should create an ErrorRecord containing key, value. 
+// val records: RDD[Record] = rdd.mapPartitions(p => deserializeRecords(p, deserializer).iterator) +// +// /** +// * get a Dataframe schema (either from an Avro schema or from the first record) +// */ +// val schema = try { +// val parser = new Schema.Parser +// val schema = parser.parse(streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) +// SparkUtils.convertAvroSchemaToDataframeSchema(schema) +// } +// catch { +// case e: Exception => +// logger.error("unable to add schema :{}", e.getMessage) +// SparkUtils.convertFieldsNameToSchema(records.take(1)(0)) +// } +// +// if (!records.isEmpty()) { +// +// val rows = records.filter(r => !r.hasField(FieldDictionary.RECORD_ERRORS)) +// .map(r => SparkUtils.convertToRow(r, schema)) +// +// +// sqlContext.createDataFrame(rows, schema).createOrReplaceTempView(inputTopics) +// +// +// +// +// val query = streamContext.getPropertyValue(SQL_QUERY).asString() +// val outputRecordType = streamContext.getPropertyValue(OUTPUT_RECORD_TYPE).asString() +// +// sqlContext.sql(query).rdd +// .foreachPartition(rows => { +// val outgoingEvents = rows.map(row => SparkUtils.convertToRecord(row, outputRecordType)).toList +// /** +// * create serializers +// */ +// val serializer = getSerializer( +// streamContext.getPropertyValue(OUTPUT_SERIALIZER).asString, +// streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) +// val errorSerializer = getSerializer( +// streamContext.getPropertyValue(ERROR_SERIALIZER).asString, +// streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) +// +// +// +// +// /** +// * push outgoing events and errors to Kafka +// */ +// kafkaSink.value.produce( +// streamContext.getPropertyValue(OUTPUT_TOPICS).asString, +// outgoingEvents, +// serializer +// ) +// +// kafkaSink.value.produce( +// streamContext.getPropertyValue(ERROR_TOPICS).asString, +// outgoingEvents.filter(r => r.hasField(FieldDictionary.RECORD_ERRORS)), +// errorSerializer +// ) +// +// }) +// +// +// } +// return None //Some(offsetRanges) +// } +// None +// } +//} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/KafkaStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/KafkaStructuredStreamProviderService.scala index f081e4046..fa55cd6d9 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/KafkaStructuredStreamProviderService.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/KafkaStructuredStreamProviderService.scala @@ -38,6 +38,7 @@ import com.hurence.logisland.annotation.lifecycle.OnEnabled import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord} +import com.hurence.logisland.runner.GlobalOptions import com.hurence.logisland.stream.StreamContext import com.hurence.logisland.stream.StreamProperties._ import 
com.hurence.logisland.util.kafka.KafkaSink @@ -249,6 +250,11 @@ class KafkaStructuredStreamProviderService() extends AbstractControllerService w import df.sparkSession.implicits._ + var checkpointLocation = "checkpoints" + if (GlobalOptions.checkpointLocation != null) { + checkpointLocation = GlobalOptions.checkpointLocation + } + // Write key-value data from a DataFrame to a specific Kafka topic specified in an option df .map(r => { (r.getField(FieldDictionary.RECORD_KEY).asString(), r.getField(FieldDictionary.RECORD_VALUE).asBytes()) @@ -259,8 +265,7 @@ class KafkaStructuredStreamProviderService() extends AbstractControllerService w .format("kafka") .option("kafka.bootstrap.servers", brokerList) .option("topic", outputTopics.mkString(",")) - .option("checkpointLocation", "checkpoints") - + .option("checkpointLocation", checkpointLocation) } private def getOrElse[T](record: Record, field: String, defaultValue: T): T = { diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala index 1fb834060..3c6926a13 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala @@ -36,6 +36,7 @@ import java.util.Date import com.hurence.logisland.controller.ControllerService import com.hurence.logisland.record._ +import com.hurence.logisland.runner.GlobalOptions import com.hurence.logisland.serializer.{JsonSerializer, NoopSerializer, RecordSerializer, SerializerProvider} import com.hurence.logisland.stream.StreamContext import com.hurence.logisland.stream.StreamProperties._ @@ -277,10 +278,15 @@ trait StructuredStreamProviderService extends ControllerService { // do the parallel processing val df2 = df.mapPartitions(record => record.map(record => serializeRecords(serializer, keySerializer, record))) + var checkpointLocation = "checkpoints/" + streamContext.getIdentifier + if (GlobalOptions.checkpointLocation != null) { + checkpointLocation = GlobalOptions.checkpointLocation + } + write(df2, controllerServiceLookupSink, streamContext) .queryName(streamContext.getIdentifier) // .outputMode("update") - .option("checkpointLocation", "checkpoints/" + streamContext.getIdentifier) + .option("checkpointLocation", checkpointLocation) .start() // .processAllAvailable() diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/SparkUtils.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/SparkUtils.scala index b6691a424..412ec6262 100644 --- 
a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/SparkUtils.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/SparkUtils.scala @@ -51,6 +51,7 @@ import java.util import java.util.Date import com.hurence.logisland.record._ +import com.typesafe.scalalogging.LazyLogging import com.typesafe.scalalogging.slf4j.LazyLogging import org.apache.avro.Schema import org.apache.avro.Schema.Type diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala index d5ebc9427..e6b74d296 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala @@ -1,33 +1,3 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ /** * Copyright (C) 2016 Hurence (support@hurence.com) * @@ -62,7 +32,6 @@ import org.apache.spark.sql.{Dataset, SQLContext, SparkSession} import org.apache.spark.streaming.StreamingContext import org.slf4j.LoggerFactory - class StructuredStream extends AbstractRecordStream with SparkRecordStream { diff --git a/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/GlobalOptions.java b/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/GlobalOptions.java new file mode 100644 index 000000000..ffd4498c5 --- /dev/null +++ b/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/GlobalOptions.java @@ -0,0 +1,13 @@ +package com.hurence.logisland.runner; + +/** + * Holds command line options + */ +public class GlobalOptions { + + // Databricks mode + public static boolean databricks = false; + + // Path to checkpoint directory + public static String checkpointLocation = null; +} diff --git a/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/StreamProcessingRunner.java b/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/StreamProcessingRunner.java index ca361882a..f98f4ba44 100644 --- a/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/StreamProcessingRunner.java +++ b/logisland-core/logisland-framework/logisland-bootstrap/src/main/java/com/hurence/logisland/runner/StreamProcessingRunner.java @@ -51,6 +51,7 @@ public static void main(String[] args) { Option help = new Option("help", helpMsg); options.addOption(help); + // Configuration file OptionBuilder.withArgName("conf"); OptionBuilder.withLongOpt("config-file"); OptionBuilder.isRequired(); @@ -59,6 +60,7 @@ public static void main(String[] args) { Option conf = OptionBuilder.create("conf"); options.addOption(conf); + // Databricks mode OptionBuilder.withArgName("databricks"); OptionBuilder.withLongOpt("databricks-mode"); OptionBuilder.isRequired(false); @@ -67,6 +69,15 @@ public static void main(String[] args) { Option databricks = OptionBuilder.create("databricks"); options.addOption(databricks); + // Checkpoint directory + OptionBuilder.withArgName("chkploc"); + OptionBuilder.withLongOpt("checkpoint-location"); + OptionBuilder.isRequired(false); + OptionBuilder.hasArg(true); + OptionBuilder.withDescription("Checkpoint location used by some engines"); + Option checkpointLocation = OptionBuilder.create("chkploc"); + options.addOption(checkpointLocation); + Optional engineInstance = Optional.empty(); try { System.out.println(BannerLoader.loadBanner()); @@ -84,10 +95,22 @@ public static void main(String[] args) { sessionConf = ConfigReader.loadConfig(configFile); } else { logger.info("Running in databricks mode"); + GlobalOptions.databricks = true; sessionConf = loadConfigFromSharedFS(configFile); } logger.info("Configuration loaded"); + // Get checkpoint location if any + boolean chkploc = line.hasOption("chkploc"); + + if (databricksMode && !chkploc) { + logger.error("Databricks mode requires checkpoint location to be set"); + System.exit(-1); + } + + GlobalOptions.checkpointLocation = line.getOptionValue("chkploc"); + logger.info("Using checkpoint location: " + GlobalOptions.checkpointLocation); + // instantiate engine and all the processor from the config engineInstance = ComponentFactory.getEngineContext(sessionConf.getEngine()); if (!engineInstance.isPresent()) 
{ From f44321a6bfb3b0cc4ff9ce9215833241a3398d08 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Tue, 11 Feb 2020 14:13:34 +0100 Subject: [PATCH 06/43] Remove building of spark2.4_kafka2.4 engine for the moment --- logisland-assembly/src/assembly/shared-dependencies.xml | 2 +- logisland-core/logisland-engines/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/logisland-assembly/src/assembly/shared-dependencies.xml b/logisland-assembly/src/assembly/shared-dependencies.xml index 9c1b5de08..10737fe3b 100644 --- a/logisland-assembly/src/assembly/shared-dependencies.xml +++ b/logisland-assembly/src/assembly/shared-dependencies.xml @@ -26,7 +26,7 @@ - com.hurence.logisland:logisland-engine-spark_2_4_kafka_2_4 + com.hurence.logisland:logisland-engine-spark_2_1 com.hurence.logisland:logisland-engine-spark_2_3 com.hurence.logisland:logisland-engine-spark_1_6 diff --git a/logisland-core/logisland-engines/pom.xml b/logisland-core/logisland-engines/pom.xml index d6865c420..aafa5e9dd 100644 --- a/logisland-core/logisland-engines/pom.xml +++ b/logisland-core/logisland-engines/pom.xml @@ -15,7 +15,7 @@ - logisland-engine-spark_2_4plus_kafka_2_4plus + logisland-engine-spark_2_X logisland-engine-spark_1_6 logisland-engine-vanilla From bfdf3ec6f04f70325271f74699972b1530c266f6 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Tue, 11 Feb 2020 14:37:26 +0100 Subject: [PATCH 07/43] Added spark 1.4 support in 2.x engine. Also added azure eventhubs spark streaming dependencies --- .../src/assembly/shared-dependencies.xml | 1 + .../logisland-engine-spark_2_4/pom.xml | 217 ++++++++++++++++++ .../logisland/util/spark/Spark24Platform.java | 30 +++ ...hurence.logisland.util.spark.SparkPlatform | 1 + .../logisland-engine-spark_2_common/pom.xml | 7 + .../logisland-engine-spark_2_X/pom.xml | 1 + 6 files changed, 257 insertions(+) create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/src/main/resources/META-INF/services/com.hurence.logisland.util.spark.SparkPlatform diff --git a/logisland-assembly/src/assembly/shared-dependencies.xml b/logisland-assembly/src/assembly/shared-dependencies.xml index 10737fe3b..229a51ab8 100644 --- a/logisland-assembly/src/assembly/shared-dependencies.xml +++ b/logisland-assembly/src/assembly/shared-dependencies.xml @@ -29,6 +29,7 @@ com.hurence.logisland:logisland-engine-spark_2_1 com.hurence.logisland:logisland-engine-spark_2_3 + com.hurence.logisland:logisland-engine-spark_2_4 com.hurence.logisland:logisland-engine-spark_1_6 com.hurence.logisland:logisland-engine-vanilla diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml new file mode 100644 index 000000000..d610d24ef --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml @@ -0,0 +1,217 @@ + + + 4.0.0 + + com.hurence.logisland + logisland-engine-spark_2_X + 1.2.0 + + logisland-engine-spark_2_4 + jar + + + + + 2.11 + 2.4.4 + 0.10.2.1 + 2.11.8 + + + + + + + + + org.apache.kafka + kafka_${scala.binary.version} + ${kafka.version} + true + runtime + + + + 
org.apache.kafka + kafka-clients + ${kafka.version} + true + runtime + + + org.apache.bahir + spark-sql-streaming-mqtt_2.11 + ${spark.version} + runtime + true + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + provided + + + com.google.guava + guava + + + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-mllib_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-streaming-kafka-0-10_${scala.binary.version} + ${spark.version} + runtime + true + + + org.apache.spark + spark-sql-kafka-0-10_${scala.binary.version} + ${spark.version} + runtime + true + + + org.apache.spark + spark-streaming-kafka-assembly_${scala.binary.version} + ${spark.version} + runtime + true + + + + + + + com.hurence.logisland + logisland-engine-spark_2_common + ${project.version} + true + + + org.apache.spark + spark-sql_${scala.binary.version} + provided + + + + org.scala-lang + scala-library + ${scala.version} + provided + true + + + com.banzaicloud + spark-metrics_2.11 + 2.4-1.0.5 + + + io.prometheus + simpleclient + 0.0.23 + + + io.prometheus + simpleclient_dropwizard + 0.0.23 + + + io.prometheus + simpleclient_pushgateway + 0.0.23 + + + + + + + + + org.immutables.tools + maven-shade-plugin + 4 + + + package + + shade + + + + + com.fasterxml.jackson.datatype:jackson-datatype-jsr310 + com.fasterxml.jackson.datatype:jackson-datatype-jdk8 + com.hurence.logisland:logisland-engine-spark_2_common + *:* + + + com.fasterxml.jackson.core:* + com.fasterxml.jackson.databind:* + com.fasterxml.jackson.jaxrs*:* + com.fasterxml.jackson.module:jackson-module-jaxb-annotations + org.scala-lang:* + org.scalatest:* + org.apache.zookeeper:* + com.google.guava:* + org.apache.commons:* + org.slf4j:* + log4j:* + org.yaml:* + org.eclipse.jetty:* + org.glassfish.hk2*:* + org.glassfish.jersey*:* + + + + + *:* + + META-INF/license/** + META-INF/* + META-INF/maven/** + LICENSE + NOTICE + /*.txt + build.properties + + + + + + + + + + + + + + + banzaicloud-github + https://raw.github.com/banzaicloud/spark-metrics/master/maven-repo/releases + + + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java new file mode 100644 index 000000000..4383f2597 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java @@ -0,0 +1,30 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.util.spark; + +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; + +public class Spark24Platform implements SparkPlatform { + @Override + public Dataset createStreamingDataFrame(SQLContext sqlContext, RDD catalystRows, StructType schema) { + return sqlContext.internalCreateDataFrame(catalystRows, schema, true); + } +} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/src/main/resources/META-INF/services/com.hurence.logisland.util.spark.SparkPlatform b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/src/main/resources/META-INF/services/com.hurence.logisland.util.spark.SparkPlatform new file mode 100644 index 000000000..405b9bf4e --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/src/main/resources/META-INF/services/com.hurence.logisland.util.spark.SparkPlatform @@ -0,0 +1 @@ +com.hurence.logisland.util.spark.Spark24Platform \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml index 3927640e6..a8a1cfb1f 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml @@ -23,6 +23,7 @@ http://www.w3.org/2001/XMLSchema-instance "> 0.10.2.1 2.11.8 2.6.6 + 2.3.14.1 @@ -368,6 +369,12 @@ http://www.w3.org/2001/XMLSchema-instance "> 5.1.3.RELEASE + + com.microsoft.azure + azure-eventhubs-spark_${scala.binary.version} + ${eventhubs.version} + + diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_X/pom.xml index d25cadccc..5a5ca030b 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/pom.xml +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/pom.xml @@ -22,6 +22,7 @@ logisland-engine-spark_2_common logisland-engine-spark_2_1 logisland-engine-spark_2_3 + logisland-engine-spark_2_4 From bee69e0a61362b34af0be645f1b5f7581c7b44c4 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Thu, 13 Feb 2020 14:26:24 +0100 Subject: [PATCH 08/43] Saving intermediate step towards new service to support azure event hubs --- .../logisland/util/spark/Spark24Platform.java | 2 +- .../logisland/stream/spark/package.scala | 150 ++++++++- ...tHubsStructuredStreamProviderService.scala | 287 ++++++++++++++++ ...bsStructuredStreamProviderServiceTest.java | 311 ++++++++++++++++++ 4 files changed, 748 insertions(+), 2 deletions(-) create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala create mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderServiceTest.java diff --git 
a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java index 4383f2597..9b1e30c09 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java @@ -1,5 +1,5 @@ /** - * Copyright (C) 2016 Hurence (support@hurence.com) + * Copyright (C) 2020 Hurence (support@hurence.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala index 3109b849c..437ebdedd 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala @@ -33,7 +33,7 @@ package com.hurence.logisland.stream import com.hurence.logisland.component.{AllowableValue, PropertyDescriptor} import com.hurence.logisland.serializer._ import com.hurence.logisland.stream.spark.structured.provider.StructuredStreamProviderService -import com.hurence.logisland.validator.StandardValidators +import com.hurence.logisland.validator.{StandardValidators, ValidationResult, Validator} /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -557,5 +557,153 @@ object StreamProperties { .defaultValue("aggregation") .build + ////////////////////////////////////// + // Azure event hubs options + ////////////////////////////////////// + + val EVENTHUBS_NAMESPACE: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.nameSpace") + .description("EventHubs namespace.") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(true) + .build + + val EVENTHUBS_MAX_EVENTS_PER_TRIGGER: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.maxEventsPerTrigger") + .description("Rate limit on maximum number of events processed per trigger interval. 
The specified total number" + + " of events will be proportionally split across partitions of different volume.") + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .required(false) + .build + + val EVENTHUBS_MAX_OPERATION_TIMEOUT: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.operationTimeout") + .description("The amount of time Event Hub API calls will be retried before throwing an exception.") + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .required(false) + .build + val EVENTHUBS_MAX_THREAD_POOL_SIZE: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.threadPoolSize") + .description("Sets the size of thread pool.") + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .required(false) + .build + + val EVENTHUBS_READ_EVENT_HUB: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.read.eventHub") + .description("EventHub to read from.") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(false) + .build + + val EVENTHUBS_READ_SAS_KEY_NAME: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.read.sasKeyName") + .description("SAS key name for read eventhub.") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(false) + .build + + val EVENTHUBS_READ_SAS_KEY: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.read.sasKey") + .description("SAS key for read eventhub.") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(false) + .build + + val EVENTHUBS_READ_CONSUMER_GROUP: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.read.consumerGroup") + .description("Consumer group name.") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(false) + .build + + val EVENTHUBS_READ_POSITION_START_OF_STREAM = "start-of-stream" + val EVENTHUBS_READ_POSITION_END_OF_STREAM = "end-of-stream" + val EVENTHUBS_READ_POSITION_INSTANT_NOW = "instant-now" + + // Validator for EVENTHUBS_READ_POSITION + val EVENTHUBS_READ_POSITION_VALIDATOR: Validator = new Validator() { + override def validate(subject: String, value: String): ValidationResult = { + + // Accepted values are long, or start-of-stream or end-of-stream + var ok : Boolean = false + // Recognized string? + if ( (value == EVENTHUBS_READ_POSITION_START_OF_STREAM) || (value == EVENTHUBS_READ_POSITION_END_OF_STREAM) + || (value == EVENTHUBS_READ_POSITION_INSTANT_NOW) ) { + ok = true + } + // Long value? + try { + value.toLong + ok = true + } catch { + case e: Exception => // Not a long; + } + new ValidationResult.Builder().subject(subject).input(value).valid(ok) + .explanation(subject + " should be a long or " + EVENTHUBS_READ_POSITION_START_OF_STREAM + " or " + + EVENTHUBS_READ_POSITION_END_OF_STREAM + " or " + EVENTHUBS_READ_POSITION_INSTANT_NOW).build} + } + + val EVENTHUBS_READ_POSITION_TYPE_NAME = "eventhubs.read.positionType" + val EVENTHUBS_READ_POSITION: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.read.position") + .description("Start event position. This may be either " + EVENTHUBS_READ_POSITION_START_OF_STREAM + ", " + + EVENTHUBS_READ_POSITION_END_OF_STREAM + " or a long value. If this is a long value, " + + EVENTHUBS_READ_POSITION_TYPE_NAME + " should be filled to define the meaning of the value. 
Default value is " + + EVENTHUBS_READ_POSITION_END_OF_STREAM) + .addValidator(EVENTHUBS_READ_POSITION_VALIDATOR) + .required(false) + .defaultValue(EVENTHUBS_READ_POSITION_END_OF_STREAM) + .build + + val EVENTHUBS_READ_POSITION_TYPE_OFFSET = "offset" + val EVENTHUBS_READ_POSITION_TYPE_SEQUENCE_NUMBER = "sequenceNumber" + val EVENTHUBS_READ_POSITION_TYPE_EPOCH_MILLIS = "epochMillis" + + val EVENTHUBS_READ_POSITION_TYPE: PropertyDescriptor = new PropertyDescriptor.Builder() + .name(EVENTHUBS_READ_POSITION_TYPE_NAME) + .description("Specifies the type of the " + EVENTHUBS_READ_POSITION.getName + " value when it is a long value. " + + "This can be " + EVENTHUBS_READ_POSITION_TYPE_OFFSET + ", " + EVENTHUBS_READ_POSITION_TYPE_SEQUENCE_NUMBER + + " or " + EVENTHUBS_READ_POSITION_TYPE_EPOCH_MILLIS + ". Default value is " + EVENTHUBS_READ_POSITION_TYPE_OFFSET) + .allowableValues(EVENTHUBS_READ_POSITION_TYPE_OFFSET, EVENTHUBS_READ_POSITION_TYPE_SEQUENCE_NUMBER, + EVENTHUBS_READ_POSITION_TYPE_EPOCH_MILLIS) + .required(false) + .defaultValue(EVENTHUBS_READ_POSITION_TYPE_OFFSET) + .build + + val EVENTHUBS_READ_RECEIVER_TIMEOUT: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.read.receiverTimeout") + .description("The amount of time Event Hub receive calls will be retried before throwing an exception.") + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .required(false) + .build + + val EVENTHUBS_READ_PREFETCH_COUNT: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.read.prefetchCount") + .description("Sets the prefetch count for the underlying receiver and controls how many events are received in advance.") + .addValidator(StandardValidators.INTEGER_VALIDATOR) + .required(false) + .build + + val EVENTHUBS_WRITE_EVENT_HUB: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.write.eventHub") + .description("EventHub to write to.") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(false) + .build + + val EVENTHUBS_WRITE_SAS_KEY_NAME: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.write.sasKeyName") + .description("SAS key name for write eventhub.") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(false) + .build + + val EVENTHUBS_WRITE_SAS_KEY: PropertyDescriptor = new PropertyDescriptor.Builder() + .name("eventhubs.write.sasKey") + .description("SAS key for write eventhub.") + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .required(false) + .build } diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala new file mode 100644 index 000000000..2bc766f93 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala @@ -0,0 +1,287 @@ +/** + * Copyright (C) 2020 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.hurence.logisland.stream.spark.structured.provider + +import java.util +import java.util.Collections + +import com.hurence.logisland.annotation.documentation.CapabilityDescription +import com.hurence.logisland.annotation.lifecycle.OnEnabled +import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} +import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} +import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord} +import com.hurence.logisland.stream.StreamContext +import com.hurence.logisland.stream.StreamProperties._ +import com.hurence.logisland.util.spark.ControllerServiceLookupSink +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.eventhubs.{ConnectionStringBuilder, EventHubsConf} +import org.apache.spark.sql.{Dataset, SparkSession} + +/** + * Service to allow reading/writing from/to azure event hub with structured streams + * Developed using documentation at: + * https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/structured-streaming-eventhubs-integration.md + */ +@CapabilityDescription("Provides a ways to use azure event hubs as input or output in StructuredStream streams") +class AzureEventHubsStructuredStreamProviderService() extends AbstractControllerService with StructuredStreamProviderService { + + var namespace : String = null + var readPositionString: String = null + var readPositionLong: Long = 0L + var readPositionIsString: Boolean = true + var readPositionType : String = null + var readEventHub : String = null + var readSasKeyName : String = null + var readSasKey : String = null + var readConsumerGroup : String = null + var writeEventHub : String = null + var writeSasKeyName : String = null + var writeSasKey : String = null + +// var maxEventPerTrigger : Int = Int.MaxValue +// var maxOperationTimeout : Int = null +// var threadPoolSize : Int = null +// var readReceiverTimeout : Int = null +// var readPrefetchCount : Int = null + + var properties : Map[String, Any] = Map[String, Any]() + + @OnEnabled + @throws[InitializationException] + override def init(context: ControllerServiceInitializationContext): Unit = { + super.init(context) + this.synchronized { + try { + + // namespace + if (!context.getPropertyValue(EVENTHUBS_NAMESPACE).isSet) { + throw new InitializationException("EventHubs service " + EVENTHUBS_NAMESPACE.getName + " not specified.") + } + namespace = context.getPropertyValue(EVENTHUBS_NAMESPACE).asString() + + // readEventHub and writeEventHub + if (!context.getPropertyValue(EVENTHUBS_READ_EVENT_HUB).isSet && + !context.getPropertyValue(EVENTHUBS_WRITE_EVENT_HUB).isSet) { + throw new InitializationException("EventHubs service must at least have a read or write event hub set.") + } + + if (context.getPropertyValue(EVENTHUBS_READ_EVENT_HUB).isSet) { + readEventHub = context.getPropertyValue(EVENTHUBS_READ_EVENT_HUB).asString() + } + + if (context.getPropertyValue(EVENTHUBS_WRITE_EVENT_HUB).isSet) { + writeEventHub = context.getPropertyValue(EVENTHUBS_WRITE_EVENT_HUB).asString() + } + + // 
maxEventPerTrigger + if (context.getPropertyValue(EVENTHUBS_MAX_EVENTS_PER_TRIGGER).isSet) { + properties += (EVENTHUBS_MAX_EVENTS_PER_TRIGGER.getName + -> context.getPropertyValue(EVENTHUBS_MAX_EVENTS_PER_TRIGGER).asInteger().toInt) + } + + // maxOperationTimeout + if (context.getPropertyValue(EVENTHUBS_MAX_OPERATION_TIMEOUT).isSet) { + properties += (EVENTHUBS_MAX_OPERATION_TIMEOUT.getName + -> context.getPropertyValue(EVENTHUBS_MAX_OPERATION_TIMEOUT).asInteger().toInt) + } + + // threadPoolSize + if (context.getPropertyValue(EVENTHUBS_MAX_THREAD_POOL_SIZE).isSet) { + properties += (EVENTHUBS_MAX_THREAD_POOL_SIZE.getName + -> context.getPropertyValue(EVENTHUBS_MAX_THREAD_POOL_SIZE).asInteger().toInt) + } + + if ((readEventHub == null) && (writeEventHub == null)) { + throw new InitializationException("EventHubs service must at least have a read or write event hub set.") + } + + // Get read config properties + if (readEventHub != null) { + + // readPosition + val readPosition : Any = context.getPropertyValue(EVENTHUBS_READ_POSITION).asString() + + if ( (readPosition == EVENTHUBS_READ_POSITION_START_OF_STREAM) + || (readPosition == EVENTHUBS_READ_POSITION_END_OF_STREAM) + || (readPosition == EVENTHUBS_READ_POSITION_INSTANT_NOW)) { + readPositionIsString = true + readPositionString = readPosition.asInstanceOf[String] + } else { + readPositionIsString = false + readPositionLong = readPosition.asInstanceOf[String].toLong + } + + // readPositionType + readPositionType = context.getPropertyValue(EVENTHUBS_READ_POSITION_TYPE).asString() + + // readSasKeyName + if (!context.getPropertyValue(EVENTHUBS_READ_SAS_KEY_NAME).isSet) { + throw new InitializationException("EventHubs service read event hub requires " + + EVENTHUBS_READ_SAS_KEY_NAME.getName) + } + readSasKeyName = context.getPropertyValue(EVENTHUBS_READ_SAS_KEY_NAME).asString() + + // readSasKey + if (!context.getPropertyValue(EVENTHUBS_READ_SAS_KEY).isSet) { + throw new InitializationException("EventHubs service read event hub requires " + + EVENTHUBS_READ_SAS_KEY.getName) + } + readSasKey = context.getPropertyValue(EVENTHUBS_READ_SAS_KEY).asString() + + // readConsumerGroup + if (context.getPropertyValue(EVENTHUBS_READ_CONSUMER_GROUP).isSet) { + readConsumerGroup = context.getPropertyValue(EVENTHUBS_READ_CONSUMER_GROUP).asString() + } + + // readReceiverTimeout + if (context.getPropertyValue(EVENTHUBS_READ_RECEIVER_TIMEOUT).isSet) { + properties += (EVENTHUBS_READ_RECEIVER_TIMEOUT.getName + -> context.getPropertyValue(EVENTHUBS_READ_RECEIVER_TIMEOUT).asInteger().toInt) + } + + // readPrefetchCount + if (context.getPropertyValue(EVENTHUBS_READ_PREFETCH_COUNT).isSet) { + properties += (EVENTHUBS_READ_PREFETCH_COUNT.getName + -> context.getPropertyValue(EVENTHUBS_READ_PREFETCH_COUNT).asInteger().toInt) + } + } + + // Get write config properties + if (writeEventHub != null) { + + // writeSasKeyName + if (!context.getPropertyValue(EVENTHUBS_WRITE_SAS_KEY_NAME).isSet) { + throw new InitializationException("EventHubs service write event hub requires " + + EVENTHUBS_WRITE_SAS_KEY_NAME.getName) + } + writeSasKeyName = context.getPropertyValue(EVENTHUBS_WRITE_SAS_KEY_NAME).asString() + + // writeSasKey + if (!context.getPropertyValue(EVENTHUBS_WRITE_SAS_KEY).isSet) { + throw new InitializationException("EventHubs service write event hub requires " + + EVENTHUBS_WRITE_SAS_KEY.getName) + } + writeSasKey = context.getPropertyValue(EVENTHUBS_WRITE_SAS_KEY).asString() + } + + } catch { + case e: Exception => + throw new InitializationException(e) + } + } 
+ } + + /** + * Allows subclasses to register which property descriptor objects are + * supported. + * + * @return PropertyDescriptor objects this processor currently supports + */ + override def getSupportedPropertyDescriptors() = { + val descriptors = new util.ArrayList[PropertyDescriptor] + descriptors.add(EVENTHUBS_NAMESPACE) + descriptors.add(EVENTHUBS_MAX_EVENTS_PER_TRIGGER) + descriptors.add(EVENTHUBS_MAX_OPERATION_TIMEOUT) + descriptors.add(EVENTHUBS_MAX_THREAD_POOL_SIZE) + descriptors.add(EVENTHUBS_READ_EVENT_HUB) + descriptors.add(EVENTHUBS_READ_SAS_KEY_NAME) + descriptors.add(EVENTHUBS_READ_SAS_KEY) + descriptors.add(EVENTHUBS_READ_CONSUMER_GROUP) + descriptors.add(EVENTHUBS_READ_POSITION) + descriptors.add(EVENTHUBS_READ_POSITION_TYPE) + descriptors.add(EVENTHUBS_READ_RECEIVER_TIMEOUT) + descriptors.add(EVENTHUBS_READ_PREFETCH_COUNT) + descriptors.add(EVENTHUBS_WRITE_EVENT_HUB) + descriptors.add(EVENTHUBS_WRITE_SAS_KEY_NAME) + descriptors.add(EVENTHUBS_WRITE_SAS_KEY) + Collections.unmodifiableList(descriptors) + } + + /** + * create a streaming DataFrame that represents data received + * + * @param spark + * @param streamContext + * @return DataFrame currently loaded + */ + override def read(spark: SparkSession, streamContext: StreamContext) = { + import spark.implicits._ + + implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] + + val connectionString = ConnectionStringBuilder() + .setNamespaceName(namespace) + .setEventHubName(readEventHub) + .setSasKeyName(readSasKeyName) + .setSasKey(readSasKey) + .build + + var eventHubsConf = EventHubsConf(connectionString) + + logger.info(s"Starting azure event hubs structured stream on event hub $readEventHub in $namespace namespace") + val df = spark.readStream + .format("eventhubs") + .options(eventHubsConf.toMap) + .load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS BINARY)") + // .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, Array[Byte])] + .map(r => { +// new StandardRecord(inputTopics.head) +// .setField(FieldDictionary.RECORD_KEY, FieldType.STRING, r._1) +// .setField(FieldDictionary.RECORD_VALUE, FieldType.BYTES, r._2) + new StandardRecord("").asInstanceOf[Record]; + }) + + df + } + + /** + * create a streaming DataFrame that represents data received + * + * @param streamContext + * @return DataFrame currently loaded + */ + override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext) = { + + + // val sender = df.sparkSession.sparkContext.broadcast(KafkaSink(kafkaSinkParams)) + + import df.sparkSession.implicits._ + + val connectionString = ConnectionStringBuilder() + .setNamespaceName(namespace) + .setEventHubName(readEventHub) + .setSasKeyName(readSasKeyName) + .setSasKey(readSasKey) + .build + + var eventHubsConf = EventHubsConf(connectionString) + + logger.info(s"Starting azure event hubs structured stream to event hub $readEventHub in $namespace namespace") + + // Write key-value data from a DataFrame to a specific Kafka topic specified in an option + df .map(r => { + (r.getField(FieldDictionary.RECORD_KEY).asString(), r.getField(FieldDictionary.RECORD_VALUE).asBytes()) + }) + .as[(String, Array[Byte])] + .toDF("key","value") + .writeStream + .format("eventhubs") + .options(eventHubsConf.toMap) + .option("checkpointLocation", "checkpoints") + } +} diff --git 
a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderServiceTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderServiceTest.java new file mode 100644 index 000000000..36668a356 --- /dev/null +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderServiceTest.java @@ -0,0 +1,311 @@ +package com.hurence.logisland.stream.spark.structured.provider; + +import com.hurence.logisland.component.InitializationException; +import com.hurence.logisland.util.runner.TestRunner; +import com.hurence.logisland.util.runner.TestRunners; +import com.hurence.logisland.stream.StreamProperties; +import org.junit.jupiter.api.Test; + +import static org.junit.Assert.fail; + +public class AzureEventHubsStructuredStreamProviderServiceTest { + + @Test + public void testConfig() throws InitializationException { + + boolean error = false; + // Missing namespace + try { + error = false; + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + // Any processor will do it, we won't use it but we need a real processor to be instantiated + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.enableControllerService(service); + error = true; + fail("Namespace not defined: this should have failed"); + } catch (AssertionError e) { + if (error) { + fail(e.getMessage()); + } else { + System.out.println(e.getMessage()); + } + } + + // Namespace but missing read or write hub + try { + error = false; + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); + runner.enableControllerService(service); + error = true; + fail("Namespace defined but missing read or write hub: this should have failed"); + } catch (AssertionError e) { + if (error) { + fail(e.getMessage()); + } else { + System.out.println(e.getMessage()); + } + } + + /** + * READ EVENT HUB ONLY + */ + + // Namespace, read hub but missing sasKeyName + try { + error = false; + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_EVENT_HUB().getName(), "read_hub"); + runner.enableControllerService(service); + error = true; + fail("Read hub defined but missing sasKeyName: this should have failed"); + } catch (AssertionError e) { + if (error) { + fail(e.getMessage()); + } else { + System.out.println(e.getMessage()); + } + } + + // Namespace, read 
hub, sasKeyName but missing sasKey + try { + error = false; + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_EVENT_HUB().getName(), "read_hub"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY_NAME().getName(), "read_sas_key_name"); + runner.enableControllerService(service); + error = true; + fail("Read hub defined, sasKeyName defined but missing sasKey: this should have failed"); + } catch (AssertionError e) { + if (error) { + fail(e.getMessage()); + } else { + System.out.println(e.getMessage()); + } + } + + // Namespace, read hub, sasKeyName and sasKey -> should be ok + try { + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_EVENT_HUB().getName(), "read_hub"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY_NAME().getName(), "read_sas_key_name"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY().getName(), "read_sas_key"); + runner.enableControllerService(service); + error = true; + System.out.println("Read hub defined, sasKeyName, sasKey defined: ok"); + } catch (AssertionError e) { + fail("Read hub defined, sasKeyName, sasKey defined: this should have passed"); + } + + // Bad read position value + try { + error = false; + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_EVENT_HUB().getName(), "read_hub"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY_NAME().getName(), "read_sas_key_name"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY().getName(), "read_sas_key"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_POSITION().getName(), "bad 0123456789 value"); + runner.enableControllerService(service); + error = true; + fail("Bad read position value: this should have failed"); + } catch (AssertionError e) { + if (error) { + fail(e.getMessage()); + } else { + System.out.println(e.getMessage()); + } + } + + // Bad read position type + try { + error = false; + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); + runner.setProperty(service, 
StreamProperties.EVENTHUBS_READ_EVENT_HUB().getName(), "read_hub"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY_NAME().getName(), "read_sas_key_name"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY().getName(), "read_sas_key"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_POSITION_TYPE().getName(), "bad value"); + runner.enableControllerService(service); + error = true; + fail("Bad read position type value: this should have failed"); + } catch (AssertionError e) { + if (error) { + fail(e.getMessage()); + } else { + System.out.println(e.getMessage()); + } + } + + // Set all read properties + try { + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_EVENT_HUB().getName(), "read_hub"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY_NAME().getName(), "read_sas_key_name"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY().getName(), "read_sas_key"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_POSITION_TYPE().getName(), "sequenceNumber"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_CONSUMER_GROUP().getName(), "consumerGroup"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_RECEIVER_TIMEOUT().getName(), "123"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_PREFETCH_COUNT().getName(), "456"); + runner.enableControllerService(service); + System.out.println("All read properties set: ok"); + } catch (AssertionError e) { + fail("All read properties set: this should have passed"); + } + + /** + * WRITE EVENT HUB ONLY + */ + + // Namespace, write hub but missing sasKeyName + try { + error = false; + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); + runner.setProperty(service, StreamProperties.EVENTHUBS_WRITE_EVENT_HUB().getName(), "write_hub"); + runner.enableControllerService(service); + error = true; + fail("Write hub defined but missing sasKeyName: this should have failed"); + } catch (AssertionError e) { + if (error) { + fail(e.getMessage()); + } else { + System.out.println(e.getMessage()); + } + } + + // Namespace, write hub, sasKeyName but missing sasKey + try { + error = false; + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); + runner.setProperty(service, StreamProperties.EVENTHUBS_WRITE_EVENT_HUB().getName(), "write_hub"); + runner.setProperty(service, StreamProperties.EVENTHUBS_WRITE_SAS_KEY_NAME().getName(), "write_sas_key_name"); + runner.enableControllerService(service); + error 
= true; + fail("Write hub defined, sasKeyName defined but missing sasKey: this should have failed"); + } catch (AssertionError e) { + if (error) { + fail(e.getMessage()); + } else { + System.out.println(e.getMessage()); + } + } + + // Namespace, write hub, sasKeyName and sasKey -> should be ok + try { + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); + runner.setProperty(service, StreamProperties.EVENTHUBS_WRITE_EVENT_HUB().getName(), "write_hub"); + runner.setProperty(service, StreamProperties.EVENTHUBS_WRITE_SAS_KEY_NAME().getName(), "write_sas_key_name"); + runner.setProperty(service, StreamProperties.EVENTHUBS_WRITE_SAS_KEY().getName(), "write_sas_key"); + runner.enableControllerService(service); + System.out.println("Write hub defined, sasKeyName, sasKey defined: ok"); + } catch (AssertionError e) { + fail("Write hub defined, sasKeyName, sasKey defined: this should have passed"); + } + + /** + * BOTH READ AND WRITE EVENT HUBS + */ + + // Both read and write hubs, minimum set of properties needed + try { + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_EVENT_HUB().getName(), "read_hub"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY_NAME().getName(), "read_sas_key_name"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY().getName(), "read_sas_key"); + runner.setProperty(service, StreamProperties.EVENTHUBS_WRITE_EVENT_HUB().getName(), "write_hub"); + runner.setProperty(service, StreamProperties.EVENTHUBS_WRITE_SAS_KEY_NAME().getName(), "write_sas_key_name"); + runner.setProperty(service, StreamProperties.EVENTHUBS_WRITE_SAS_KEY().getName(), "write_sas_key"); + runner.enableControllerService(service); + System.out.println("Read and Write hub defined with their key properties defined: ok"); + } catch (AssertionError e) { + fail("Read and Write hub defined with their key properties defined: this should have passed"); + } + + // Bad read position value as long -> ok + try { + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_EVENT_HUB().getName(), "read_hub"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY_NAME().getName(), "read_sas_key_name"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY().getName(), "read_sas_key"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_POSITION().getName(), "1234"); + runner.enableControllerService(service); + System.out.println("Read position is a long: 
ok"); + } catch (AssertionError e) { + fail("Read position as long should haven been ok"); + } + + // Set all possible read and write properties + try { + final AzureEventHubsStructuredStreamProviderService service = + new AzureEventHubsStructuredStreamProviderService(); + final TestRunner runner = TestRunners.newTestRunner("com.hurence.logisland.processor.datastore.BulkPut"); + runner.addControllerService("eventhubs_service", service); + runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); + runner.setProperty(service, StreamProperties.EVENTHUBS_MAX_EVENTS_PER_TRIGGER().getName(), "987"); + runner.setProperty(service, StreamProperties.EVENTHUBS_MAX_OPERATION_TIMEOUT().getName(), "654"); + runner.setProperty(service, StreamProperties.EVENTHUBS_MAX_THREAD_POOL_SIZE().getName(), "321"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_EVENT_HUB().getName(), "read_hub"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY_NAME().getName(), "read_sas_key_name"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY().getName(), "read_sas_key"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_POSITION().getName(), "8963"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_POSITION_TYPE().getName(), "offset"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_CONSUMER_GROUP().getName(), "consumerGroup"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_RECEIVER_TIMEOUT().getName(), "8436"); + runner.setProperty(service, StreamProperties.EVENTHUBS_READ_PREFETCH_COUNT().getName(), "4723"); + runner.setProperty(service, StreamProperties.EVENTHUBS_WRITE_EVENT_HUB().getName(), "write_hub"); + runner.setProperty(service, StreamProperties.EVENTHUBS_WRITE_SAS_KEY_NAME().getName(), "write_sas_key_name"); + runner.setProperty(service, StreamProperties.EVENTHUBS_WRITE_SAS_KEY().getName(), "write_sas_key"); + runner.enableControllerService(service); + System.out.println("All read and write properties set: ok"); + } catch (AssertionError e) { + fail("All read and write properties set: this should have passed"); + } + } +} From da0ce2e97933806e1827cb8deb3bb54e353e39cc Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Thu, 13 Feb 2020 16:33:28 +0100 Subject: [PATCH 09/43] Eventhbubs service: all config properties read --- .../logisland/stream/spark/package.scala | 14 +- ...tHubsStructuredStreamProviderService.scala | 122 ++++++++++++++---- ...bsStructuredStreamProviderServiceTest.java | 4 +- 3 files changed, 107 insertions(+), 33 deletions(-) diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala index 437ebdedd..6412f724c 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala @@ -572,18 +572,18 @@ object StreamProperties { .name("eventhubs.maxEventsPerTrigger") .description("Rate limit on maximum number of events processed per trigger interval. 
The specified total number" + " of events will be proportionally split across partitions of different volume.") - .addValidator(StandardValidators.INTEGER_VALIDATOR) + .addValidator(StandardValidators.LONG_VALIDATOR) .required(false) .build - val EVENTHUBS_MAX_OPERATION_TIMEOUT: PropertyDescriptor = new PropertyDescriptor.Builder() + val EVENTHUBS_OPERATION_TIMEOUT: PropertyDescriptor = new PropertyDescriptor.Builder() .name("eventhubs.operationTimeout") - .description("The amount of time Event Hub API calls will be retried before throwing an exception.") - .addValidator(StandardValidators.INTEGER_VALIDATOR) + .description("The amount of time (in milliseconds) Event Hub API calls will be retried before throwing an exception.") + .addValidator(StandardValidators.LONG_VALIDATOR) .required(false) .build - val EVENTHUBS_MAX_THREAD_POOL_SIZE: PropertyDescriptor = new PropertyDescriptor.Builder() + val EVENTHUBS_THREAD_POOL_SIZE: PropertyDescriptor = new PropertyDescriptor.Builder() .name("eventhubs.threadPoolSize") .description("Sets the size of thread pool.") .addValidator(StandardValidators.INTEGER_VALIDATOR) @@ -674,8 +674,8 @@ object StreamProperties { val EVENTHUBS_READ_RECEIVER_TIMEOUT: PropertyDescriptor = new PropertyDescriptor.Builder() .name("eventhubs.read.receiverTimeout") - .description("The amount of time Event Hub receive calls will be retried before throwing an exception.") - .addValidator(StandardValidators.INTEGER_VALIDATOR) + .description("The amount of time (in milliseconds) Event Hub receive calls will be retried before throwing an exception.") + .addValidator(StandardValidators.LONG_VALIDATOR) .required(false) .build diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala index 2bc766f93..d262f9b91 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala @@ -15,6 +15,7 @@ */ package com.hurence.logisland.stream.spark.structured.provider +import java.time.{Duration, Instant} import java.util import java.util.Collections @@ -22,12 +23,12 @@ import com.hurence.logisland.annotation.documentation.CapabilityDescription import com.hurence.logisland.annotation.lifecycle.OnEnabled import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} -import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord} +import com.hurence.logisland.record.{FieldDictionary, Record, StandardRecord} import com.hurence.logisland.stream.StreamContext import com.hurence.logisland.stream.StreamProperties._ import com.hurence.logisland.util.spark.ControllerServiceLookupSink import org.apache.spark.broadcast.Broadcast -import org.apache.spark.eventhubs.{ConnectionStringBuilder, 
EventHubsConf} +import org.apache.spark.eventhubs.{ConnectionStringBuilder, EventHubsConf, EventPosition} import org.apache.spark.sql.{Dataset, SparkSession} /** @@ -51,12 +52,6 @@ class AzureEventHubsStructuredStreamProviderService() extends AbstractController var writeSasKeyName : String = null var writeSasKey : String = null -// var maxEventPerTrigger : Int = Int.MaxValue -// var maxOperationTimeout : Int = null -// var threadPoolSize : Int = null -// var readReceiverTimeout : Int = null -// var readPrefetchCount : Int = null - var properties : Map[String, Any] = Map[String, Any]() @OnEnabled @@ -89,19 +84,19 @@ class AzureEventHubsStructuredStreamProviderService() extends AbstractController // maxEventPerTrigger if (context.getPropertyValue(EVENTHUBS_MAX_EVENTS_PER_TRIGGER).isSet) { properties += (EVENTHUBS_MAX_EVENTS_PER_TRIGGER.getName - -> context.getPropertyValue(EVENTHUBS_MAX_EVENTS_PER_TRIGGER).asInteger().toInt) + -> context.getPropertyValue(EVENTHUBS_MAX_EVENTS_PER_TRIGGER).asLong().toLong) } - // maxOperationTimeout - if (context.getPropertyValue(EVENTHUBS_MAX_OPERATION_TIMEOUT).isSet) { - properties += (EVENTHUBS_MAX_OPERATION_TIMEOUT.getName - -> context.getPropertyValue(EVENTHUBS_MAX_OPERATION_TIMEOUT).asInteger().toInt) + // operationTimeout + if (context.getPropertyValue(EVENTHUBS_OPERATION_TIMEOUT).isSet) { + properties += (EVENTHUBS_OPERATION_TIMEOUT.getName + -> context.getPropertyValue(EVENTHUBS_OPERATION_TIMEOUT).asLong().toLong) } // threadPoolSize - if (context.getPropertyValue(EVENTHUBS_MAX_THREAD_POOL_SIZE).isSet) { - properties += (EVENTHUBS_MAX_THREAD_POOL_SIZE.getName - -> context.getPropertyValue(EVENTHUBS_MAX_THREAD_POOL_SIZE).asInteger().toInt) + if (context.getPropertyValue(EVENTHUBS_THREAD_POOL_SIZE).isSet) { + properties += (EVENTHUBS_THREAD_POOL_SIZE.getName + -> context.getPropertyValue(EVENTHUBS_THREAD_POOL_SIZE).asInteger().toInt) } if ((readEventHub == null) && (writeEventHub == null)) { @@ -194,8 +189,8 @@ class AzureEventHubsStructuredStreamProviderService() extends AbstractController val descriptors = new util.ArrayList[PropertyDescriptor] descriptors.add(EVENTHUBS_NAMESPACE) descriptors.add(EVENTHUBS_MAX_EVENTS_PER_TRIGGER) - descriptors.add(EVENTHUBS_MAX_OPERATION_TIMEOUT) - descriptors.add(EVENTHUBS_MAX_THREAD_POOL_SIZE) + descriptors.add(EVENTHUBS_OPERATION_TIMEOUT) + descriptors.add(EVENTHUBS_THREAD_POOL_SIZE) descriptors.add(EVENTHUBS_READ_EVENT_HUB) descriptors.add(EVENTHUBS_READ_SAS_KEY_NAME) descriptors.add(EVENTHUBS_READ_SAS_KEY) @@ -210,6 +205,82 @@ class AzureEventHubsStructuredStreamProviderService() extends AbstractController Collections.unmodifiableList(descriptors) } + /** + * Applies the defined service configuration to the passed event hub configuration object + * @param eventHubsConf + */ + def applyConfig(eventHubsConf: EventHubsConf, forRead : Boolean): Unit = { + + if (forRead) { + + /** + * Properties only for read + */ + + if (readConsumerGroup != null) { + eventHubsConf.setConsumerGroup(readConsumerGroup) + } + + if (readPositionIsString != null) { + // Read position is a string + readPositionString match { + case EVENTHUBS_READ_POSITION_START_OF_STREAM => + eventHubsConf.setStartingPosition(EventPosition.fromStartOfStream) + case EVENTHUBS_READ_POSITION_END_OF_STREAM => + eventHubsConf.setStartingPosition(EventPosition.fromEndOfStream) + case EVENTHUBS_READ_POSITION_INSTANT_NOW => + eventHubsConf.setStartingPosition(EventPosition.fromEnqueuedTime(Instant.now())) + case _ => throw new IllegalStateException("Unsupported 
read position string value: " + readPositionString) + } + } else { + // Read position is a long, let's use it according to its meaning defined in readPositionType + readPositionType match { + case EVENTHUBS_READ_POSITION_TYPE_OFFSET => + eventHubsConf.setStartingPosition(EventPosition.fromOffset(readPositionLong.toString)) + case EVENTHUBS_READ_POSITION_TYPE_SEQUENCE_NUMBER => + eventHubsConf.setStartingPosition(EventPosition.fromSequenceNumber(readPositionLong)) + case EVENTHUBS_READ_POSITION_TYPE_EPOCH_MILLIS => + eventHubsConf.setStartingPosition(EventPosition.fromEnqueuedTime(Instant.ofEpochMilli(readPositionLong))) + case _ => throw new IllegalStateException("Unsupported read position type value: " + readPositionType) + } + } + + // readReceiverTimeout + val optionLong = properties.get(EVENTHUBS_READ_RECEIVER_TIMEOUT.getName).asInstanceOf[Option[Long]] + if (optionLong.isDefined) { + eventHubsConf.setReceiverTimeout(Duration.ofMillis(optionLong.get)) + } + + // readPrefetchCount + val optionInt : Option[Int] = properties.get(EVENTHUBS_READ_PREFETCH_COUNT.getName).asInstanceOf[Option[Int]] + if (optionInt.isDefined) { + eventHubsConf.setPrefetchCount(optionInt.get) + } + } + + /** + * Properties for both read or write + */ + + // maxEventPerTrigger + var optionLong : Option[Long] = properties.get(EVENTHUBS_MAX_EVENTS_PER_TRIGGER.getName).asInstanceOf[Option[Long]] + if (optionLong.isDefined) { + eventHubsConf.setMaxEventsPerTrigger(optionLong.get) + } + + // operationTimeout + optionLong = properties.get(EVENTHUBS_OPERATION_TIMEOUT.getName).asInstanceOf[Option[Long]] + if (optionLong.isDefined) { + eventHubsConf.setOperationTimeout(Duration.ofMillis(optionLong.get)) + } + + // maxEventPerTrigger + val optionInt : Option[Int] = properties.get(EVENTHUBS_THREAD_POOL_SIZE.getName).asInstanceOf[Option[Int]] + if (optionInt.isDefined) { + eventHubsConf.setThreadPoolSize(optionInt.get) + } + } + /** * create a streaming DataFrame that represents data received * @@ -229,7 +300,8 @@ class AzureEventHubsStructuredStreamProviderService() extends AbstractController .setSasKey(readSasKey) .build - var eventHubsConf = EventHubsConf(connectionString) + val eventHubsConf = EventHubsConf(connectionString) + applyConfig(eventHubsConf, true) logger.info(s"Starting azure event hubs structured stream on event hub $readEventHub in $namespace namespace") val df = spark.readStream @@ -264,17 +336,18 @@ class AzureEventHubsStructuredStreamProviderService() extends AbstractController val connectionString = ConnectionStringBuilder() .setNamespaceName(namespace) - .setEventHubName(readEventHub) - .setSasKeyName(readSasKeyName) - .setSasKey(readSasKey) + .setEventHubName(writeEventHub) + .setSasKeyName(writeSasKeyName) + .setSasKey(writeSasKey) .build - var eventHubsConf = EventHubsConf(connectionString) + val eventHubsConf = EventHubsConf(connectionString) + applyConfig(eventHubsConf, false) logger.info(s"Starting azure event hubs structured stream to event hub $readEventHub in $namespace namespace") // Write key-value data from a DataFrame to a specific Kafka topic specified in an option - df .map(r => { + df.map(r => { (r.getField(FieldDictionary.RECORD_KEY).asString(), r.getField(FieldDictionary.RECORD_VALUE).asBytes()) }) .as[(String, Array[Byte])] @@ -282,6 +355,7 @@ class AzureEventHubsStructuredStreamProviderService() extends AbstractController .writeStream .format("eventhubs") .options(eventHubsConf.toMap) + // TODO update checkpoint with global value .option("checkpointLocation", "checkpoints") } } diff 
--git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderServiceTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderServiceTest.java index 36668a356..e622b28f8 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderServiceTest.java +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderServiceTest.java @@ -289,8 +289,8 @@ public void testConfig() throws InitializationException { runner.addControllerService("eventhubs_service", service); runner.setProperty(service, StreamProperties.EVENTHUBS_NAMESPACE().getName(), "namespace"); runner.setProperty(service, StreamProperties.EVENTHUBS_MAX_EVENTS_PER_TRIGGER().getName(), "987"); - runner.setProperty(service, StreamProperties.EVENTHUBS_MAX_OPERATION_TIMEOUT().getName(), "654"); - runner.setProperty(service, StreamProperties.EVENTHUBS_MAX_THREAD_POOL_SIZE().getName(), "321"); + runner.setProperty(service, StreamProperties.EVENTHUBS_OPERATION_TIMEOUT().getName(), "654"); + runner.setProperty(service, StreamProperties.EVENTHUBS_THREAD_POOL_SIZE().getName(), "321"); runner.setProperty(service, StreamProperties.EVENTHUBS_READ_EVENT_HUB().getName(), "read_hub"); runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY_NAME().getName(), "read_sas_key_name"); runner.setProperty(service, StreamProperties.EVENTHUBS_READ_SAS_KEY().getName(), "read_sas_key"); From 1075c46e4ff5eebf86da73a20e7e3a31d3bf2fd8 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Thu, 13 Feb 2020 17:14:32 +0100 Subject: [PATCH 10/43] First attempt to have azure event hubs service running, will have to test now... 
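
In short, the read side of this first attempt turns each Event Hubs event into a logisland
record, keeping the event sequenceNumber as record key and the binary body as record value.
A rough, illustrative sketch of the intended mapping (simplified from the diff below, not the
exact code; assumes the usual spark implicits and record encoders are in scope):

    df.selectExpr("CAST(sequenceNumber AS STRING)", "CAST(body AS BINARY)")
      .as[(String, Array[Byte])]
      .map(r => new StandardRecord(readEventHub)                        // one record per event
        .setField(FieldDictionary.RECORD_KEY, FieldType.STRING, r._1)   // key = sequenceNumber
        .setField(FieldDictionary.RECORD_VALUE, FieldType.BYTES, r._2)  // value = raw event body
        .asInstanceOf[Record])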
--- ...EventHubsStructuredStreamProviderService.scala | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala index d262f9b91..2bc26afbf 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala @@ -23,7 +23,7 @@ import com.hurence.logisland.annotation.documentation.CapabilityDescription import com.hurence.logisland.annotation.lifecycle.OnEnabled import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} -import com.hurence.logisland.record.{FieldDictionary, Record, StandardRecord} +import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord} import com.hurence.logisland.stream.StreamContext import com.hurence.logisland.stream.StreamProperties._ import com.hurence.logisland.util.spark.ControllerServiceLookupSink @@ -308,13 +308,14 @@ class AzureEventHubsStructuredStreamProviderService() extends AbstractController .format("eventhubs") .options(eventHubsConf.toMap) .load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS BINARY)") +// .select($"body" cast "string") + .selectExpr("CAST(sequenceNumber AS STRING)", "CAST(body AS BINARY)") // .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") .as[(String, Array[Byte])] .map(r => { -// new StandardRecord(inputTopics.head) -// .setField(FieldDictionary.RECORD_KEY, FieldType.STRING, r._1) -// .setField(FieldDictionary.RECORD_VALUE, FieldType.BYTES, r._2) + new StandardRecord(readEventHub) + .setField(FieldDictionary.RECORD_KEY, FieldType.STRING, r._1) + .setField(FieldDictionary.RECORD_VALUE, FieldType.BYTES, r._2) new StandardRecord("").asInstanceOf[Record]; }) @@ -329,9 +330,6 @@ class AzureEventHubsStructuredStreamProviderService() extends AbstractController */ override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext) = { - - // val sender = df.sparkSession.sparkContext.broadcast(KafkaSink(kafkaSinkParams)) - import df.sparkSession.implicits._ val connectionString = ConnectionStringBuilder() @@ -352,6 +350,7 @@ class AzureEventHubsStructuredStreamProviderService() extends AbstractController }) .as[(String, Array[Byte])] .toDF("key","value") + .selectExpr("partitionKey", "body") .writeStream .format("eventhubs") .options(eventHubsConf.toMap) From ab6a25405d443bb63c38204c8693880cfeeb01db Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Thu, 13 Feb 2020 17:23:14 +0100 Subject: [PATCH 11/43] Fixed spark 2.4 module pom mqtt dependency --- .../logisland-engine-spark_2_4/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml
index d610d24ef..24d43e6dd 100644
--- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml
+++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml
@@ -43,7 +43,7 @@ http://www.w3.org/2001/XMLSchema-instance ">
             org.apache.bahir
             spark-sql-streaming-mqtt_2.11
-            ${spark.version}
+            2.4.0
             runtime
             true

From 34911946966ad59ad934463dd0da6a3d103707c2 Mon Sep 17 00:00:00 2001
From: Mathieu Rossignol
Date: Fri, 14 Feb 2020 11:20:11 +0100
Subject: [PATCH 12/43] Using checkpoint location in azure structured stream service. Also fixed a bug where the read position configuration property could not be anything other than a string

---
 .../logisland-engine-spark_2_common/pom.xml       |  7 +++++++
 ...ventHubsStructuredStreamProviderService.scala  | 16 ++++++++++------
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml
index a8a1cfb1f..ffa0e7101 100644
--- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml
+++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml
@@ -375,6 +375,13 @@ http://www.w3.org/2001/XMLSchema-instance ">
             ${eventhubs.version}
+
+            com.hurence.logisland
+            logisland-bootstrap
+            ${project.version}
+            provided
+
+

diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala
index 2bc26afbf..08c30f7a5 100644
--- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala
+++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala
@@ -24,6 +24,7 @@ import com.hurence.logisland.annotation.lifecycle.OnEnabled
 import com.hurence.logisland.component.{InitializationException, PropertyDescriptor}
 import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext}
 import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord}
+import com.hurence.logisland.runner.GlobalOptions
 import com.hurence.logisland.stream.StreamContext
 import com.hurence.logisland.stream.StreamProperties._
 import com.hurence.logisland.util.spark.ControllerServiceLookupSink
@@ -221,7 +222,7 @@ class AzureEventHubsStructuredStreamProviderService() extends AbstractController
             eventHubsConf.setConsumerGroup(readConsumerGroup)
         }

-        if (readPositionIsString != null) {
+        if (readPositionIsString) {
             // Read position is a string
             readPositionString match {
                 case EVENTHUBS_READ_POSITION_START_OF_STREAM =>
@@ -342,19 +343,22 @@ class AzureEventHubsStructuredStreamProviderService() extends AbstractController
         val eventHubsConf = EventHubsConf(connectionString)
         applyConfig(eventHubsConf, false)

-        logger.info(s"Starting azure event hubs structured stream to event hub $readEventHub in $namespace namespace")
+        logger.info(s"Starting azure event hubs structured stream to event hub $writeEventHub in $namespace namespace")
+        var checkpointLocation : String = "checkpoints"
+        if (GlobalOptions.checkpointLocation != null) {
+            checkpointLocation = GlobalOptions.checkpointLocation
+            logger.info(s"Writing to event hub using checkpointLocation: $checkpointLocation")
+        }

         // Write key-value data from a DataFrame to a specific Kafka topic specified in an option
         df.map(r => {
             (r.getField(FieldDictionary.RECORD_KEY).asString(), r.getField(FieldDictionary.RECORD_VALUE).asBytes())
         })
             .as[(String, Array[Byte])]
-            .toDF("key","value")
-            .selectExpr("partitionKey", "body")
+            .toDF("partitionKey","body")
             .writeStream
             .format("eventhubs")
             .options(eventHubsConf.toMap)
-            // TODO update checkpoint with global value
-            .option("checkpointLocation", "checkpoints")
+            .option("checkpointLocation", checkpointLocation)
     }
 }

From 65d2f46c2a9e218644789c65d50ac3fcbda85e94 Mon Sep 17 00:00:00 2001
From: Mathieu Rossignol
Date: Fri, 14 Feb 2020 16:50:41 +0100
Subject: [PATCH 13/43] Support spark 2.4 engine in logisland.sh script

---
 logisland-resources/src/main/resources/bin/logisland.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/logisland-resources/src/main/resources/bin/logisland.sh b/logisland-resources/src/main/resources/bin/logisland.sh
index cbc50d8fd..3228276ff 100755
--- a/logisland-resources/src/main/resources/bin/logisland.sh
+++ b/logisland-resources/src/main/resources/bin/logisland.sh
@@ -273,8 +273,8 @@ main() {
         *) compare_versions ${SPARK_VERSION} 2.3.0
            case $?
in 2) engine_jar=`ls ${lib_dir}/engines/logisland-engine-spark_2_1-*.jar` ;; - *) engine_jar=`ls ${lib_dir}/engines/logisland-engine-spark_2_3-*.jar` ;; - + 0) engine_jar=`ls ${lib_dir}/engines/logisland-engine-spark_2_3-*.jar` ;; + *) engine_jar=`ls ${lib_dir}/engines/logisland-engine-spark_2_4-*.jar` ;; esac ;; esac From dc360dd007d24d2b3a7fc782e93fe04a0a72a163 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Mon, 17 Feb 2020 15:59:28 +0100 Subject: [PATCH 14/43] Update version of spark-sql-streaming-mqtt_2.11 dependency --- .../logisland-engine-spark_2_common/pom.xml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml index ffa0e7101..efe4a0f42 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml @@ -199,10 +199,9 @@ http://www.w3.org/2001/XMLSchema-instance "> org.apache.bahir spark-sql-streaming-mqtt_2.11 - 2.3.2 + 2.4.0 - org.apache.kafka connect-api From 086aeef0f1c18c65a07d9e69f110e141c4fba617 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Mon, 17 Feb 2020 16:49:58 +0100 Subject: [PATCH 15/43] Removed useless parenthesis --- .../AzureEventHubsStructuredStreamProviderService.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala index 08c30f7a5..216c4be4b 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala @@ -38,7 +38,7 @@ import org.apache.spark.sql.{Dataset, SparkSession} * https://github.com/Azure/azure-event-hubs-spark/blob/master/docs/structured-streaming-eventhubs-integration.md */ @CapabilityDescription("Provides a ways to use azure event hubs as input or output in StructuredStream streams") -class AzureEventHubsStructuredStreamProviderService() extends AbstractControllerService with StructuredStreamProviderService { +class AzureEventHubsStructuredStreamProviderService extends AbstractControllerService with StructuredStreamProviderService { var namespace : String = null var readPositionString: String = null From 7ffb97845ffc060e876270e5a7e5f6f9a394be3c Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Mon, 17 Feb 2020 18:00:07 +0100 Subject: [PATCH 16/43] Added spark 2 common jar to logisland doc dependency --- .../logisland-engine-spark_2_4/pom.xml | 1 - logisland-documentation/pom.xml | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml 
b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml index 24d43e6dd..0376a5033 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml @@ -107,7 +107,6 @@ http://www.w3.org/2001/XMLSchema-instance "> com.hurence.logisland logisland-engine-spark_2_common ${project.version} - true org.apache.spark diff --git a/logisland-documentation/pom.xml b/logisland-documentation/pom.xml index 58adb0103..6165e23c3 100644 --- a/logisland-documentation/pom.xml +++ b/logisland-documentation/pom.xml @@ -520,6 +520,11 @@ THIS MODULE DOCUMENTATION DEPENDENCIES logisland-engine-spark_2_3 ${project.version} + + com.hurence.logisland + logisland-engine-spark_2_4 + ${project.version} + org.apache.spark spark-network-common_${scala.binary.version} From 280e34dbcec67c1d00ce679544df098b503d2ada Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Tue, 18 Feb 2020 15:05:56 +0100 Subject: [PATCH 17/43] Use right scala version --- .../logisland-engine-spark_2_common/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml index efe4a0f42..b7e3adde3 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml @@ -51,7 +51,7 @@ http://www.w3.org/2001/XMLSchema-instance "> org.apache.kafka - kafka_2.11 + kafka_${scala.binary.version} ${kafka.version} compile @@ -198,7 +198,7 @@ http://www.w3.org/2001/XMLSchema-instance "> org.apache.bahir - spark-sql-streaming-mqtt_2.11 + spark-sql-streaming-mqtt_${scala.binary.version} 2.4.0 From 8d0ea785e30ca6f9d7d450be6606759038e1e679 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Tue, 18 Feb 2020 15:58:04 +0100 Subject: [PATCH 18/43] Explicitly adding bahir-common dependency in spark2.4 engine, as a workaround to class not found for org.apache.bahir.utils.Logging when using AzureEventHubsStructuredStreamProviderService --- .../logisland-engine-spark_2_common/pom.xml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml index b7e3adde3..5311d3dee 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml @@ -202,6 +202,18 @@ http://www.w3.org/2001/XMLSchema-instance "> 2.4.0 + + + org.apache.bahir + bahir-common_${scala.binary.version} + 2.4.0 + + org.apache.kafka connect-api From f59dabc77cb27051b123e59e8e5d239b95df253a Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Wed, 19 Feb 2020 15:00:43 +0100 Subject: [PATCH 19/43] Set checkpointlocation in structured stream --- .../provider/StructuredStreamProviderService.scala | 1 + .../provider/StructuredStreamProviderService.scala | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git 
a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala index 3c6926a13..47721a9ac 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala @@ -281,6 +281,7 @@ trait StructuredStreamProviderService extends ControllerService { var checkpointLocation = "checkpoints/" + streamContext.getIdentifier if (GlobalOptions.checkpointLocation != null) { checkpointLocation = GlobalOptions.checkpointLocation + logger.info(s"Saving structured stream using checkpointLocation: $checkpointLocation") } write(df2, controllerServiceLookupSink, streamContext) diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala index befcb2f0b..fc4af3809 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala @@ -36,6 +36,7 @@ import java.util.Date import com.hurence.logisland.controller.ControllerService import com.hurence.logisland.record._ +import com.hurence.logisland.runner.GlobalOptions import com.hurence.logisland.serializer.{JsonSerializer, NoopSerializer, RecordSerializer, SerializerProvider} import com.hurence.logisland.stream.StreamContext import com.hurence.logisland.stream.StreamProperties._ @@ -278,10 +279,16 @@ trait StructuredStreamProviderService extends ControllerService { // do the parallel processing val df2 = df.mapPartitions(record => record.map(record => serializeRecords(serializer, keySerializer, record))) + var checkpointLocation : String = "checkpoints/" + streamContext.getIdentifier + if (GlobalOptions.checkpointLocation != null) { + checkpointLocation = GlobalOptions.checkpointLocation + logger.info(s"Saving structured stream using checkpointLocation: $checkpointLocation") + } + write(df2, controllerServiceLookupSink, streamContext) .queryName(streamContext.getIdentifier) // .outputMode("update") - .option("checkpointLocation", "checkpoints/" + streamContext.getIdentifier) + .option("checkpointLocation", checkpointLocation) .start() // .processAllAvailable() From bbf270e5d2525c6eb92105658b217402bb1c2963 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Fri, 21 Feb 2020 14:48:17 +0100 Subject: 
[PATCH 20/43] Added spark deploy mode property to support spark standalone cluster deploy mode

---
 .../engine/spark/KafkaStreamProcessingEngine.scala | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/engine/spark/KafkaStreamProcessingEngine.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/engine/spark/KafkaStreamProcessingEngine.scala
index c7ee30d8c..8d94921e4 100644
--- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/engine/spark/KafkaStreamProcessingEngine.scala
+++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/engine/spark/KafkaStreamProcessingEngine.scala
@@ -104,7 +104,14 @@ object KafkaStreamProcessingEngine {
         .name("spark.yarn.deploy-mode")
         .description("The yarn deploy mode")
         .required(false)
-        // .allowableValues("client", "cluster")
+        .allowableValues("client", "cluster")
+        .build
+
+    val SPARK_DEPLOYMODE = new PropertyDescriptor.Builder()
+        .name("spark.deploy-mode")
+        .description("The spark standalone cluster deploy mode")
+        .required(false)
+        .allowableValues("client", "cluster")
         .build

     val SPARK_YARN_QUEUE = new PropertyDescriptor.Builder()
@@ -487,6 +494,7 @@ class KafkaStreamProcessingEngine extends AbstractProcessingEngine {
         descriptors.add(KafkaStreamProcessingEngine.SPARK_MASTER)
         descriptors.add(KafkaStreamProcessingEngine.SPARK_MONITORING_DRIVER_PORT)
         descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_DEPLOYMODE)
+        descriptors.add(KafkaStreamProcessingEngine.SPARK_DEPLOYMODE)
         descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_QUEUE)
         descriptors.add(KafkaStreamProcessingEngine.SPARK_DRIVER_MEMORY)
         descriptors.add(KafkaStreamProcessingEngine.SPARK_EXECUTOR_MEMORY)

From 2a719c5d6c8edc7a1a7a10f56b9c3b27c8098a8d Mon Sep 17 00:00:00 2001
From: Mathieu Rossignol
Date: Fri, 21 Feb 2020 15:42:21 +0100
Subject: [PATCH 21/43] Introducing support of spark standalone cluster in logisland.sh

---
 .../src/main/resources/bin/logisland.sh | 187 ++++++++++++++++--
 1 file changed, 169 insertions(+), 18 deletions(-)

diff --git a/logisland-resources/src/main/resources/bin/logisland.sh b/logisland-resources/src/main/resources/bin/logisland.sh
index 3228276ff..1abbb1d12 100755
--- a/logisland-resources/src/main/resources/bin/logisland.sh
+++ b/logisland-resources/src/main/resources/bin/logisland.sh
@@ -279,11 +279,13 @@ main() {
             ;;
     esac
-
-
     export SPARK_PRINT_LAUNCH_COMMAND=1
     echo "Detected spark version ${SPARK_VERSION}. We'll automatically plug in engine jar ${engine_jar}"
     APP_NAME=`awk '{ if( $1 == "spark.app.name:" ){ print $2 } }' ${CONF_FILE}`
+
+    #
+    # YARN mode?
+    #
     MODE=`awk '{ if( $1 == "spark.master:" ){ print $2 } }' ${CONF_FILE}`
     case ${MODE} in
       "yarn")
@@ -303,31 +305,54 @@ main() {
                 MODE=${MODE}-${EXTRA_MODE}
             fi
             ;;
-    esac
+    esac
+    #
+    # MESOS mode?
+    #
     if [[ "${MODE}" =~ "mesos" ]]
     then
         SPARK_MASTER=${MODE}
         MODE="mesos"
     fi
+    #
+    # Spark standalone mode?
+ # + if [[ "${MODE}" =~ ^spark://.* ]] # Starts with spark:// (spark standalone url) + then + SPARK_MASTER=${MODE} + EXTRA_MODE=`awk '{ if( $1 == "spark.deploy-mode:" ){ print $2 } }' ${CONF_FILE}` + if [[ -z "${EXTRA_MODE}" ]] + then + echo "The property \"spark.deploy-mode\" is missing in config file \"${CONF_FILE}\"" + exit 1 + fi + if [[ ! ${EXTRA_MODE} = "cluster" && ! ${EXTRA_MODE} = "client" ]] + then + echo "The property \"spark.deploy-mode\" value \"${EXTRA_MODE}\" is not supported" + exit 1 + else + MODE=spark-${EXTRA_MODE} + fi + fi if [[ ! -z "${VERBOSE_OPTIONS}" ]] then - echo "Starting with mode \"${MODE}\" on master \"${SPARK_MASTER}\"" + echo "Starting with run mode \"${MODE}\" on master \"${SPARK_MASTER}\"" fi case ${MODE} in local*) ${SPARK_HOME}/bin/spark-submit ${VERBOSE_OPTIONS} ${YARN_CLUSTER_OPTIONS} \ - --driver-library-path ${OPENCV_NATIVE_LIB_PATH} \ - --conf spark.driver.extraJavaOptions="-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005 -Dlog4j.configuration=\"file:${lib_dir}/../conf/log4j.properties\"" \ - --conf spark.executor.extraJavaOptions="-Dlog4j.configuration=\"file:${lib_dir}/../conf/log4j.properties\"" \ - --conf spark.metrics.namespace="${APP_NAME}" \ - --conf spark.metrics.conf="${lib_dir}/../monitoring/metrics.properties" \ + --driver-library-path ${OPENCV_NATIVE_LIB_PATH} \ + --conf spark.driver.extraJavaOptions="-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005 -Dlog4j.configuration=\"file:${lib_dir}/../conf/log4j.properties\"" \ + --conf spark.executor.extraJavaOptions="-Dlog4j.configuration=\"file:${lib_dir}/../conf/log4j.properties\"" \ + --conf spark.metrics.namespace="${APP_NAME}" \ + --conf spark.metrics.conf="${lib_dir}/../monitoring/metrics.properties" \ --class ${app_mainclass} \ --jars ${app_classpath} ${engine_jar} \ -conf ${CONF_FILE} @@ -335,8 +360,7 @@ main() { ;; yarn-cluster) - YARN_CLUSTER_OPTIONS="--master yarn --deploy-mode cluster --files ${CONF_FILE}#logisland-configuration.yml,${CONF_DIR}/../monitoring/jmx_prometheus_javaagent-0.10.jar#jmx_prometheus_javaagent-0.10.jar,${CONF_DIR}/../monitoring/spark-prometheus.yml#spark-prometheus.yml,${CONF_DIR}/../monitoring/metrics.properties#metrics.properties,${CONF_DIR}/log4j.properties#log4j.properties --conf spark.metrics.namespace=\"${APP_NAME}\" --conf \"spark.executor.extraJavaOptions=-Dlog4j.configuration=log4j.properties\" --conf spark.ui.showConsoleProgress=false" - + YARN_CLUSTER_OPTIONS="--master yarn --deploy-mode cluster --files ${CONF_FILE}#logisland-configuration.yml,${CONF_DIR}/../monitoring/jmx_prometheus_javaagent-0.10.jar#jmx_prometheus_javaagent-0.10.jar,${CONF_DIR}/../monitoring/spark-prometheus.yml#spark-prometheus.yml,${CONF_DIR}/../monitoring/metrics.properties#metrics.properties,${CONF_DIR}/log4j.properties#log4j.properties --conf spark.metrics.namespace=\"${APP_NAME}\" --conf spark.ui.showConsoleProgress=false" if [[ ! -z "${YARN_APP_NAME}" ]] then @@ -415,7 +439,6 @@ main() { YARN_CLUSTER_OPTIONS="${YARN_CLUSTER_OPTIONS} --conf spark.task.maxFailures=${SPARK_TASK_MAX_FAILURES}" fi - PROPERTIES_FILE_PATH=`awk '{ if( $1 == "spark.properties.file.path:" ){ print $2 } }' ${CONF_FILE}` if [[ ! 
-z "${PROPERTIES_FILE_PATH}" ]] then @@ -440,7 +463,7 @@ main() { --conf "${EXTRA_PROCESSOR_JAVA_OPTIONS}" \ --class ${app_mainclass} \ --jars ${app_classpath} ${engine_jar} \ - -conf ${CONF_FILE} + -conf ${CONF_FILE} ;; yarn-client) @@ -470,14 +493,13 @@ main() { --conf spark.metrics.conf=./metrics.properties \ --class ${app_mainclass} \ --jars ${app_classpath} ${engine_jar} \ - -conf ${CONF_FILE} + -conf ${CONF_FILE} ;; mesos) MESOS_OPTIONS="--master ${SPARK_MASTER} --conf spark.metrics.namespace=\"${APP_NAME}\"" - DRIVER_CORES=`awk '{ if( $1 == "spark.driver.cores:" ){ print $2 } }' ${CONF_FILE}` if [[ ! -z "${DRIVER_CORES}" ]] then @@ -517,12 +539,10 @@ main() { MESOS_NATIVE_JAVA_LIBRARY=`awk '{ if( $1 == "java.library.path:" ){ print $2 } }' ${CONF_FILE}` - - #--deploy-mode cluster \ #--supervise \ #--executor-memory 20G \ - # --total-executor-cores 100 \ + # --total-executor-cores 100 \ export MESOS_NATIVE_JAVA_LIBRARY="${MESOS_NATIVE_JAVA_LIBRARY}" ${SPARK_HOME}/bin/spark-submit ${VERBOSE_OPTIONS} ${MESOS_OPTIONS} \ @@ -532,6 +552,137 @@ main() { -conf ${CONF_FILE} ;; + spark-cluster) + SPARK_CLUSTER_OPTIONS="--master ${SPARK_MASTER} --deploy-mode cluster --files ${CONF_FILE}#logisland-configuration.yml,${CONF_DIR}/../monitoring/jmx_prometheus_javaagent-0.10.jar#jmx_prometheus_javaagent-0.10.jar,${CONF_DIR}/../monitoring/spark-prometheus.yml#spark-prometheus.yml,${CONF_DIR}/../monitoring/metrics.properties#metrics.properties,${CONF_DIR}/log4j.properties#log4j.properties --conf spark.metrics.namespace=\"${APP_NAME}\" --conf spark.ui.showConsoleProgress=false" + + if [[ ! -z "${SPARK_APP_NAME}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --name ${SPARK_APP_NAME}" + else + SPARK_APP_NAME=`awk '{ if( $1 == "spark.app.name:" ){ print $2 } }' ${CONF_FILE}` + if [[ ! -z "${SPARK_APP_NAME}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --name ${SPARK_APP_NAME}" + fi + fi + + DRIVER_CORES=`awk '{ if( $1 == "spark.driver.cores:" ){ print $2 } }' ${CONF_FILE}` + if [[ ! -z "${DRIVER_CORES}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --driver-cores ${DRIVER_CORES}" + fi + + DRIVER_MEMORY=`awk '{ if( $1 == "spark.driver.memory:" ){ print $2 } }' ${CONF_FILE}` + if [[ ! -z "${DRIVER_MEMORY}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --driver-memory ${DRIVER_MEMORY}" + fi + + EXECUTORS_CORES=`awk '{ if( $1 == "spark.executor.cores:" ){ print $2 } }' ${CONF_FILE}` + if [[ ! -z "${EXECUTORS_CORES}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --executor-cores ${EXECUTORS_CORES}" + fi + + EXECUTORS_MEMORY=`awk '{ if( $1 == "spark.executor.memory:" ){ print $2 } }' ${CONF_FILE}` + if [[ ! -z "${EXECUTORS_MEMORY}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --executor-memory ${EXECUTORS_MEMORY}" + fi + + EXECUTORS_INSTANCES=`awk '{ if( $1 == "spark.executor.instances:" ){ print $2 } }' ${CONF_FILE}` + if [[ ! -z "${EXECUTORS_INSTANCES}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --num-executors ${EXECUTORS_INSTANCES}" + fi + + SPARK_TASK_MAX_FAILURES=`awk '{ if( $1 == "spark.task.maxFailures:" ){ print $2 } }' ${CONF_FILE}` + if [[ ! -z "${SPARK_TASK_MAX_FAILURES}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --conf spark.task.maxFailures=${SPARK_TASK_MAX_FAILURES}" + fi + + PROPERTIES_FILE_PATH=`awk '{ if( $1 == "spark.properties.file.path:" ){ print $2 } }' ${CONF_FILE}` + if [[ ! 
-z "${PROPERTIES_FILE_PATH}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --properties-file ${PROPERTIES_FILE_PATH}" + fi + + SPARK_MONITORING_DRIVER_PORT=`awk '{ if( $1 == "spark.monitoring.driver.port:" ){ print $2 } }' ${CONF_FILE}` + if [[ -z "${SPARK_MONITORING_DRIVER_PORT}" ]] + then + EXTRA_DRIVER_JAVA_OPTIONS='spark.driver.extraJavaOptions=-Dlog4j.configuration=log4j.properties' + EXTRA_PROCESSOR_JAVA_OPTIONS='spark.executor.extraJavaOptions=-Dlog4j.configuration=log4j.properties' + else + EXTRA_DRIVER_JAVA_OPTIONS='spark.driver.extraJavaOptions=-Dlog4j.configuration=log4j.properties -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=0 -Dcom.sun.management.jmxremote.rmi.port=0 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -javaagent:./jmx_prometheus_javaagent-0.10.jar='${SPARK_MONITORING_DRIVER_PORT}':./spark-prometheus.yml' + EXTRA_PROCESSOR_JAVA_OPTIONS='spark.executor.extraJavaOptions=-Dlog4j.configuration=log4j.properties -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=0 -Dcom.sun.management.jmxremote.rmi.port=0 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -javaagent:./jmx_prometheus_javaagent-0.10.jar='${SPARK_MONITORING_DRIVER_PORT}':./spark-prometheus.yml' + fi + + CONF_FILE="logisland-configuration.yml" + + ${SPARK_HOME}/bin/spark-submit ${VERBOSE_OPTIONS} ${SPARK_CLUSTER_OPTIONS} \ + --conf "${EXTRA_DRIVER_JAVA_OPTIONS}" \ + --conf "${EXTRA_PROCESSOR_JAVA_OPTIONS}" \ + --class ${app_mainclass} \ + --jars ${app_classpath} ${engine_jar} \ + -conf ${CONF_FILE} + ;; + + spark-client) + SPARK_CLUSTER_OPTIONS="--master ${SPARK_MASTER} --deploy-mode client --conf spark.metrics.namespace=\"${APP_NAME}\"" + + if [[ ! -z "${SPARK_APP_NAME}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --name ${SPARK_APP_NAME}" + else + SPARK_APP_NAME=`awk '{ if( $1 == "spark.app.name:" ){ print $2 } }' ${CONF_FILE}` + if [[ ! -z "${SPARK_APP_NAME}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --name ${SPARK_APP_NAME}" + fi + fi + + DRIVER_CORES=`awk '{ if( $1 == "spark.driver.cores:" ){ print $2 } }' ${CONF_FILE}` + if [[ ! -z "${DRIVER_CORES}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --driver-cores ${DRIVER_CORES}" + fi + + DRIVER_MEMORY=`awk '{ if( $1 == "spark.driver.memory:" ){ print $2 } }' ${CONF_FILE}` + if [[ ! -z "${DRIVER_MEMORY}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --driver-memory ${DRIVER_MEMORY}" + fi + + PROPERTIES_FILE_PATH=`awk '{ if( $1 == "spark.properties.file.path:" ){ print $2 } }' ${CONF_FILE}` + if [[ ! 
-z "${PROPERTIES_FILE_PATH}" ]] + then + SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --properties-file ${PROPERTIES_FILE_PATH}" + fi + + SPARK_MONITORING_DRIVER_PORT=`awk '{ if( $1 == "spark.monitoring.driver.port:" ){ print $2 } }' ${CONF_FILE}` + if [[ -z "${SPARK_MONITORING_DRIVER_PORT}" ]] + then + EXTRA_DRIVER_JAVA_OPTIONS='spark.driver.extraJavaOptions=-Dlog4j.configuration=log4j.properties' + EXTRA_PROCESSOR_JAVA_OPTIONS='spark.executor.extraJavaOptions=-Dlog4j.configuration=log4j.properties' + else + EXTRA_DRIVER_JAVA_OPTIONS='spark.driver.extraJavaOptions=-Dlog4j.configuration=log4j.properties -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=0 -Dcom.sun.management.jmxremote.rmi.port=0 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -javaagent:'${CONF_DIR}'/../monitoring/jmx_prometheus_javaagent-0.10.jar='${SPARK_MONITORING_DRIVER_PORT}':'${CONF_DIR}'/../monitoring/spark-prometheus.yml' + EXTRA_PROCESSOR_JAVA_OPTIONS='spark.executor.extraJavaOptions=-Dlog4j.configuration=log4j.properties -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=0 -Dcom.sun.management.jmxremote.rmi.port=0 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -javaagent:./jmx_prometheus_javaagent-0.10.jar='${SPARK_MONITORING_DRIVER_PORT}':./spark-prometheus.yml' + fi + + ${SPARK_HOME}/bin/spark-submit ${VERBOSE_OPTIONS} ${SPARK_CLUSTER_OPTIONS} \ + --conf spark.metrics.conf="${CONF_DIR}/../monitoring/metrics.properties" \ + --conf "${EXTRA_DRIVER_JAVA_OPTIONS}" \ + --conf "${EXTRA_PROCESSOR_JAVA_OPTIONS}" \ + --class ${app_mainclass} \ + --jars ${app_classpath} ${engine_jar} \ + -conf ${CONF_FILE} + ;; + + *) + echo "Unsupported run mode: ${MODE}" + ;; + esac fi } From 6bd742be7957e1e10be736c9d933e750652c330c Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Wed, 26 Feb 2020 16:06:59 +0100 Subject: [PATCH 22/43] Fix for ClassNotFoundException in databricks environment on executor: the plugin registry is not filled as the current thread context classloader is a repl.ExecutorClassLoader, so class of plugins cannot be loaded --- .../logisland/classloading/PluginLoader.java | 11 +++++-- .../AutoProxiedSerializablePlugin.java | 32 ++++++++++++++++++- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/logisland-core/logisland-framework/logisland-plugin-support/src/main/java/com/hurence/logisland/classloading/PluginLoader.java b/logisland-core/logisland-framework/logisland-plugin-support/src/main/java/com/hurence/logisland/classloading/PluginLoader.java index 70977b2f7..a778b321a 100644 --- a/logisland-core/logisland-framework/logisland-plugin-support/src/main/java/com/hurence/logisland/classloading/PluginLoader.java +++ b/logisland-core/logisland-framework/logisland-plugin-support/src/main/java/com/hurence/logisland/classloading/PluginLoader.java @@ -49,15 +49,20 @@ public class PluginLoader { static { JarFile.registerUrlProtocolHandler(); - scanAndRegisterPlugins(); + scanAndRegisterPlugins(null); } /** * Scan for plugins. 
*/ - private static void scanAndRegisterPlugins() { + public static void scanAndRegisterPlugins(ClassLoader startClassLoader) { Set<URL> urls = new HashSet<>(); - ClassLoader cl = Thread.currentThread().getContextClassLoader(); + ClassLoader cl = null; + if (startClassLoader == null) { + cl = Thread.currentThread().getContextClassLoader(); + } else { + cl = startClassLoader; + } while (cl != null) { if (cl instanceof URLClassLoader) { urls.addAll(Arrays.asList(((URLClassLoader) cl).getURLs())); diff --git a/logisland-core/logisland-framework/logisland-plugin-support/src/main/java/com/hurence/logisland/classloading/serialization/AutoProxiedSerializablePlugin.java b/logisland-core/logisland-framework/logisland-plugin-support/src/main/java/com/hurence/logisland/classloading/serialization/AutoProxiedSerializablePlugin.java index 78895c93e..2b8eba2c9 100644 --- a/logisland-core/logisland-framework/logisland-plugin-support/src/main/java/com/hurence/logisland/classloading/serialization/AutoProxiedSerializablePlugin.java +++ b/logisland-core/logisland-framework/logisland-plugin-support/src/main/java/com/hurence/logisland/classloading/serialization/AutoProxiedSerializablePlugin.java @@ -17,6 +17,8 @@ import com.hurence.logisland.classloading.PluginLoader; import com.hurence.logisland.classloading.PluginProxy; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.ByteArrayInputStream; import java.io.InvalidObjectException; @@ -29,6 +31,9 @@ * @author amarziali */ public class AutoProxiedSerializablePlugin implements Serializable { + + private static final Logger logger = LoggerFactory.getLogger(AutoProxiedSerializablePlugin.class); + private final String originalClassName; private final byte[] content; @@ -40,8 +45,33 @@ public AutoProxiedSerializablePlugin(String originalClassName, byte[] content) { Object readResolve() throws ObjectStreamException { try { return PluginProxy.create(new ClassLoaderAwareObjectInputStream(new ByteArrayInputStream(content), - PluginLoader.getRegistry().getOrDefault(originalClassName, Thread.currentThread().getContextClassLoader())).readObject()); + PluginLoader.getRegistry().getOrDefault(originalClassName, + Thread.currentThread().getContextClassLoader())).readObject()); } catch (Exception e) { + + if (e instanceof ClassNotFoundException) + { + logger.info("Could not create plugin proxy for " + originalClassName + ": forcing plugin registry load " + + "and retrying..."); + /** + * In the Databricks environment, on executors, the static initializer of PluginLoader is not called (as objects are + * sent through closures?) and the current thread context classloader is a repl.ExecutorClassLoader, which + * is not a URLClassLoader and has no parent. Thus, in order to fill the plugin registry once and for all + * for the executor VM (it is not yet filled, which is why we get a ClassNotFoundException here), we force + * the scanAndRegisterPlugins call using the classloader of the current class, which is able to find the + * logisland jars. Then we retry the plugin proxy creation that failed.
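+ * If this retry fails as well, the error is logged and we fall through to the InvalidObjectException thrown below.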
+ */ + PluginLoader.scanAndRegisterPlugins(AutoProxiedSerializablePlugin.class.getClassLoader()); + + try { + return PluginProxy.create(new ClassLoaderAwareObjectInputStream(new ByteArrayInputStream(content), + PluginLoader.getRegistry().getOrDefault(originalClassName, + Thread.currentThread().getContextClassLoader())).readObject()); + } catch (Exception ex) { + logger.error("Could not create plugin proxy for " + originalClassName + " after forcing the plugin " + + "registry load", ex); + } + } throw new InvalidObjectException("Unable to resolve plugin proxy class: " + e.getMessage()); } } From 337f895118e514e1cd81c1c5265a643bf085bd06 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Wed, 26 Feb 2020 16:47:26 +0100 Subject: [PATCH 23/43] Need spark 2.4.0 in spark 2.4 engine to align with the latest supported version of the azure event hub spark structured stream source/sink --- .../logisland-engine-spark_2_4/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml index 0376a5033..0c8699875 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml @@ -15,7 +15,7 @@ http://www.w3.org/2001/XMLSchema-instance "> 2.11 - 2.4.4 + 2.4.0 0.10.2.1 2.11.8 From 7f5c0f7ea1b2775eb70f51487965be5b91be08e9 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Thu, 27 Feb 2020 15:59:34 +0100 Subject: [PATCH 24/43] Use offset as record key when reading from azure event hub. Also remove some debug traces and add some info logs --- ...tHubsStructuredStreamProviderService.scala | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala index 216c4be4b..4ce4c8cd1 100644 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/AzureEventHubsStructuredStreamProviderService.scala @@ -304,20 +304,20 @@ class AzureEventHubsStructuredStreamProviderService extends AbstractControllerSe val eventHubsConf = EventHubsConf(connectionString) applyConfig(eventHubsConf, true) - logger.info(s"Starting azure event hubs structured stream on event hub $readEventHub in $namespace namespace") + val options = eventHubsConf.toMap + val optionsString = options.toString() + + logger.info(s"Starting azure event hubs structured stream on event hub $readEventHub in $namespace namespace with configuration:\n$optionsString") val df = spark.readStream .format("eventhubs") - .options(eventHubsConf.toMap) + .options(options) .load() -// .select($"body" cast "string") - .selectExpr("CAST(sequenceNumber AS STRING)",
"CAST(body AS BINARY)") - // .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .selectExpr("CAST(offset AS STRING)", "CAST(body AS BINARY)") .as[(String, Array[Byte])] .map(r => { new StandardRecord(readEventHub) .setField(FieldDictionary.RECORD_KEY, FieldType.STRING, r._1) .setField(FieldDictionary.RECORD_VALUE, FieldType.BYTES, r._2) - new StandardRecord("").asInstanceOf[Record]; }) df @@ -343,19 +343,20 @@ class AzureEventHubsStructuredStreamProviderService extends AbstractControllerSe val eventHubsConf = EventHubsConf(connectionString) applyConfig(eventHubsConf, false) - logger.info(s"Starting azure event hubs structured stream to event hub $writeEventHub in $namespace namespace") - var checkpointLocation : String = "checkpoints" if (GlobalOptions.checkpointLocation != null) { checkpointLocation = GlobalOptions.checkpointLocation - logger.info(s"Writing to event hub using checkpointLocation: $writeEventHub") } - // Write key-value data from a DataFrame to a specific Kafka topic specified in an option + + logger.info(s"Starting azure event hubs structured stream to event hub $writeEventHub in " + + s"$namespace namespace with checkpointLocation $checkpointLocation") + + // Write key-value data from a DataFrame to a specific event hub specified in an option df.map(r => { (r.getField(FieldDictionary.RECORD_KEY).asString(), r.getField(FieldDictionary.RECORD_VALUE).asBytes()) }) .as[(String, Array[Byte])] - .toDF("partitionKey","body") + .toDF("partitionKey", "body") .writeStream .format("eventhubs") .options(eventHubsConf.toMap) From 2e1de7d421ee2be4a6a170b92698e97371d0d9ed Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Thu, 27 Feb 2020 16:11:59 +0100 Subject: [PATCH 25/43] Remove not finished and useless engine for spark 2.4+ and kafka 2.4+ --- .../pom.xml | 217 ------ .../logisland/util/spark/Spark24Platform.java | 30 - ...hurence.logisland.util.spark.SparkPlatform | 1 - .../pom.xml | 420 ----------- .../AbstractKafkaConnectComponent.java | 223 ------ .../com/hurence/logisland/connect/Utils.java | 109 --- .../converter/LogIslandRecordConverter.java | 232 ------ .../connect/sink/KafkaConnectStreamSink.java | 139 ---- .../logisland/connect/sink/NullSink.java | 96 --- .../connect/sink/SimpleSinkTaskContext.java | 81 --- .../source/KafkaConnectStreamSource.java | 286 -------- .../KafkaConnectStreamSourceProvider.java | 73 -- .../connect/source/SimplePartition.java | 51 -- .../logisland/connect/source/SimpleRDD.java | 64 -- .../source/timed/ClockSourceConnector.java | 136 ---- .../connect/source/timed/ClockSourceTask.java | 250 ------- ...PipelineConfigurationBroadcastWrapper.java | 80 --- .../engine/spark/remote/RemoteApiClient.java | 247 ------- .../remote/RemoteApiComponentFactory.java | 250 ------- .../engine/spark/remote/model/Component.java | 186 ----- .../engine/spark/remote/model/DataFlow.java | 144 ---- .../engine/spark/remote/model/Pipeline.java | 108 --- .../engine/spark/remote/model/Processor.java | 66 -- .../engine/spark/remote/model/Property.java | 147 ---- .../engine/spark/remote/model/Service.java | 66 -- .../engine/spark/remote/model/Stream.java | 94 --- .../engine/spark/remote/model/Versioned.java | 125 ---- .../util/spark/ProcessorMetrics.java | 123 ---- .../util/spark/ProtoBufRegistrator.java | 30 - .../util/spark/SparkConfigReader.java | 59 -- .../logisland/util/spark/SparkPlatform.java | 27 - .../org.apache.spark.metrics.sink.KafkaSink | 1 - .../spark/KafkaStreamProcessingEngine.scala | 659 ------------------ .../RemoteApiStreamProcessingEngine.scala 
| 198 ------ .../spark/AbstractKafkaRecordStream.scala | 344 --------- .../stream/spark/DummyRecordStream.scala | 68 -- .../spark/KafkaRecordStreamDebugger.scala | 191 ----- .../spark/KafkaRecordStreamHDFSBurner.scala | 229 ------ .../KafkaRecordStreamParallelProcessing.scala | 226 ------ .../KafkaRecordStreamSQLAggregator.scala | 145 ---- .../stream/spark/SparkRecordStream.scala | 34 - .../logisland/stream/spark/package.scala | 546 --------------- .../KafkaConnectBaseProviderService.scala | 112 --- ...ConnectStructuredSinkProviderService.scala | 122 ---- ...nnectStructuredSourceProviderService.scala | 83 --- .../stream/spark/provider/package.scala | 126 ---- .../spark/structured/StructuredStream.scala | 184 ----- ...nsoleStructuredStreamProviderService.scala | 183 ----- ...KafkaStructuredStreamProviderService.scala | 280 -------- ...lFileStructuredStreamProviderService.scala | 167 ----- .../MQTTStructuredStreamProviderService.scala | 174 ----- .../RateStructuredStreamProviderService.scala | 202 ------ .../StructuredStreamProviderService.scala | 398 ----------- .../logisland/util/kafka/KafkaReporter.scala | 224 ------ .../logisland/util/kafka/KafkaSink.scala | 76 -- .../util/mqtt/MQTTStreamSource.scala | 240 ------- .../logisland/util/mqtt/MessageStore.scala | 109 --- .../spark/ControllerServiceLookupSink.scala | 52 -- .../logisland/util/spark/SparkUtils.scala | 273 -------- .../apache/spark/metrics/sink/KafkaSink.scala | 91 --- .../logisland/connect/KafkaConnectTest.java | 84 --- .../LogIslandRecordConverterTest.java | 129 ---- .../logisland/connect/fake/FakeConnector.java | 116 --- .../logisland/connect/fake/TestSink.java | 57 -- .../logisland/connect/fake/TestSinkTask.java | 55 -- ...stractStreamProcessingIntegrationTest.java | 246 ------- ...mmaticStreamProcessingIntegrationTest.java | 170 ----- .../RecordStreamProcessingDebuggerTest.java | 282 -------- .../logisland/engine/RemoteApiEngineTest.java | 85 --- .../logisland/engine/SparkEngineConfTest.java | 182 ----- .../logisland/engine/StreamDebuggerTest.java | 79 --- .../spark/remote/RemoteApiClientTest.java | 90 --- .../spark/remote/mock/MockProcessor.java | 37 - .../remote/mock/MockServiceController.java | 29 - .../engine/spark/remote/mock/MockStream.java | 29 - .../structured/StructuredStreamTest.java | 83 --- ...leStructuredStreamProviderServiceTest.java | 59 -- .../ProviderServiceAsReaderRunner.java | 121 ---- .../resources/conf/kafka-connect-stream.yml | 138 ---- .../src/test/resources/conf/opencv.yml | 62 -- .../src/test/resources/conf/remote-engine.yml | 38 - .../test/resources/conf/structured-stream.yml | 76 -- .../conf/timeseries-structured-stream.yml | 99 --- .../src/test/resources/log4j.properties | 65 -- .../src/test/resources/logback.xml | 58 -- .../pom.xml | 25 - logisland-core/logisland-engines/pom.xml | 1 - 87 files changed, 12392 deletions(-) delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/pom.xml delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/resources/META-INF/services/com.hurence.logisland.util.spark.SparkPlatform delete mode 100644 
logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/pom.xml delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/AbstractKafkaConnectComponent.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/Utils.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/converter/LogIslandRecordConverter.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/KafkaConnectStreamSink.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/NullSink.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/SimpleSinkTaskContext.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSource.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSourceProvider.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimplePartition.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimpleRDD.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceConnector.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceTask.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/PipelineConfigurationBroadcastWrapper.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiClient.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiComponentFactory.java delete mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Component.java delete mode 100755 
logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/DataFlow.java delete mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Pipeline.java delete mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Processor.java delete mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Property.java delete mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Service.java delete mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Stream.java delete mode 100755 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Versioned.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProcessorMetrics.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProtoBufRegistrator.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkConfigReader.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkPlatform.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/resources/META-INF/services/org.apache.spark.metrics.sink.KafkaSink delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/KafkaStreamProcessingEngine.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/RemoteApiStreamProcessingEngine.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/AbstractKafkaRecordStream.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/DummyRecordStream.scala delete mode 100644 
logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamDebugger.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamHDFSBurner.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamParallelProcessing.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamSQLAggregator.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/SparkRecordStream.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectBaseProviderService.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSinkProviderService.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSourceProviderService.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/package.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/ConsoleStructuredStreamProviderService.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/KafkaStructuredStreamProviderService.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderService.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/MQTTStructuredStreamProviderService.scala delete mode 100644 
logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/RateStructuredStreamProviderService.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaReporter.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaSink.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MQTTStreamSource.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MessageStore.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/ControllerServiceLookupSink.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/SparkUtils.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/org/apache/spark/metrics/sink/KafkaSink.scala delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/KafkaConnectTest.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/converter/LogIslandRecordConverterTest.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/FakeConnector.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSink.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSinkTask.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/AbstractStreamProcessingIntegrationTest.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/ProgrammaticStreamProcessingIntegrationTest.java delete mode 100644 
logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RecordStreamProcessingDebuggerTest.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RemoteApiEngineTest.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/SparkEngineConfTest.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/StreamDebuggerTest.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/RemoteApiClientTest.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockProcessor.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockServiceController.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockStream.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/StructuredStreamTest.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderServiceTest.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/ProviderServiceAsReaderRunner.java delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/kafka-connect-stream.yml delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/opencv.yml delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/remote-engine.yml delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/structured-stream.yml delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/timeseries-structured-stream.yml delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/log4j.properties delete mode 100644 
logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/logback.xml delete mode 100644 logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/pom.xml diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/pom.xml deleted file mode 100644 index f92399f6b..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/pom.xml +++ /dev/null @@ -1,217 +0,0 @@ - - - 4.0.0 - - com.hurence.logisland - logisland-engine-spark_2_4plus_kafka_2_4plus - 1.2.0 - - logisland-engine-spark_2_4_kafka_2_4 - jar - - - - - 2.12 - 2.4.4 - 2.4.0 - 2.12.10 - - - - - - - - - org.apache.kafka - kafka_${scala.binary.version} - ${kafka.version} - true - runtime - - - - org.apache.kafka - kafka-clients - ${kafka.version} - true - runtime - - - org.apache.bahir - spark-sql-streaming-mqtt_2.11 - 2.2.0 - runtime - true - - - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - provided - - - com.google.guava - guava - - - - - org.apache.spark - spark-streaming_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-mllib_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-streaming-kafka - ${spark.version} - runtime - true - - - org.apache.spark - spark-sql-kafka - ${spark.version} - runtime - true - - - org.apache.spark - spark-streaming-kafka-assembly_${scala.binary.version} - ${spark.version} - runtime - true - - - - - - - com.hurence.logisland - logisland-engine-spark_2_common - ${project.version} - true - - - org.apache.spark - spark-sql_${scala.binary.version} - provided - - - - org.scala-lang - scala-library - ${scala.version} - provided - true - - - com.banzaicloud - spark-metrics_2.11 - 2.3-1.1.0 - - - io.prometheus - simpleclient - 0.0.23 - - - io.prometheus - simpleclient_dropwizard - 0.0.23 - - - io.prometheus - simpleclient_pushgateway - 0.0.23 - - - - - - - - - org.immutables.tools - maven-shade-plugin - 4 - - - package - - shade - - - - - com.fasterxml.jackson.datatype:jackson-datatype-jsr310 - com.fasterxml.jackson.datatype:jackson-datatype-jdk8 - com.hurence.logisland:logisland-engine-spark_2_common - *:* - - - com.fasterxml.jackson.core:* - com.fasterxml.jackson.databind:* - com.fasterxml.jackson.jaxrs*:* - com.fasterxml.jackson.module:jackson-module-jaxb-annotations - org.scala-lang:* - org.scalatest:* - org.apache.zookeeper:* - com.google.guava:* - org.apache.commons:* - org.slf4j:* - log4j:* - org.yaml:* - org.eclipse.jetty:* - org.glassfish.hk2*:* - org.glassfish.jersey*:* - - - - - *:* - - META-INF/license/** - META-INF/* - META-INF/maven/** - LICENSE - NOTICE - /*.txt - build.properties - - - - - - - - - - - - - - - banzaicloud-github - https://raw.github.com/banzaicloud/spark-metrics/master/maven-repo/releases - - - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java 
b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java deleted file mode 100644 index 4383f2597..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/java/com/hurence/logisland/util/spark/Spark24Platform.java +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.util.spark; - -import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructType; - -public class Spark24Platform implements SparkPlatform { - @Override - public Dataset createStreamingDataFrame(SQLContext sqlContext, RDD catalystRows, StructType schema) { - return sqlContext.internalCreateDataFrame(catalystRows, schema, true); - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/resources/META-INF/services/com.hurence.logisland.util.spark.SparkPlatform b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/resources/META-INF/services/com.hurence.logisland.util.spark.SparkPlatform deleted file mode 100644 index 405b9bf4e..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4_kafka_2_4/src/main/resources/META-INF/services/com.hurence.logisland.util.spark.SparkPlatform +++ /dev/null @@ -1 +0,0 @@ -com.hurence.logisland.util.spark.Spark24Platform \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/pom.xml deleted file mode 100644 index 911e5047e..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/pom.xml +++ /dev/null @@ -1,420 +0,0 @@ - - - 4.0.0 - - com.hurence.logisland - logisland-engine-spark_2_4plus_kafka_2_4plus - 1.2.0 - - logisland-engine-spark_2_4plus_common - jar - - - - 1.5.16 - 4.12 - 3.1.5 - 3.0.4 - 0.3.5 - 2.12 - 2.4.4 - 2.4.0 - 2.12.10 - 2.6.6 - - - - - - - - com.typesafe.scala-logging - scala-logging-slf4j_2.11 - 2.1.2 - provided - - - org.scala-lang - scala-compiler - ${scala.version} - - - - - org.apache.kafka - kafka-clients - ${kafka.version} - compile - - - - org.apache.kafka - kafka - ${kafka.version} - compile - - - - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - provided - - - com.google.guava - guava - - - - - org.apache.spark - 
spark-streaming_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-mllib_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-streaming-kafka - ${spark.version} - - - org.apache.spark - spark-sql-kafka - ${spark.version} - - - - - org.apache.kafka - kafka_${scala.binary.version} - test - ${kafka.version} - test - - - org.apache.kafka - kafka-clients - ${kafka.version} - test - test - - - org.scalatest - scalatest_${scala.binary.version} - 3.1.0 - test - - - - - - - - com.hurence.logisland - logisland-api - ${project.version} - provided - - - com.hurence.logisland - logisland-utils - ${project.version} - provided - - - com.hurence.logisland - logisland-processor-timeseries - ${project.version} - provided - - - - com.hurence.logisland - logisland-plugin-support - ${project.version} - provided - - - - com.hurence.logisland - logisland-bootstrap - ${project.version} - provided - - - - com.groupon.dse - spark-metrics - 2.0.0 - - - - - - org.apache.spark - spark-core_${scala.binary.version} - provided - - - - org.apache.spark - spark-sql_${scala.binary.version} - provided - - - - org.apache.spark - spark-streaming_${scala.binary.version} - provided - - - - org.apache.bahir - spark-sql-streaming-mqtt_${scala.binary.version} - 2.4.0 - - - - org.apache.kafka - connect-api - ${kafka.version} - - - - org.apache.kafka - connect-runtime - ${kafka.version} - - - com.fasterxml.jackson.jaxrs - jackson-jaxrs-json-provider - - - - - org.apache.kafka - connect-json - ${kafka.version} - - - - - com.fasterxml.jackson.core - jackson-core - ${jackson.version} - provided - - - - com.fasterxml.jackson.core - jackson-databind - provided - ${jackson.version} - - - - com.fasterxml.jackson.core - jackson-annotations - provided - - - - com.fasterxml.jackson.module - jackson-module-parameter-names - ${jackson.version} - - - - com.fasterxml.jackson.datatype - jackson-datatype-jdk8 - ${jackson.version} - - - - com.fasterxml.jackson.datatype - jackson-datatype-jsr310 - ${jackson.version} - - - - org.apache.commons - commons-csv - - - - org.hibernate - hibernate-validator - 5.1.3.Final - - - - - - org.glassfish - javax.el - 3.0.0 - - - - - - - org.scala-lang - scala-compiler - provided - - - - - - - junit - junit - test - - - - ch.qos.logback - logback-classic - test - - - - - org.apache.kafka - kafka_${scala.binary.version} - ${kafka.version} - - - - - - org.apache.kafka - kafka_${scala.binary.version} - test - test - - - - - org.apache.kafka - kafka-clients - test - test - - - - io.swagger - swagger-core - ${swagger-core-version} - - - - - com.hurence.logisland - logisland-processor-common - ${project.version} - test - - - - - - com.squareup.okhttp3 - okhttp-urlconnection - 3.10.0 - - - - - - com.squareup.okhttp3 - mockwebserver - 3.10.0 - test - - - - - org.springframework - spring-context-support - 5.1.3.RELEASE - - - - - - - - net.alchim31.maven - scala-maven-plugin - 4.3.1 - - - scala-compile-first - process-resources - - add-source - compile - - - - scala-test-compile-first - process-test-resources - - testCompile - - - - - - ${scala.binary.version} - ${scala.version} - incremental - - -unchecked - -deprecation - - - -Xms64m - -Xms1024m - -Xmx1024m - -XX:PermSize=${PermGen} - -XX:MaxPermSize=${MaxPermGen} - - - -source - ${maven.compiler.source} - -target - ${maven.compiler.source} - - - - - - - - - - diff --git 
a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/AbstractKafkaConnectComponent.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/AbstractKafkaConnectComponent.java deleted file mode 100644 index f5a25b979..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/AbstractKafkaConnectComponent.java +++ /dev/null @@ -1,223 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.connect; - - -import com.hurence.logisland.classloading.PluginProxy; -import com.hurence.logisland.component.ComponentFactory; -import com.hurence.logisland.connect.source.KafkaConnectStreamSourceProvider; -import org.apache.kafka.connect.connector.Connector; -import org.apache.kafka.connect.connector.ConnectorContext; -import org.apache.kafka.connect.connector.Task; -import org.apache.kafka.connect.errors.DataException; -import org.apache.kafka.connect.json.JsonConverter; -import org.apache.kafka.connect.storage.Converter; -import org.apache.kafka.connect.storage.OffsetBackingStore; -import org.apache.spark.sql.SQLContext; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.*; -import java.util.concurrent.atomic.AtomicBoolean; - -/** - * Kafka connect to spark sql streaming bridge. - * - * @author amarziali - */ -public abstract class AbstractKafkaConnectComponent { - - - private final static Logger LOGGER = LoggerFactory.getLogger(AbstractKafkaConnectComponent.class); - protected final T connector; - protected final List tasks = new ArrayList<>(); - protected final OffsetBackingStore offsetBackingStore; - protected final AtomicBoolean startWatch = new AtomicBoolean(false); - protected final String connectorName; - private final Map connectorProperties; - - protected final SQLContext sqlContext; - protected final Converter keyConverter; - protected final Converter valueConverter; - protected final int maxTasks; - protected final String streamId; - - - /** - * Base constructor. Should be called by {@link KafkaConnectStreamSourceProvider} - * - * @param sqlContext the spark sql context. - * @param connectorProperties the connector related properties. - * @param keyConverter the converter for the data key - * @param valueConverter the converter for the data body - * @param offsetBackingStore the backing store implementation (can be in-memory, file based, kafka based, etc...) - * @param maxTasks the maximum theoretical number of tasks this source should spawn. - * @param connectorClass the class of kafka connect source connector to wrap. - * @param streamId the Stream id. 
- * - */ - public AbstractKafkaConnectComponent(SQLContext sqlContext, - Map connectorProperties, - Converter keyConverter, - Converter valueConverter, - OffsetBackingStore offsetBackingStore, - int maxTasks, - String connectorClass, - String streamId) { - try { - this.sqlContext = sqlContext; - this.maxTasks = maxTasks; - //instantiate connector - this.connectorName = connectorClass; - connector = ComponentFactory.loadComponent(connectorClass); - //create converters - this.keyConverter = keyConverter; - this.valueConverter = valueConverter; - this.connectorProperties = connectorProperties; - this.streamId = streamId; - - //Create the connector context - final ConnectorContext connectorContext = new ConnectorContext() { - @Override - public void requestTaskReconfiguration() { - try { - stopAllTasks(); - createAndStartAllTasks(); - } catch (Throwable t) { - LOGGER.error("Unable to reconfigure tasks for connector " + connectorName(), t); - } - } - - @Override - public void raiseError(Exception e) { - LOGGER.error("Connector " + connectorName() + " raised error : " + e.getMessage(), e); - } - }; - - LOGGER.info("Starting connector {}", connectorClass); - connector.initialize(connectorContext); - this.offsetBackingStore = offsetBackingStore; - - - } catch (Exception e) { - throw new DataException("Unable to create connector " + connectorName(), e); - } - - } - - public void start() { - try { - offsetBackingStore.start(); - //create and start tasks - createAndStartAllTasks(); - } catch (Exception e) { - try { - stop(); - } catch (Throwable t) { - LOGGER.error("Unable to properly stop tasks of connector " + connectorName(), t); - } - throw new DataException("Unable to start connector " + connectorName(), e); - } - } - - protected abstract void initialize(U task); - - /** - * Create all the {@link Runnable} workers needed to host the source tasks. - * - * @return - * @throws IllegalAccessException if task instantiation fails. - * @throws InstantiationException if task instantiation fails. - */ - protected void createAndStartAllTasks() throws IllegalAccessException, InstantiationException, ClassNotFoundException { - if (!startWatch.compareAndSet(false, true)) { - throw new IllegalStateException("Connector is already started"); - } - connector.start(connectorProperties); - Class taskClass = (Class) connector.taskClass(); - List> configs = connector.taskConfigs(maxTasks); - tasks.clear(); - LOGGER.info("Creating {} tasks for connector {}", configs.size(), connectorName()); - for (Map conf : configs) { - //create the task - U task = PluginProxy.create(taskClass.newInstance()); - initialize(task); - task.start(conf); - tasks.add(task); - - } - } - - - /** - * Create a converter to be used to translate internal data. - * Child classes can override this method to provide alternative converters. - * - * @return an instance of {@link Converter} - */ - protected Converter createInternalConverter(boolean isKey) { - JsonConverter internalConverter = new JsonConverter(); - internalConverter.configure(Collections.singletonMap("schemas.enable", "false"), isKey); - return internalConverter; - } - - /** - * Gets the connector name used by this stream source. - * - * @return - */ - protected String connectorName() { - return connectorName; - } - - - /** - * Stops every tasks running and serving for this connector. 
- */ - protected void stopAllTasks() { - LOGGER.info("Stopping every tasks for connector {}", connectorName()); - while (!tasks.isEmpty()) { - try { - tasks.remove(0).stop(); - } catch (Throwable t) { - LOGGER.warn("Error occurring while stopping a task of connector " + connectorName(), t); - } - } - } - - protected void stop() { - if (!startWatch.compareAndSet(true, false)) { - throw new IllegalStateException("Connector is not started"); - } - LOGGER.info("Stopping connector {}", connectorName()); - stopAllTasks(); - offsetBackingStore.stop(); - connector.stop(); - } - - - /** - * Check the stream source state. - * - * @return - */ - public boolean isRunning() { - return startWatch.get(); - } - - -} - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/Utils.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/Utils.java deleted file mode 100644 index cc95a24cf..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/Utils.java +++ /dev/null @@ -1,109 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.connect; - -import com.hurence.logisland.stream.spark.provider.StreamOptions; -import org.apache.kafka.common.config.ConfigDef; -import org.apache.kafka.connect.runtime.WorkerConfig; -import org.apache.kafka.connect.runtime.distributed.DistributedConfig; -import org.apache.kafka.connect.runtime.standalone.StandaloneConfig; -import org.apache.kafka.connect.storage.*; - -import java.io.IOException; -import java.io.StringReader; -import java.util.Map; -import java.util.Properties; -import java.util.stream.Collectors; - -public class Utils { - - - /** - * Configuration definition for {@link MemoryOffsetBackingStore} - */ - private static class MemoryConfig extends WorkerConfig { - public MemoryConfig(Map props) { - super(new ConfigDef(), props); - } - } - - /** - * Configuration definition for {@link FileOffsetBackingStore} - */ - private static class FileConfig extends WorkerConfig { - public FileConfig(Map props) { - super(new ConfigDef() - .define(StandaloneConfig.OFFSET_STORAGE_FILE_FILENAME_CONFIG, - ConfigDef.Type.STRING, - ConfigDef.Importance.HIGH, - "file to store offset data in") - , props); - } - } - - /** - * Configuration definition for {@link KafkaOffsetBackingStore} - */ - private static class KafkaConfig extends WorkerConfig { - public KafkaConfig(Map props) { - super(new ConfigDef() - .define(BOOTSTRAP_SERVERS_CONFIG, - ConfigDef.Type.LIST, - BOOTSTRAP_SERVERS_DEFAULT, - ConfigDef.Importance.HIGH, - BOOTSTRAP_SERVERS_DOC) - .define(DistributedConfig.OFFSET_STORAGE_TOPIC_CONFIG, - ConfigDef.Type.STRING, - ConfigDef.Importance.HIGH, - "kafka topic to store connector offsets in") - , props); - } - } - - public static Converter createConverter(String converterClassName, String propertiesAsString, boolean isKey) - throws ClassNotFoundException, IllegalAccessException, InstantiationException, IOException { - Converter ret = (Converter) Class.forName(converterClassName).newInstance(); - ret.configure(propertiesToMap(propertiesAsString), isKey); - return ret; - } - - public static Map propertiesToMap(String propertiesAsString) throws IOException { - Properties props = new Properties(); - props.load(new StringReader(propertiesAsString)); - return props.entrySet().stream().collect(Collectors.toMap(e -> e.getKey().toString(), e -> e.getValue().toString())); - } - - public static OffsetBackingStore createOffsetBackingStore(String type, Map properties) { - WorkerConfig workerConfig = null; - OffsetBackingStore offsetBackingStore; - if (StreamOptions.FILE_BACKING_STORE().getValue().equals(type)) { - offsetBackingStore = new FileOffsetBackingStore(); - workerConfig = new FileConfig(properties); - } else if (StreamOptions.MEMORY_BACKING_STORE().getValue().equals(type)) { - offsetBackingStore = new MemoryOffsetBackingStore(); - workerConfig = new MemoryConfig(properties); - } else if (StreamOptions.KAFKA_BACKING_STORE().getValue().equals(type)) { - offsetBackingStore = new KafkaOffsetBackingStore(); - workerConfig = new KafkaConfig(properties); - } else { - throw new IllegalArgumentException(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE().getName() + - " must be set!"); - } - offsetBackingStore.configure(workerConfig); - return offsetBackingStore; - } - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/converter/LogIslandRecordConverter.java 
b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/converter/LogIslandRecordConverter.java deleted file mode 100644 index 4ad94cdd9..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/converter/LogIslandRecordConverter.java +++ /dev/null @@ -1,232 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.connect.converter; - -import com.hurence.logisland.record.Field; -import com.hurence.logisland.record.*; -import com.hurence.logisland.serializer.RecordSerializer; -import com.hurence.logisland.serializer.SerializerProvider; -import com.hurence.logisland.stream.StreamProperties; -import org.apache.kafka.connect.data.*; -import org.apache.kafka.connect.errors.DataException; -import org.apache.kafka.connect.storage.Converter; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.lang.reflect.Array; -import java.nio.ByteBuffer; -import java.time.Instant; -import java.util.Collection; -import java.util.Date; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.stream.Collectors; - -public class LogIslandRecordConverter implements Converter { - - /** - * Record serializer class (instance of {@link RecordSerializer}) - */ - public static final String PROPERTY_RECORD_SERIALIZER = "record.serializer"; - /** - * Avro schema to use (only apply to {@link com.hurence.logisland.serializer.AvroSerializer}) - */ - public static final String PROPERTY_AVRO_SCHEMA = "avro.schema"; - - /** - * The record type to use. If not provided {@link LogIslandRecordConverter#PROPERTY_RECORD_TYPE} will be used. - */ - public static final String PROPERTY_RECORD_TYPE = StreamProperties.RECORD_TYPE().getName(); - - /** - * The default type for logisland {@link Record} created by this converter. 
- */ - private static final String DEFAULT_RECORD_TYPE = "kafka_connect"; - - private RecordSerializer recordSerializer; - private String recordType; - private boolean isKey; - - - @Override - public void configure(Map configs, boolean isKey) { - recordSerializer = SerializerProvider.getSerializer((String) configs.get(PROPERTY_RECORD_SERIALIZER), (String) configs.get(PROPERTY_AVRO_SCHEMA)); - recordType = ((Map) configs).getOrDefault(PROPERTY_RECORD_TYPE, DEFAULT_RECORD_TYPE).toString(); - this.isKey = isKey; - } - - @Override - public byte[] fromConnectData(String topic, Schema schema, Object value) { - try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - recordSerializer.serialize(baos, - new StandardRecord(recordType).setField(toFieldRecursive(FieldDictionary.RECORD_VALUE, schema, value, isKey))); - return baos.toByteArray(); - } catch (IOException ioe) { - throw new DataException("Unexpected IO Exception occurred while serializing data [topic " + topic + "]", ioe); - } - - } - - @Override - public SchemaAndValue toConnectData(String topic, byte[] value) { - try (ByteArrayInputStream bais = new ByteArrayInputStream(value)) { - Record r = recordSerializer.deserialize(bais); - Schema schema = toSchemaRecursive(r); - return new SchemaAndValue(schema, toObjectRecursive(r, schema)); - } catch (IOException ioe) { - throw new DataException("Unexpected IO Exception occurred while serializing data [topic " + topic + "]", ioe); - } - } - - - public Object toObjectRecursive(Object o, Schema schema) { - if (o instanceof Collection) { - return ((Collection) o).stream().map(elem -> toObjectRecursive(elem, schema.schema())); - } else if (o instanceof Map) { - Struct ret = new Struct(schema); - ((Map) o).forEach((k, v) -> ret.put(k.toString(), toObjectRecursive(o, schema.field(k.toString()).schema()))); - return ret; - } else if (o instanceof Record) { - Struct ret = new Struct(schema); - ((Record) o).getAllFieldsSorted().forEach(field -> ret.put(field.getName(), toObjectRecursive(field.getRawValue(), schema.field(field.getName()).schema()))); - return ret; - } - return o; - } - - private SchemaBuilder toSchemaRecursive(Object o) { - if (o instanceof Byte) { - return SchemaBuilder.bytes().optional(); - } else if (o instanceof Short) { - return SchemaBuilder.int16().optional(); - } else if (o instanceof Integer) { - return SchemaBuilder.int32().optional(); - - } else if (o instanceof Long) { - return SchemaBuilder.int64().optional(); - - } else if (o instanceof Float) { - return SchemaBuilder.float32().optional(); - - } else if (o instanceof Double) { - return SchemaBuilder.float64().optional(); - - } else if (o instanceof Boolean) { - return SchemaBuilder.bool().optional(); - } else if (o instanceof byte[]) { - return SchemaBuilder.bytes().optional(); - } else if (o instanceof Collection) { - return SchemaBuilder.array(toSchemaRecursive((Array.getLength(o) > 0 ? 
Array.get(o, 0) : null))).optional(); - } else if (o instanceof Map) { - SchemaBuilder sb = SchemaBuilder.struct(); - ((Map) o).forEach((k, v) -> sb.field(k.toString(), toSchemaRecursive(v))); - return sb.optional(); - } else if (o instanceof Record) { - SchemaBuilder sb = SchemaBuilder.struct(); - ((Record) o).getAllFieldsSorted().forEach(field -> sb.field(field.getName(), toSchemaRecursive(field.getRawValue()))); - return sb.optional(); - } - return SchemaBuilder.string().optional(); - } - - - private Field toFieldRecursive(String name, Schema schema, Object value, boolean isKey) { - try { - if (value == null) { - return new Field(name, FieldType.NULL, null); - } - final Schema.Type schemaType; - if (schema == null) { - schemaType = ConnectSchema.schemaType(value.getClass()); - if (schemaType == null) - throw new DataException("Java class " + value.getClass() + " does not have corresponding schema type."); - } else { - schemaType = schema.type(); - } - switch (schemaType) { - case INT8: - case INT16: - case INT32: - return new Field(name, FieldType.INT, value); - case INT64: - Object toSet = value; - if (value instanceof Date) { - toSet = ((Date) value).getTime(); - } else if (value instanceof Instant) { - toSet = ((Instant) value).toEpochMilli(); - } - return new Field(name, FieldType.LONG, toSet); - case FLOAT32: - return new Field(name, FieldType.FLOAT, value); - case FLOAT64: - return new Field(name, FieldType.DOUBLE, value); - case BOOLEAN: - return new Field(name, FieldType.BOOLEAN, value); - case STRING: - return new Field(name, FieldType.STRING, value); - case BYTES: - byte[] bytes = null; - if (value instanceof byte[]) { - bytes = (byte[]) value; - } else if (value instanceof ByteBuffer) { - bytes = ((ByteBuffer) value).array(); - } else { - //throw new DataException("Invalid type for bytes type: " + value.getClass()); - //AM: fix to handle special cases (see oracle jdbc issues) - return new Field(name, FieldType.STRING, value != null ? value.toString() : value); - } - return new Field(name, FieldType.BYTES, bytes); - case ARRAY: { - return new Field(name, FieldType.ARRAY, - ((Collection) value).stream().map(item -> { - Schema valueSchema = schema == null ? 
null : schema.valueSchema(); - return toFieldRecursive(FieldDictionary.RECORD_VALUE, valueSchema, item, true); - }) - .map(Field::getRawValue) - .collect(Collectors.toList())); - } - case MAP: { - return new Field(name, FieldType.MAP, new LinkedHashMap<>((Map) value)); - } - case STRUCT: { - Struct struct = (Struct) value; - - if (struct.schema() != schema) { - throw new DataException("Mismatching schema."); - } - if (isKey) { - Map ret = new HashMap<>(); - struct.schema().fields().stream().filter(field -> !(field.schema().isOptional() && struct.get(field) == null)) - .forEach(field -> ret.put(field.name(), toFieldRecursive(field.name(), field.schema(), struct.get(field), true).getRawValue())); - return new Field(name, FieldType.MAP, ret); - } else { - Record ret = new StandardRecord(); - struct.schema().fields().stream() - .filter(field -> !(field.schema().isOptional() && struct.get(field) == null)) - .forEach(field -> ret.setField(toFieldRecursive(field.name(), field.schema(), struct.get(field), true))); - return new Field(name, FieldType.RECORD, ret); - } - - } - } - throw new DataException("Couldn't convert " + value + " to a logisland Record."); - } catch (ClassCastException e) { - throw new DataException("Invalid type for " + schema.type() + ": " + value.getClass()); - } - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/KafkaConnectStreamSink.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/KafkaConnectStreamSink.java deleted file mode 100644 index d84170198..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/KafkaConnectStreamSink.java +++ /dev/null @@ -1,139 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.connect.sink; - -import com.google.common.collect.ListMultimap; -import com.google.common.collect.Multimaps; -import com.hurence.logisland.connect.AbstractKafkaConnectComponent; -import com.hurence.logisland.connect.source.KafkaConnectStreamSourceProvider; -import org.apache.kafka.clients.consumer.OffsetAndMetadata; -import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.connect.data.SchemaAndValue; -import org.apache.kafka.connect.sink.SinkConnector; -import org.apache.kafka.connect.sink.SinkRecord; -import org.apache.kafka.connect.sink.SinkTask; -import org.apache.kafka.connect.storage.Converter; -import org.apache.kafka.connect.storage.OffsetBackingStore; -import org.apache.spark.sql.SQLContext; -import scala.Tuple2; - -import java.util.*; -import java.util.concurrent.atomic.AtomicLong; - -/** - * Kafka {@link SinkConnector} to logisland adapter. 
- * - * @author amarziali - */ -public class KafkaConnectStreamSink extends AbstractKafkaConnectComponent { - - - private final ListMultimap bufferedRecords = Multimaps.synchronizedListMultimap( - Multimaps.newListMultimap(new HashMap<>(), ArrayList::new)); - private final Map contexts = new IdentityHashMap<>(); - - private final Map> partitions = Collections.synchronizedMap(new HashMap<>()); - - - private final String topic; - private final AtomicLong counter = new AtomicLong(); - - /** - * Base constructor. - * - * @param sqlContext the spark sql context. - * @param connectorProperties the connector related properties. - * @param keyConverter the converter for the data key - * @param valueConverter the converter for the data body - * @param offsetBackingStore the backing store implementation (can be in-memory, file based, kafka based, etc...) - * @param maxTasks the maximum theoretical number of tasks this source should spawn. - * @param connectorClass the class of kafka connect source connector to wrap. - * @param streamId the id of the underlying stream - */ - public KafkaConnectStreamSink(SQLContext sqlContext, - Map connectorProperties, - Converter keyConverter, - Converter valueConverter, - OffsetBackingStore offsetBackingStore, - int maxTasks, - String topic, - String connectorClass, - String streamId) { - super(sqlContext, connectorProperties, keyConverter, valueConverter, offsetBackingStore, maxTasks, connectorClass, streamId); - this.topic = topic; - } - - - @Override - protected void initialize(SinkTask task) { - SimpleSinkTaskContext sstc = new SimpleSinkTaskContext(topic); - task.initialize(sstc); - contexts.put(task, sstc); - } - - public boolean openPartition(int partition) { - Tuple2 ret = partitions.computeIfAbsent(partition, - part -> { - SinkTask task = tasks.get(partition % tasks.size()); - TopicPartition tp = new TopicPartition(topic, part); - task.open(Collections.singleton(tp)); - SimpleSinkTaskContext tk = contexts.get(task); - return Tuple2.apply(task, tk); - }); - - return ret._2().assignThenState(partition); - } - - public void enqueueOnPartition(int partition, byte[] key, byte[] value) { - SchemaAndValue keySV = keyConverter.toConnectData(topic, key); - SchemaAndValue valueSV = valueConverter.toConnectData(topic, value); - - bufferedRecords.put(partition, - new SinkRecord(topic, - partition, - keySV.schema(), - keySV.value(), - valueSV.schema(), - valueSV.value(), - counter.incrementAndGet())); - } - - public void flushPartition(int partition) { - List records = bufferedRecords.get(partition); - if (!records.isEmpty()) { - partitions.get(partition)._1().put(records); - partitions.get(partition)._1().flush(Collections.singletonMap( - new TopicPartition(topic, partition), - new OffsetAndMetadata(records.get(records.size() - 1).kafkaOffset())) - ); - bufferedRecords.removeAll(partition); - - } - } - - @Override - protected void stopAllTasks() { - try { - super.stopAllTasks(); - } finally { - counter.set(0); - contexts.clear(); - bufferedRecords.clear(); - partitions.clear(); - } - - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/NullSink.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/NullSink.java deleted file mode 100644 index 34b2bbcc1..000000000 --- 
a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/NullSink.java +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.connect.sink; - -import org.apache.kafka.clients.consumer.OffsetAndMetadata; -import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.common.config.ConfigDef; -import org.apache.kafka.connect.connector.Task; -import org.apache.kafka.connect.sink.SinkConnector; -import org.apache.kafka.connect.sink.SinkRecord; -import org.apache.kafka.connect.sink.SinkTask; - -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -/** - * A busybox {@link SinkConnector} - * - * @author amarziali - */ -public class NullSink extends SinkConnector { - - public static class NullSinkTask extends SinkTask { - @Override - public void start(Map props) { - - } - - @Override - public void put(Collection records) { - - } - - @Override - public void flush(Map offsets) { - - } - - @Override - public void stop() { - - } - - @Override - public String version() { - return ""; - } - } - - @Override - public String version() { - return ""; - } - - @Override - public void start(Map props) { - - } - - @Override - public Class taskClass() { - return NullSinkTask.class; - } - - @Override - public List> taskConfigs(int maxTasks) { - return IntStream.range(0, maxTasks).mapToObj(i -> Collections.emptyMap()).collect(Collectors.toList()); - } - - @Override - public void stop() { - - } - - @Override - public ConfigDef config() { - return new ConfigDef(); - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/SimpleSinkTaskContext.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/SimpleSinkTaskContext.java deleted file mode 100644 index 05c662318..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/sink/SimpleSinkTaskContext.java +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.connect.sink; - -import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.connect.sink.SinkTaskContext; - -import java.util.*; -import java.util.stream.Collectors; - -/** - * A simple version of {@link SinkTaskContext} - * - * @author amarziali - */ -public class SimpleSinkTaskContext implements SinkTaskContext { - - private final Map state = Collections.synchronizedMap(new HashMap<>()); - private final String topic; - - public SimpleSinkTaskContext(String topic) { - this.topic = topic; - } - - @Override - public void offset(Map offsets) { - //not implemented - } - - @Override - public void offset(TopicPartition tp, long offset) { - //not implemented - } - - @Override - public void timeout(long timeoutMs) { - //not implemented - } - - @Override - public Set assignment() { - return state.entrySet().stream().filter(Map.Entry::getValue) - .map(entry -> new TopicPartition(topic, entry.getKey())) - .collect(Collectors.toSet()); - } - - @Override - public void pause(TopicPartition... partitions) { - Arrays.stream(partitions).map(TopicPartition::partition).forEach(p -> state.put(p, false)); - - } - - @Override - public void resume(TopicPartition... partitions) { - Arrays.stream(partitions).map(TopicPartition::partition).forEach(p -> state.put(p, true)); - - } - - @Override - public void requestCommit() { - - } - - public boolean assignThenState(int partition) { - return state.computeIfAbsent(partition, p -> true); - } - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSource.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSource.java deleted file mode 100644 index dabf03390..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSource.java +++ /dev/null @@ -1,286 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.connect.source; - - -import com.hurence.logisland.connect.AbstractKafkaConnectComponent; -import com.hurence.logisland.stream.spark.provider.StreamOptions; -import com.hurence.logisland.util.spark.SparkPlatform; -import org.apache.commons.lang3.StringUtils; -import org.apache.kafka.connect.runtime.WorkerSourceTaskContext; -import org.apache.kafka.connect.source.SourceConnector; -import org.apache.kafka.connect.source.SourceRecord; -import org.apache.kafka.connect.source.SourceTask; -import org.apache.kafka.connect.storage.Converter; -import org.apache.kafka.connect.storage.OffsetBackingStore; -import org.apache.kafka.connect.storage.OffsetStorageReaderImpl; -import org.apache.kafka.connect.storage.OffsetStorageWriter; -import org.apache.kafka.connect.util.ConnectorTaskId; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.execution.streaming.Offset; -import org.apache.spark.sql.execution.streaming.SerializedOffset; -import org.apache.spark.sql.execution.streaming.Source; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; -import org.apache.spark.unsafe.types.UTF8String; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import scala.Option; -import scala.Tuple2; -import scala.collection.JavaConversions; - -import java.util.*; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import java.util.stream.StreamSupport; - -/** - * Kafka connect to spark sql streaming bridge. - * - * @author amarziali - */ -public class KafkaConnectStreamSource extends AbstractKafkaConnectComponent implements Source { - - - /** - * The Schema used for this source. - */ - public final static StructType SCHEMA = new StructType(new StructField[]{ - new StructField(StreamOptions.KAFKA_CONNECT_CONNECTOR_PROPERTIES().getName(), - DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType), false, Metadata.empty()), - new StructField(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER().getName(), - DataTypes.StringType, false, Metadata.empty()), - new StructField(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER_PROPERTIES().getName(), - DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType), false, Metadata.empty()), - new StructField(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER().getName(), - DataTypes.StringType, false, Metadata.empty()), - new StructField(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER_PROPERTIES().getName(), - DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType), false, Metadata.empty()), - new StructField(StreamOptions.KAFKA_CONNECT_MAX_TASKS().getName(), - DataTypes.createMapType(DataTypes.IntegerType, DataTypes.StringType), false, Metadata.empty()) - }); - /** - * The schema used to represent the outgoing dataframe. 
- */ - public final static StructType DATA_SCHEMA = new StructType(new StructField[]{ - new StructField("topic", DataTypes.StringType, false, Metadata.empty()), - new StructField("sourcePartition", DataTypes.StringType, false, Metadata.empty()), - new StructField("sourceOffset", DataTypes.StringType, false, Metadata.empty()), - new StructField("key", DataTypes.BinaryType, true, Metadata.empty()), - new StructField("value", DataTypes.BinaryType, false, Metadata.empty()) - - }); - private final static Logger LOGGER = LoggerFactory.getLogger(KafkaConnectStreamSource.class); - - private final AtomicLong counter = new AtomicLong(); - private final AtomicInteger taskCounter = new AtomicInteger(); - - private final Map offsetWriterMap = new IdentityHashMap<>(); - private final SortedMap>> bufferedRecords = - Collections.synchronizedSortedMap(new TreeMap<>()); - private final SortedMap>> uncommittedRecords = - Collections.synchronizedSortedMap(new TreeMap<>()); - private final Map busyTasks = Collections.synchronizedMap(new IdentityHashMap<>()); - - private final SparkPlatform sparkPlatform = StreamSupport.stream( - Spliterators.spliteratorUnknownSize(ServiceLoader.load(SparkPlatform.class).iterator(), Spliterator.ORDERED), - false).findFirst().orElseThrow(() -> new IllegalStateException("SparkPlatform service spi not defined. " + - "Unable to continue")); - - - /** - * Base constructor. Should be called by {@link KafkaConnectStreamSourceProvider} - * - * @param sqlContext the spark sql context. - * @param connectorProperties the connector related properties. - * @param keyConverter the converter for the data key - * @param valueConverter the converter for the data body - * @param offsetBackingStore the backing store implementation (can be in-memory, file based, kafka based, etc...) - * @param maxTasks the maximum theoretical number of tasks this source should spawn. - * @param connectorClass the class of kafka connect source connector to wrap. 
- * @param streamId the id of the underlying stream - */ - public KafkaConnectStreamSource(SQLContext sqlContext, - Map connectorProperties, - Converter keyConverter, - Converter valueConverter, - OffsetBackingStore offsetBackingStore, - int maxTasks, - String connectorClass, - String streamId) { - super(sqlContext, connectorProperties, keyConverter, valueConverter, offsetBackingStore, maxTasks, connectorClass, streamId); - - } - - - @Override - protected void initialize(SourceTask task) { - int taskId = taskCounter.incrementAndGet(); - ConnectorTaskId connectorTaskId = new ConnectorTaskId(StringUtils.join(new String[]{streamId, connectorName}, '#'), taskId); - task.initialize(new WorkerSourceTaskContext(new OffsetStorageReaderImpl(offsetBackingStore, connectorTaskId.toString(), - createInternalConverter(true), createInternalConverter(false)))); - offsetWriterMap.put(task, new OffsetStorageWriter(offsetBackingStore, connectorTaskId.toString(), - createInternalConverter(true), createInternalConverter(false))); - - } - - - @Override - public StructType schema() { - return SCHEMA; - } - - @Override - protected void createAndStartAllTasks() throws IllegalAccessException, InstantiationException, ClassNotFoundException { - counter.set(0); - taskCounter.set(0); - busyTasks.clear(); - bufferedRecords.clear(); - offsetWriterMap.clear(); - super.createAndStartAllTasks(); - } - - @Override - public synchronized Option getOffset() { - if (!uncommittedRecords.isEmpty()) { - return Option.apply(SerializedOffset.apply(Long.toString(counter.incrementAndGet()))); - } - if (bufferedRecords.isEmpty()) { - tasks.forEach(t -> busyTasks.computeIfAbsent(t, sourceTask -> { - Thread thread = new Thread(() -> { - try { - List> tmp = sourceTask.poll().stream() - .map(sourceRecord -> Tuple2.apply(sourceTask, sourceRecord)) - .collect(Collectors.toList()); - if (!tmp.isEmpty()) { - bufferedRecords.put(counter.incrementAndGet(), tmp); - } - } catch (InterruptedException ie) { - LOGGER.warn("Task {} interrupted while waiting.", sourceTask.getClass().getCanonicalName()); - } finally { - busyTasks.remove(t); - } - }); - thread.start(); - return thread; - })); - } else { - return Option.apply(SerializedOffset.apply(bufferedRecords.lastKey().toString())); - - } - return Option.empty(); - } - - - @Override - public Dataset getBatch(Option start, Offset end) { - Long startOff = start.isDefined() ? Long.parseLong(start.get().json()) : - !bufferedRecords.isEmpty() ? 
bufferedRecords.firstKey() : 0L; - - Map> current = - new LinkedHashMap<>(bufferedRecords.subMap(startOff, Long.parseLong(end.json()) + 1)) - .keySet().stream() - .flatMap(offset -> { - List> srl = bufferedRecords.remove(offset); - if (srl != null) { - uncommittedRecords.put(offset, srl); - return srl.stream(); - } - return Stream.empty(); - }) - .map(Tuple2::_2) - .map(sourceRecord -> InternalRow.fromSeq(JavaConversions.asScalaBuffer(Arrays.asList( - toUTFString(sourceRecord.topic()), - toUTFString(sourceRecord.sourcePartition()), - toUTFString(sourceRecord.sourceOffset()), - keyConverter.fromConnectData(sourceRecord.topic(), sourceRecord.keySchema(), sourceRecord.key()), - valueConverter.fromConnectData(sourceRecord.topic(), sourceRecord.valueSchema(), sourceRecord.value()) - )).toSeq())) - .collect(Collectors.groupingBy(row -> Objects.hashCode((row.getString(1))))); - return sparkPlatform.createStreamingDataFrame(sqlContext, new SimpleRDD(sqlContext.sparkContext(), current), DATA_SCHEMA); - - - } - - private UTF8String toUTFString(Object o) { - if (o != null) { - return UTF8String.fromString(o.toString()); - } - return UTF8String.EMPTY_UTF8; - } - - @Override - public void commit(Offset end) { - if (uncommittedRecords.isEmpty()) { - return; - } - //first commit all offsets already given - List> recordsToCommit = - new LinkedHashMap<>(uncommittedRecords.subMap(uncommittedRecords.firstKey(), Long.parseLong(end.json()) + 1)).keySet().stream() - .flatMap(key -> uncommittedRecords.remove(key).stream()) - .collect(Collectors.toList()); - - recordsToCommit.forEach(tuple -> { - try { - offsetWriterMap.get(tuple._1()).offset(tuple._2().sourcePartition(), tuple._2().sourceOffset()); - tuple._1().commitRecord(tuple._2()); - } catch (Exception e) { - LOGGER.warn("Unable to commit record " + tuple._2(), e); - } - }); - recordsToCommit.stream().map(Tuple2::_1).distinct().forEach(sourceTask -> { - try { - sourceTask.commit(); - } catch (Exception e) { - LOGGER.warn("Unable to bulk commit offset for connector " + connectorName, e); - } - }); - //now flush offset writer - offsetWriterMap.values().forEach(offsetStorageWriter -> { - try { - if (offsetStorageWriter.beginFlush()) { - offsetStorageWriter.doFlush((error, result) -> { - if (error == null) { - LOGGER.debug("Flushing till offset {} with result {}", end, result); - } else { - LOGGER.error("Unable to commit records till source offset " + end, error); - - } - }).get(30, TimeUnit.SECONDS); - } - } catch (Exception e) { - LOGGER.error("Unable to commit records till source offset " + end, e); - } - }); - } - - - @Override - public void stop() { - super.stop(); - } - -} - - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSourceProvider.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSourceProvider.java deleted file mode 100644 index 8d97a277c..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/KafkaConnectStreamSourceProvider.java +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in 
compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.connect.source; - -import com.hurence.logisland.connect.Utils; -import com.hurence.logisland.stream.spark.provider.StreamOptions; -import org.apache.kafka.connect.source.SourceConnector; -import org.apache.kafka.connect.storage.Converter; -import org.apache.kafka.connect.storage.OffsetBackingStore; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.execution.streaming.Source; -import org.apache.spark.sql.sources.StreamSourceProvider; -import org.apache.spark.sql.types.StructType; -import scala.Option; -import scala.Tuple2; -import scala.collection.immutable.Map; - -/** - * A {@link StreamSourceProvider} capable of creating spark {@link com.hurence.logisland.stream.spark.structured.StructuredStream} - * enabled kafka sources. - * - * @author amarziali - */ -public class KafkaConnectStreamSourceProvider implements StreamSourceProvider { - - @Override - public Source createSource(SQLContext sqlContext, String metadataPath, Option schema, String providerName, Map parameters) { - try { - Converter keyConverter = Utils.createConverter(parameters.get(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER().getName()).get(), - parameters.get(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER_PROPERTIES().getName()).get(), true); - Converter valueConverter = Utils.createConverter(parameters.get(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER().getName()).get(), - parameters.get(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER_PROPERTIES().getName()).get(), false); - //create the right backing store - String bs = parameters.get(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE().getName()).get(); - java.util.Map offsetBackingStoreProperties = - Utils.propertiesToMap(parameters.get(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE_PROPERTIES().getName()).get()); - OffsetBackingStore offsetBackingStore = Utils.createOffsetBackingStore(bs, offsetBackingStoreProperties); - - KafkaConnectStreamSource ret = new KafkaConnectStreamSource(sqlContext, - Utils.propertiesToMap(parameters.get(StreamOptions.KAFKA_CONNECT_CONNECTOR_PROPERTIES().getName()).get()), - keyConverter, - valueConverter, - offsetBackingStore, - Integer.parseInt(parameters.get(StreamOptions.KAFKA_CONNECT_MAX_TASKS().getName()).get()), - parameters.get(StreamOptions.KAFKA_CONNECT_CONNECTOR_CLASS().getName()).get(), - parameters.get("path").get()); - ret.start(); - return ret; - } catch (Exception e) { - throw new IllegalArgumentException("Unable to create kafka connect stream source: " + e.getMessage(), e); - } - - - } - - @Override - public Tuple2 sourceSchema(SQLContext sqlContext, Option schema, String providerName, Map parameters) { - return Tuple2.apply(providerName, KafkaConnectStreamSource.DATA_SCHEMA); - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimplePartition.java 
b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimplePartition.java deleted file mode 100644 index 171e74bd4..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimplePartition.java +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.connect.source; - -import org.apache.spark.Partition; - -/** - * Simple partition. - * - * @author amarziali - */ -public class SimplePartition implements Partition { - - private final int index; - private final int hash; - - public SimplePartition(int index, int hash) { - this.index = index; - this.hash = hash; - } - - @Override - public int index() { - return index; - } - - public int getHash() { - return hash; - } - - @Override - public String toString() { - return "SimplePartition{" + - "index=" + index + - ", hash=" + hash + - '}'; - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimpleRDD.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimpleRDD.java deleted file mode 100644 index 9768dcf30..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/SimpleRDD.java +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.connect.source; - -import org.apache.spark.Dependency; -import org.apache.spark.Partition; -import org.apache.spark.SparkContext; -import org.apache.spark.TaskContext; -import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.catalyst.InternalRow; -import scala.collection.Iterator; -import scala.collection.JavaConversions; -import scala.reflect.ClassTag$; - -import java.util.Collections; -import java.util.List; -import java.util.Map; - -/** - * Simple kafka connect source partitioned RDD. 
- * - * @author amarziali - */ -public class SimpleRDD extends RDD { - - final Map> data; - - public SimpleRDD(SparkContext _sc, Map> data) { - super(_sc, JavaConversions.collectionAsScalaIterable(Collections.>emptyList()).toSeq(), - ClassTag$.MODULE$.apply(InternalRow.class)); - this.data = data; - } - - @Override - public Iterator compute(Partition split, TaskContext context) { - return JavaConversions.collectionAsScalaIterable(data.get(((SimplePartition)split).getHash())).iterator(); - } - - @Override - public Partition[] getPartitions() { - Partition[] ret = new SimplePartition[data.size()]; - int j = 0; - for (Integer i : data.keySet()) { - ret[j] = new SimplePartition(j, i); - j++; - } - return ret; - - } - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceConnector.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceConnector.java deleted file mode 100644 index 65a29c89e..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceConnector.java +++ /dev/null @@ -1,136 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.connect.source.timed; - -import org.apache.kafka.common.config.ConfigDef; -import org.apache.kafka.connect.connector.Task; -import org.apache.kafka.connect.source.SourceConnector; - -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * A connector that emits an empty record at fixed rate waking up the processing pipeline. 
- * - * @author amarziali - */ -public class ClockSourceConnector extends SourceConnector { - - public static final String RATE = "rate"; - public static final String POLL_CRON_SCHEDULER_CONFIG = "poll.cron"; - public static final String SNAPSHOT_FIELD_CONFIG = "snapshot.field"; - public static final String TSID_FIELD_CONFIG = "tsid.field"; - public static final String DATE_FIELD_CONFIG = "date.field"; - public static final String DATE_FORMAT_CONFIG = "date.format"; - public static final String DATE_TIMEZONE_CONFIG = "date.timezone"; - public static final String RECORD_ID_FIELD_CONFIG = "record.id.field"; - public static final String HAS_ONGOING_RECORD_CONFIG = "has.ongoing.record"; - public static final String HAS_PREVIOUS_RECORD_CONFIG = "has.previous.record"; - public static final String CURRENT_RECORD_ID_VALUE_CONFIG = "current.record.id.value"; - - - public static final long RATE_DEFAULT = 60000; - public static final String POLL_CRON_SCHEDULER_DEFAULT = null; - public static final String SNAPSHOT_FIELD_DEFAULT = null; - public static final String TSID_FIELD_DEFAULT = null; - public static final String DATE_FIELD_DEFAULT = null; - public static final String DATE_FORMAT_DEFAULT = "yyyy-MM-dd HH:mm:ss z"; - public static final String DATE_TIMEZONE_DEFAULT = "CET"; - public static final String RECORD_ID_FIELD_DEFAULT = "id"; - public static final boolean HAS_ONGOING_RECORD_DEFAULT = false; - public static final boolean HAS_PREVIOUS_RECORD_DEFAULT = false; - public static final String CURRENT_RECORD_ID_VALUE_DEFAULT = "clockRecord"; - - - private static final ConfigDef CONFIG = new ConfigDef() - .define(RATE, ConfigDef.Type.LONG, RATE_DEFAULT, ConfigDef.Importance.HIGH, "The clock rate in milliseconds") - .define(POLL_CRON_SCHEDULER_CONFIG, ConfigDef.Type.STRING, POLL_CRON_SCHEDULER_DEFAULT, ConfigDef.Importance.HIGH, "The cron expression") - .define(SNAPSHOT_FIELD_CONFIG, ConfigDef.Type.STRING, SNAPSHOT_FIELD_DEFAULT, ConfigDef.Importance.HIGH, "Name of the field containing the snapshot id") - .define(TSID_FIELD_CONFIG, ConfigDef.Type.STRING, TSID_FIELD_DEFAULT, ConfigDef.Importance.HIGH, "Name of the field containing the ordering column") - .define(DATE_FIELD_CONFIG, ConfigDef.Type.STRING, DATE_FIELD_DEFAULT, ConfigDef.Importance.HIGH, "Name of the field containing the date in human readable format") - .define(DATE_FORMAT_CONFIG, ConfigDef.Type.STRING, DATE_FORMAT_DEFAULT, ConfigDef.Importance.HIGH, "Format to use to display date in human readable-format") - .define(DATE_TIMEZONE_CONFIG, ConfigDef.Type.STRING, DATE_TIMEZONE_DEFAULT, ConfigDef.Importance.HIGH, "Timezone to use to display date in human readable-format") - .define(RECORD_ID_FIELD_CONFIG, ConfigDef.Type.STRING, RECORD_ID_FIELD_DEFAULT, ConfigDef.Importance.HIGH, "Name of the field containing the id of the record") - .define(HAS_ONGOING_RECORD_CONFIG, ConfigDef.Type.BOOLEAN, HAS_ONGOING_RECORD_DEFAULT, ConfigDef.Importance.HIGH, "If set to true, it will produce an additional record with ongoing snapshot details") - .define(HAS_PREVIOUS_RECORD_CONFIG, ConfigDef.Type.BOOLEAN, HAS_PREVIOUS_RECORD_DEFAULT, ConfigDef.Importance.HIGH, "If set to true, it will produce an additional record with previous snapshot details") - .define(CURRENT_RECORD_ID_VALUE_CONFIG, ConfigDef.Type.STRING, CURRENT_RECORD_ID_VALUE_DEFAULT, ConfigDef.Importance.HIGH, "Specifies the id value of the record"); - - private long rate; - private String recordIdField; - private String currentRecordIdValue; - private String recordSnapshotField; - private String 
cronExprValue; - private String tsidField; - private String dateField; - private boolean hasOngoingRecordDefault; - private boolean hasPreviousRecordDefault; - private String formatDateValue; - private String timezoneDateValue; - - @Override - public String version() { - return "1.0"; - } - - @Override - public void start(Map props) { - rate = (Long) CONFIG.parse(props).get(RATE); - recordIdField = (String) CONFIG.parse(props).get(RECORD_ID_FIELD_CONFIG); - recordSnapshotField = (String) CONFIG.parse(props).get(SNAPSHOT_FIELD_CONFIG); - cronExprValue = (String) CONFIG.parse(props).get(POLL_CRON_SCHEDULER_CONFIG); - tsidField = (String) CONFIG.parse(props).get(TSID_FIELD_CONFIG); - dateField = (String) CONFIG.parse(props).get(DATE_FIELD_CONFIG); - hasOngoingRecordDefault = (boolean) CONFIG.parse(props).get(HAS_ONGOING_RECORD_CONFIG); - hasPreviousRecordDefault = (boolean) CONFIG.parse(props).get(HAS_PREVIOUS_RECORD_CONFIG); - currentRecordIdValue = (String) CONFIG.parse(props).get(CURRENT_RECORD_ID_VALUE_CONFIG); - formatDateValue = (String) CONFIG.parse(props).get(DATE_FORMAT_CONFIG); - timezoneDateValue = (String) CONFIG.parse(props).get(DATE_TIMEZONE_CONFIG); - } - - @Override - public Class taskClass() { - return ClockSourceTask.class; - } - - @Override - public List> taskConfigs(int maxTasks) { - Map mapConfig = new HashMap<>(); - mapConfig.put(RATE, Long.toString(rate)); - mapConfig.put(RECORD_ID_FIELD_CONFIG, recordIdField); - mapConfig.put(CURRENT_RECORD_ID_VALUE_CONFIG, currentRecordIdValue); - mapConfig.put(SNAPSHOT_FIELD_CONFIG, recordSnapshotField); - mapConfig.put(POLL_CRON_SCHEDULER_CONFIG, cronExprValue); - mapConfig.put(TSID_FIELD_CONFIG, tsidField); - mapConfig.put(DATE_FIELD_CONFIG, dateField); - mapConfig.put(HAS_ONGOING_RECORD_CONFIG, Boolean.toString(hasOngoingRecordDefault)); - mapConfig.put(HAS_PREVIOUS_RECORD_CONFIG, Boolean.toString(hasPreviousRecordDefault)); - mapConfig.put(DATE_FORMAT_CONFIG, formatDateValue); - mapConfig.put(DATE_TIMEZONE_CONFIG, timezoneDateValue); - return Collections.singletonList(mapConfig); - } - - @Override - public void stop() { - } - - @Override - public ConfigDef config() { - - return CONFIG; - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceTask.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceTask.java deleted file mode 100644 index 9e33b5278..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/connect/source/timed/ClockSourceTask.java +++ /dev/null @@ -1,250 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.connect.source.timed; - -import org.apache.kafka.common.utils.SystemTime; -import org.apache.kafka.common.utils.Time; -import org.apache.kafka.connect.data.Schema; -import org.apache.kafka.connect.data.SchemaBuilder; -import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.source.SourceRecord; -import org.apache.kafka.connect.source.SourceTask; - -import java.text.SimpleDateFormat; -import java.util.*; - -import org.springframework.scheduling.support.CronSequenceGenerator; - - -/** - * {@link SourceTask} for {@link ClockSourceConnector} - * - * @author amarziali, jarnou - * - * The ClockSourceTask is a kafka connect service controller designed to - * generate a/(a set of) record(s) on a regular basis. - * It can be either on a rate (in milliseconds) or on a cron basis (cron expression). - * - * Note: If both rate and cron are specified in the configuration of the controller, - * the cron takes precedence over the rate. - * - * It is possible to add a field in the produced record containing a value - * (number of seconds since 1970) corresponding to the time at which the record has been - * produced and another field containing the date in a human-readable format in a - * specified timezone (CET being the default) - * - * By the way: - * It is also possible in addition to the standard record produced to generate 2 additional records. - * One is containing the data (snaphot, date) for the ongoing 'clock', and another one for the - * previous one. - * The use-case for these 2 additional records is the update of a table of snapshots for instance. - * The idea is to have a list of all snapshots, as well as the ongoing and the previous one. - * - */ -public class ClockSourceTask extends SourceTask { - - private Time time; - private long rate; - private String cronExpr; - private String recordIdField; - private String recordIdValue; - private String snapshotField; - private String tsidField; - private String dateField; - private CronSequenceGenerator cronSeqGen = null; - private boolean useCron = false; - private boolean useSnapshot = false; - private boolean useTSID = false; - private boolean useDate = false; - private boolean hasOngoingRecord = false; - private boolean hasPreviousRecord = false; - private long recordSnapshot = -1; // Uniquely identifies a poll/retrieval of the data from a src - private Schema finalSchema = null; - private long previousRecordSnapshot = -1; - private static long TSID_DEFAULT = -1; - private String dateFormat; - private String dateTimezone; - - - - @Override - public void start(Map props) { - this.time = new SystemTime(); - rate = Long.parseLong(props.get(ClockSourceConnector.RATE)); - cronExpr = props.get(ClockSourceConnector.POLL_CRON_SCHEDULER_CONFIG); - recordIdField = props.get(ClockSourceConnector.RECORD_ID_FIELD_CONFIG); - snapshotField = props.get(ClockSourceConnector.SNAPSHOT_FIELD_CONFIG); - tsidField = props.get(ClockSourceConnector.TSID_FIELD_CONFIG); - dateField = props.get(ClockSourceConnector.DATE_FIELD_CONFIG); - recordIdValue = props.get(ClockSourceConnector.CURRENT_RECORD_ID_VALUE_CONFIG); - dateFormat = props.get(ClockSourceConnector.DATE_FORMAT_CONFIG); - dateTimezone = props.get(ClockSourceConnector.DATE_TIMEZONE_CONFIG); - - // Check if cron should be used && Generate a cron object once for further use - if ((cronExpr != null) && (cronExpr.isEmpty() != true)) { - useCron = CronSequenceGenerator.isValidExpression(cronExpr); - } - - if (useCron) { - cronSeqGen = new 
CronSequenceGenerator(cronExpr); - } - useSnapshot = (snapshotField != null) ? true : false; - useTSID = (tsidField != null) ? true : false; - useDate = (dateField != null) ? true : false; - hasOngoingRecord = new Boolean(props.get(ClockSourceConnector.HAS_ONGOING_RECORD_CONFIG)); - hasPreviousRecord = new Boolean(props.get(ClockSourceConnector.HAS_PREVIOUS_RECORD_CONFIG)); - - // Build the schema if not created yet - if (finalSchema == null) { - SchemaBuilder newSchema = SchemaBuilder.struct(); - newSchema.field(recordIdField, Schema.STRING_SCHEMA); - if (useSnapshot) { - newSchema.field(snapshotField, Schema.INT64_SCHEMA); - } - if (useTSID) { - newSchema.field(tsidField, Schema.INT64_SCHEMA); - } - if (useDate) { - newSchema.field(dateField, Schema.STRING_SCHEMA); - } - finalSchema = newSchema.build(); - } - } - - @Override - public List poll() throws InterruptedException { - final long untilNext; - if (useCron) { - Date nextTriggerDate = cronSeqGen.next(new Date(time.milliseconds())); - long nextTriggerDateInMs = nextTriggerDate.getTime(); - untilNext = nextTriggerDateInMs - time.milliseconds(); - if (useSnapshot) { - recordSnapshot = nextTriggerDateInMs ; - } - time.sleep(untilNext); - } - else { - if (useSnapshot){ - recordSnapshot = (time.milliseconds()+rate) ; - } - Thread.sleep(rate); - } - - Struct recordVal = new Struct(finalSchema); - recordVal.put(recordIdField, recordIdValue); - if (useSnapshot) { - recordVal.put(snapshotField, recordSnapshot); - if (useDate){ - String jdate = secToString(recordSnapshot, dateFormat, dateTimezone); - recordVal.put(dateField, jdate); - } - } - if (useTSID) { - recordVal.put(tsidField, recordSnapshot); - } - - SourceRecord sr = new SourceRecord( - null, - null, - "", - finalSchema, - recordVal); - - if ( ! hasOngoingRecord && ! hasPreviousRecord ) { - return Collections.singletonList(sr); - } - else { - List listRecords = new LinkedList<>(); - listRecords.add(sr); - - if (useSnapshot) { - // Build ongoing record (if requested) - if (hasOngoingRecord){ - Struct orVal = new Struct(finalSchema); - orVal.put(recordIdField, "ongoing"); - if (useSnapshot) { - orVal.put(snapshotField, recordSnapshot); - if (useDate){ - String jdate = secToString(recordSnapshot, dateFormat, dateTimezone); - orVal.put(dateField, jdate); - } - } - if (useTSID) { - orVal.put(tsidField, TSID_DEFAULT); - } - - SourceRecord or = new SourceRecord( - null, - null, - "", - finalSchema, - orVal); - listRecords.add(or); - } - - // Build previous record (if requested) - if (hasPreviousRecord && previousRecordSnapshot > 0) { - Struct prVal = new Struct(finalSchema); - prVal.put(recordIdField, "previous"); - if (useSnapshot) { - prVal.put(snapshotField, previousRecordSnapshot); - if (useDate){ - String jdate = secToString(previousRecordSnapshot, dateFormat, dateTimezone); - prVal.put(dateField, jdate); - } - } - if (useTSID) { - prVal.put(tsidField, TSID_DEFAULT); - } - - SourceRecord pr = new SourceRecord(null, - null, - "", - finalSchema, - prVal); - listRecords.add(pr); - } - previousRecordSnapshot = recordSnapshot; - } - - return listRecords; - } - } - - @Override - public void stop() { - - } - - @Override - public String version() { - return "1.0"; - } - - /* - * Return the timeInSec in a Human Readable - * format (dateFormat) using the timezone given in parameter. 
- */ - private String secToString(long timeInSec, String dateFormat, String timezone){ - //convert seconds to milliseconds - Date date = new Date(timeInSec); - // format of the date - SimpleDateFormat jdf = new SimpleDateFormat(dateFormat); - jdf.setTimeZone(TimeZone.getTimeZone(timezone)); - String jdate = jdf.format(date); - return jdate; - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/PipelineConfigurationBroadcastWrapper.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/PipelineConfigurationBroadcastWrapper.java deleted file mode 100644 index da7c39477..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/PipelineConfigurationBroadcastWrapper.java +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.engine.spark.remote; - -import com.hurence.logisland.engine.EngineContext; -import com.hurence.logisland.processor.ProcessContext; -import com.hurence.logisland.stream.StreamContext; -import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.broadcast.Broadcast; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Collection; -import java.util.Map; -import java.util.stream.Collectors; - -/** - * A {@link Broadcast} wrapper for a Stream pipeline configuration. - * This class allow to magically synchronize data modified from the spark driver to every executor. 
- * - * @author amarziali - */ -public class PipelineConfigurationBroadcastWrapper { - private static final Logger logger = LoggerFactory.getLogger(PipelineConfigurationBroadcastWrapper.class); - - private Broadcast>> broadcastedPipelineMap; - - private static PipelineConfigurationBroadcastWrapper obj = new PipelineConfigurationBroadcastWrapper(); - - private PipelineConfigurationBroadcastWrapper() { - } - - public static PipelineConfigurationBroadcastWrapper getInstance() { - return obj; - } - - public JavaSparkContext getSparkContext(SparkContext sc) { - JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc); - return jsc; - } - - public void refresh(Map> pipelineMap, SparkContext sparkContext) { - logger.info("Refreshing dataflow pipelines!"); - - if (broadcastedPipelineMap != null) { - broadcastedPipelineMap.unpersist(); - } - broadcastedPipelineMap = getSparkContext(sparkContext).broadcast(pipelineMap); - } - - public void refresh(EngineContext engineContext, SparkContext sparkContext) { - logger.info("Refreshing dataflow pipelines!"); - - if (broadcastedPipelineMap != null) { - broadcastedPipelineMap.unpersist(); - } - broadcastedPipelineMap = getSparkContext(sparkContext).broadcast(engineContext.getStreamContexts().stream() - .collect(Collectors.toMap(StreamContext::getIdentifier, s -> s.getProcessContexts().stream().collect(Collectors.toList())))); - - } - - - public Collection get(String streamName) { - return broadcastedPipelineMap.getValue().get(streamName); - } -} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiClient.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiClient.java deleted file mode 100644 index 49e3ca5e1..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiClient.java +++ /dev/null @@ -1,247 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.engine.spark.remote; - -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.SerializationFeature; -import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; -import com.hurence.logisland.engine.spark.remote.model.DataFlow; -import okhttp3.*; -import okhttp3.internal.http.HttpDate; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.validation.ConstraintViolation; -import javax.validation.ConstraintViolationException; -import javax.validation.Validation; -import javax.validation.Validator; -import javax.ws.rs.core.HttpHeaders; -import javax.ws.rs.core.MediaType; -import java.time.Duration; -import java.time.Instant; -import java.util.Date; -import java.util.Iterator; -import java.util.Optional; -import java.util.Set; -import java.util.concurrent.TimeUnit; - -/** - * Rest client wrapper for logisland remote APIs. - * - * @author amarziali - */ -public class RemoteApiClient { - - /** - * Conversation state. - */ - public static class State { - public Instant lastModified; - } - - /** - * Connection settings. - */ - public static class ConnectionSettings { - - private final String baseUrl; - private final Duration socketTimeout; - private final Duration connectTimeout; - private final String username; - private final String password; - - /** - * Constructs a new instance. - * If username and password are provided, the client will be configured to supply a basic authentication. - * - * @param baseUrl the base url - * @param socketTimeout the read/write socket timeout - * @param connectTimeout the connection socket timeout - * @param username the username if a basic authentication is needed. - * @param password the password if a basic authentication is needed. - */ - public ConnectionSettings(String baseUrl, Duration socketTimeout, Duration connectTimeout, String username, String password) { - this.baseUrl = baseUrl; - this.socketTimeout = socketTimeout; - this.connectTimeout = connectTimeout; - this.username = username; - this.password = password; - } - } - - private static final Logger logger = LoggerFactory.getLogger(RemoteApiClient.class); - - private static final String DATAFLOW_RESOURCE_URI = "dataflows"; - private static final String STREAM_RESOURCE_URI = "streams"; - - - private static final Validator validator = Validation.buildDefaultValidatorFactory().getValidator(); - - private final OkHttpClient client; - private final HttpUrl baseUrl; - private final ObjectMapper mapper; - - - /** - * Construct a new instance with provided connection settings. - * - * @param connectionSettings the {@link ConnectionSettings} - */ - public RemoteApiClient(ConnectionSettings connectionSettings) { - this.baseUrl = HttpUrl.parse(connectionSettings.baseUrl); - this.mapper = new ObjectMapper(); - mapper.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES); - mapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); - mapper.registerModule(new JavaTimeModule()) - .findAndRegisterModules(); - - OkHttpClient.Builder builder = new OkHttpClient() - .newBuilder() - .readTimeout(connectionSettings.socketTimeout.toMillis(), TimeUnit.MILLISECONDS) - .writeTimeout(connectionSettings.socketTimeout.toMillis(), TimeUnit.MILLISECONDS) - .connectTimeout(connectionSettings.connectTimeout.toMillis(), TimeUnit.MILLISECONDS) - .followRedirects(true) - .followSslRedirects(true); - //add basic auth if needed. 
- if (connectionSettings.username != null && connectionSettings.password != null) { - builder.addInterceptor(chain -> { - Request originalRequest = chain.request(); - Request requestWithBasicAuth = originalRequest - .newBuilder() - .header(HttpHeaders.AUTHORIZATION, Credentials.basic(connectionSettings.username, connectionSettings.password)) - .build(); - return chain.proceed(requestWithBasicAuth); - }); - } - this.client = builder.build(); - } - - - /** - * Generic method to fetch and validate a HTTP resource. - * - * @param url the resource Url. - * @param state the conversation state. - * @param resourceClass the bean model class. - * @param the type of the model data to return. - * @return an {@link Optional} bean containing requested validated data. - */ - private Optional doFetch(HttpUrl url, State state, Class resourceClass) { - Request.Builder request = new Request.Builder() - .url(url).addHeader(HttpHeaders.ACCEPT, MediaType.APPLICATION_JSON); - - if (state.lastModified != null) { - request.addHeader(HttpHeaders.IF_MODIFIED_SINCE, HttpDate.format(new Date((state.lastModified.toEpochMilli())))); - } - - try (Response response = client.newCall(request.build()).execute()) { - if (response.code() != javax.ws.rs.core.Response.Status.NOT_MODIFIED.getStatusCode()) { - - if (!response.isSuccessful()) { - logger.error("Error refreshing {} from remote server. Got code {}", resourceClass.getCanonicalName(), response.code()); - } else { - String lm = response.header(HttpHeaders.LAST_MODIFIED); - if (lm != null) { - try { - Date tmp = HttpDate.parse(lm); - if (tmp != null) { - state.lastModified = tmp.toInstant(); - } - } catch (Exception e) { - logger.warn("Unable to correctly parse Last-Modified Header"); - } - } - T ret = mapper.readValue(response.body().byteStream(), resourceClass); - //validate against javax.validation annotations. - doValidate(ret); - return Optional.of(ret); - } - } - } catch (Exception e) { - logger.error("Unable to refresh dataflow from remote server", e); - } - - return Optional.empty(); - } - - /** - * Perform validation of the given bean. - * - * @param bean the instance to validate - * @see javax.validation.Validator#validate - */ - private void doValidate(Object bean) { - Set> result = validator.validate(bean); - if (!result.isEmpty()) { - StringBuilder sb = new StringBuilder("Bean validation failed: "); - for (Iterator> it = result.iterator(); it.hasNext(); ) { - ConstraintViolation violation = it.next(); - sb.append(violation.getPropertyPath()).append(" - ").append(violation.getMessage()); - if (it.hasNext()) { - sb.append("; "); - } - } - throw new ConstraintViolationException(sb.toString(), result); - } - } - - /** - * Fetches dataflow from a remote server. - * - * @param dataflowName the name of the dataflow to fetch. - * @param state the conversation state (never null) - * @return a optional {@link DataFlow} (never null). Empty in case of error or no results. - */ - public Optional fetchDataflow(String dataflowName, State state) { - return doFetch(baseUrl.newBuilder().addPathSegment(DATAFLOW_RESOURCE_URI).addPathSegment(dataflowName).build(), - state, DataFlow.class); - } - - /** - * Push a dataflow configuration to a remote server. - * We do not care about http result code since the call is fire and forget. - * - * @param dataflowName the name of the dataflow to push - * @param dataFlow the item to push. 
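The client removed above keeps a per-conversation State so that repeated polls can send If-Modified-Since and only deserialize a DataFlow when the server reports a change. A minimal, hypothetical polling sketch against the constructor and fetchDataflow signatures shown above (the base URL and dataflow name are placeholders):

    RemoteApiClient.ConnectionSettings settings = new RemoteApiClient.ConnectionSettings(
            "http://remote-api:8080/api/v1",   // hypothetical base URL
            Duration.ofSeconds(10),            // read/write socket timeout
            Duration.ofSeconds(5),             // connect timeout
            null, null);                       // no basic authentication
    RemoteApiClient client = new RemoteApiClient(settings);

    RemoteApiClient.State state = new RemoteApiClient.State();               // carries Last-Modified between calls
    Optional<DataFlow> dataflow = client.fetchDataflow("my-dataflow", state);
    dataflow.ifPresent(df -> { /* apply the new configuration */ });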
- */ - public void pushDataFlow(String dataflowName, DataFlow dataFlow) { - try { - Request request = new Request.Builder() - .url(baseUrl.newBuilder() - .addPathSegment(DATAFLOW_RESOURCE_URI).addPathSegment(dataflowName) - .build()) - .addHeader(HttpHeaders.ACCEPT, MediaType.APPLICATION_JSON) - .post(dataFlow != null ? - RequestBody.create(okhttp3.MediaType.parse(MediaType.APPLICATION_JSON), - - mapper.writeValueAsString(dataFlow)) : - RequestBody.create(null, new byte[0])) - .build(); - try (Response response = client.newCall(request).execute()) { - if (!response.isSuccessful()) { - logger.warn("Expected application to answer with 200 OK. Got {}", response.code()); - } - } - - - } catch (Exception e) { - logger.warn("Unexpected exception trying to push latest dataflow configuration", e); - } - } - - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiComponentFactory.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiComponentFactory.java deleted file mode 100644 index a9748cffc..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/RemoteApiComponentFactory.java +++ /dev/null @@ -1,250 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.engine.spark.remote; - -import com.hurence.logisland.component.ComponentFactory; -import com.hurence.logisland.component.ConfigurableComponent; -import com.hurence.logisland.component.PropertyDescriptor; -import com.hurence.logisland.config.ControllerServiceConfiguration; -import com.hurence.logisland.controller.ControllerService; -import com.hurence.logisland.controller.ControllerServiceInitializationContext; -import com.hurence.logisland.controller.StandardControllerServiceContext; -import com.hurence.logisland.engine.EngineContext; -import com.hurence.logisland.engine.spark.remote.model.*; -import com.hurence.logisland.processor.ProcessContext; -import com.hurence.logisland.processor.StandardProcessContext; -import com.hurence.logisland.stream.RecordStream; -import com.hurence.logisland.stream.StandardStreamContext; -import com.hurence.logisland.stream.StreamContext; -import org.apache.spark.SparkContext; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.function.Function; -import java.util.stream.Collectors; - -/** - * ] - * Component factory resolving logisland components from remote api model. 
- * - * @author amarziali - */ -public class RemoteApiComponentFactory { - - private static final Logger logger = LoggerFactory.getLogger(RemoteApiComponentFactory.class); - - - /** - * Instantiates a stream from of configuration - * - * @param stream - * @return - */ - public StreamContext getStreamContext(Stream stream) { - try { - final RecordStream recordStream = ComponentFactory.loadComponent(stream.getComponent()); - final StreamContext instance = - new StandardStreamContext(recordStream, stream.getName()); - - // instantiate each related processor - stream.getPipeline().getProcessors().stream() - .map(this::getProcessContext) - .forEach(instance::addProcessContext); - - - // set the config properties - configureComponent(recordStream, stream.getConfig()) - .forEach((k, s) -> instance.setProperty(k, s)); - if (!instance.isValid()) { - throw new IllegalArgumentException("Stream is not valid"); - } - - logger.info("created stream {}", stream.getName()); - return instance; - - } catch (ClassNotFoundException e) { - throw new RuntimeException("unable to instantiate stream " + stream.getName(), e); - } - } - - /** - * Constructs processors. - * - * @param processor the processor bean. - * @return optionally the constructed processor context or nothing in case of error. - */ - public ProcessContext getProcessContext(Processor processor) { - try { - final com.hurence.logisland.processor.Processor processorInstance = ComponentFactory.loadComponent(processor.getComponent()); - final ProcessContext processContext = - new StandardProcessContext(processorInstance, processor.getName()); - - // set all properties - configureComponent(processorInstance, processor.getConfig()) - .forEach((k, s) -> processContext.setProperty(k, s)); - ; - - if (!processContext.isValid()) { - throw new IllegalArgumentException("Processor is not valid"); - } - - - logger.info("created processor {}", processor); - return processContext; - } catch (ClassNotFoundException e) { - throw new RuntimeException("unable to instantiate processor " + processor.getName(), e); - } - - } - - - /** - * Constructs controller services. - * - * @param service the service bean. - * @return optionally the constructed service configuration or nothing in case of error. - */ - public ControllerServiceConfiguration getControllerServiceConfiguration(Service service) { - try { - ControllerService cs = ComponentFactory.loadComponent(service.getComponent()); - ControllerServiceConfiguration configuration = new ControllerServiceConfiguration(); - configuration.setControllerService(service.getName()); - configuration.setComponent(service.getComponent()); - configuration.setDocumentation(service.getDocumentation()); - configuration.setType("service"); - configuration.setConfiguration(configureComponent(cs, service.getConfig())); - ControllerServiceInitializationContext ic = new StandardControllerServiceContext(cs, service.getName()); - configuration.getConfiguration().forEach((k, s) -> ic.setProperty(k, s)); - if (!ic.isValid()) { - throw new IllegalArgumentException("Service is not valid"); - } - logger.info("created service {}", service.getName()); - return configuration; - } catch (Exception e) { - throw new RuntimeException("unable to instantiate service " + service.getName(), e); - } - - - } - - /** - * Updates the state of the engine if needed. - * - * @param sparkContext the spark context - * @param engineContext the engineContext - * @param dataflow the new dataflow (new state) - * @param oldDataflow latest dataflow dataflow. 
- */ - public boolean updateEngineContext(SparkContext sparkContext, EngineContext engineContext, DataFlow dataflow, DataFlow oldDataflow) { - boolean changed = false; - if (oldDataflow == null || oldDataflow.getLastModified().isBefore(dataflow.getLastModified())) { - logger.info("We have a new configuration. Resetting current engine"); - logger.info("Configuring dataflow. Last change at {} is {}", dataflow.getLastModified(), dataflow.getModificationReason()); - - - List css = dataflow.getServices().stream() - .map(this::getControllerServiceConfiguration) - .collect(Collectors.toList()); - - List sc = dataflow.getStreams().stream() - .map(this::getStreamContext) - .collect(Collectors.toList()); - - sc.forEach(streamContext -> { - if (!streamContext.isValid()) { - throw new IllegalArgumentException("Unable to validate steam " + streamContext.getIdentifier()); - } - }); - - logger.info("Restarting engine"); - engineContext.getEngine().reset(engineContext); - css.forEach(engineContext::addControllerServiceConfiguration); - sc.forEach(engineContext::addStreamContext); - - PipelineConfigurationBroadcastWrapper.getInstance().refresh( - engineContext.getStreamContexts().stream() - .collect(Collectors.toMap(StreamContext::getIdentifier, StreamContext::getProcessContexts)) - , sparkContext); - updatePipelines(sparkContext, engineContext, dataflow.getStreams()); - engineContext.getEngine().start(engineContext); - changed = true; - - } else { - //need to update pipelines? - - Map streamMap = dataflow.getStreams().stream().collect(Collectors.toMap(Stream::getName, Function.identity())); - - List mergedStreamList = new ArrayList<>(); - for (Stream oldStream : oldDataflow.getStreams()) { - Stream newStream = streamMap.get(oldStream.getName()); - if (newStream != null && oldStream.getPipeline().getLastModified().isBefore(newStream.getPipeline().getLastModified())) { - changed = true; - logger.info("Detected change for pipeline {}", newStream.getName()); - mergedStreamList.add(newStream); - } else { - mergedStreamList.add(oldStream); - } - } - if (changed) { - updatePipelines(sparkContext, engineContext, mergedStreamList); - } - - } - return changed; - } - - - /** - * Update pipelines. - * - * @param sparkContext the spark context - * @param engineContext the engine context. 
- * @param streams the list of streams - */ - public void updatePipelines(SparkContext sparkContext, EngineContext engineContext, Collection streams) { - Map> pipelineMap = streams.stream() - .collect(Collectors.toMap(Stream::getName, - s -> s.getPipeline().getProcessors().stream().map(this::getProcessContext) - .collect(Collectors.toList()))); - engineContext.getStreamContexts().forEach(streamContext -> { - streamContext.getProcessContexts().clear(); - streamContext.getProcessContexts().addAll(pipelineMap.get(streamContext.getIdentifier())); - }); - - PipelineConfigurationBroadcastWrapper.getInstance().refresh(pipelineMap, sparkContext); - } - - private Map configureComponent(ConfigurableComponent component, Collection properties) { - final Map propertyMap = properties.stream().collect(Collectors.toMap(Property::getKey, Function.identity())); - return propertyMap.keySet().stream().map(component::getPropertyDescriptor) - .filter(propertyDescriptor -> propertyDescriptor != null) - .filter(propertyDescriptor -> propertyMap.containsKey(propertyDescriptor.getName()) || - (propertyDescriptor.getDefaultValue() != null && propertyDescriptor.isRequired())) - .collect(Collectors.toMap(PropertyDescriptor::getName, propertyDescriptor -> { - String value = propertyDescriptor.getDefaultValue(); - if (propertyMap.containsKey(propertyDescriptor.getName())) { - value = propertyMap.get(propertyDescriptor.getName()).getValue(); - } - return value; - })); - } - - -} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Component.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Component.java deleted file mode 100755 index 50ac7ad5a..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Component.java +++ /dev/null @@ -1,186 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
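Taken together, RemoteApiClient and RemoteApiComponentFactory implement the remote-configuration loop: fetch the latest DataFlow, then reconcile it with the running engine. A hypothetical sketch of that loop, assuming a client, sparkContext, engineContext and a lastSeenDataflow variable are in scope:

    RemoteApiClient.State state = new RemoteApiClient.State();
    RemoteApiComponentFactory factory = new RemoteApiComponentFactory();

    Optional<DataFlow> fetched = client.fetchDataflow("my-dataflow", state);   // hypothetical dataflow name
    if (fetched.isPresent()) {
        // Restarts the engine on a newer dataflow, or only swaps the pipelines whose lastModified changed.
        boolean changed = factory.updateEngineContext(sparkContext, engineContext, fetched.get(), lastSeenDataflow);
        if (changed) {
            lastSeenDataflow = fetched.get();
        }
    }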
- */ -package com.hurence.logisland.engine.spark.remote.model; - -import com.fasterxml.jackson.annotation.JsonProperty; -import io.swagger.annotations.ApiModelProperty; - -import javax.validation.Valid; -import javax.validation.constraints.NotNull; -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; - -/** - * Component - */ -@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") - -public class Component { - @JsonProperty("name") - private String name = null; - - @JsonProperty("component") - private String component = null; - - @JsonProperty("documentation") - private String documentation = null; - - @JsonProperty("config") - @Valid - private List config = new ArrayList<>(); - - public Component name(String name) { - this.name = name; - return this; - } - - /** - * Get name - * - * @return name - **/ - @ApiModelProperty(required = true, value = "") - @NotNull - - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public Component component(String component) { - this.component = component; - return this; - } - - /** - * Get component - * - * @return component - **/ - @ApiModelProperty(required = true, value = "") - @NotNull - - - public String getComponent() { - return component; - } - - public void setComponent(String component) { - this.component = component; - } - - public Component documentation(String documentation) { - this.documentation = documentation; - return this; - } - - /** - * Get documentation - * - * @return documentation - **/ - @ApiModelProperty(value = "") - - - public String getDocumentation() { - return documentation; - } - - public void setDocumentation(String documentation) { - this.documentation = documentation; - } - - public Component config(List config) { - this.config = config; - return this; - } - - public Component addConfigItem(Property configItem) { - if (this.config == null) { - this.config = new ArrayList(); - } - this.config.add(configItem); - return this; - } - - /** - * Get config - * - * @return config - **/ - @ApiModelProperty(value = "") - - @Valid - - public List getConfig() { - return config; - } - - public void setConfig(List config) { - this.config = config; - } - - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - Component component = (Component) o; - return Objects.equals(this.name, component.name) && - Objects.equals(this.component, component.component) && - Objects.equals(this.documentation, component.documentation) && - Objects.equals(this.config, component.config); - } - - @Override - public int hashCode() { - return Objects.hash(name, component, documentation, config); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("class Component {\n"); - - sb.append(" name: ").append(toIndentedString(name)).append("\n"); - sb.append(" component: ").append(toIndentedString(component)).append("\n"); - sb.append(" documentation: ").append(toIndentedString(documentation)).append("\n"); - sb.append(" config: ").append(toIndentedString(config)).append("\n"); - sb.append("}"); - return sb.toString(); - } - - /** - * Convert the given object to string with each line indented by 4 spaces - * (except the first line). 
- */ - private String toIndentedString(Object o) { - if (o == null) { - return "null"; - } - return o.toString().replace("\n", "\n "); - } -} - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/DataFlow.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/DataFlow.java deleted file mode 100755 index d10fc7a76..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/DataFlow.java +++ /dev/null @@ -1,144 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.engine.spark.remote.model; - -import com.fasterxml.jackson.annotation.JsonProperty; -import io.swagger.annotations.ApiModel; -import io.swagger.annotations.ApiModelProperty; - -import javax.validation.Valid; -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; - -/** - * A streaming pipeline. - */ -@ApiModel(description = "A streaming pipeline.") -@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") - -public class DataFlow extends Versioned { - @JsonProperty("services") - @Valid - private List services = new ArrayList<>(); - - @JsonProperty("streams") - @Valid - private List streams = new ArrayList<>(); - - public DataFlow services(List services) { - this.services = services; - return this; - } - - public DataFlow addServicesItem(Service servicesItem) { - if (this.services == null) { - this.services = new ArrayList(); - } - this.services.add(servicesItem); - return this; - } - - /** - * The service controllers. - * - * @return services - **/ - @ApiModelProperty(value = "The service controllers.") - - @Valid - - public List getServices() { - return services; - } - - public void setServices(List services) { - this.services = services; - } - - public DataFlow streams(List streams) { - this.streams = streams; - return this; - } - - public DataFlow addStreamsItem(Stream streamsItem) { - if (this.streams == null) { - this.streams = new ArrayList(); - } - this.streams.add(streamsItem); - return this; - } - - /** - * The engine properties. 
- * - * @return streams - **/ - @ApiModelProperty(value = "The engine properties.") - - @Valid - - public List getStreams() { - return streams; - } - - public void setStreams(List streams) { - this.streams = streams; - } - - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - DataFlow dataFlow = (DataFlow) o; - return Objects.equals(this.services, dataFlow.services) && - Objects.equals(this.streams, dataFlow.streams) && - super.equals(o); - } - - @Override - public int hashCode() { - return Objects.hash(services, streams, super.hashCode()); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("class DataFlow {\n"); - sb.append(" ").append(toIndentedString(super.toString())).append("\n"); - sb.append(" services: ").append(toIndentedString(services)).append("\n"); - sb.append(" streams: ").append(toIndentedString(streams)).append("\n"); - sb.append("}"); - return sb.toString(); - } - - /** - * Convert the given object to string with each line indented by 4 spaces - * (except the first line). - */ - private String toIndentedString(Object o) { - if (o == null) { - return "null"; - } - return o.toString().replace("\n", "\n "); - } -} - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Pipeline.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Pipeline.java deleted file mode 100755 index 2d08c33fb..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Pipeline.java +++ /dev/null @@ -1,108 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.engine.spark.remote.model; - -import com.fasterxml.jackson.annotation.JsonProperty; -import io.swagger.annotations.ApiModel; -import io.swagger.annotations.ApiModelProperty; - -import javax.validation.Valid; -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; - -/** - * Tracks stream processing pipeline configuration - */ -@ApiModel(description = "Tracks stream processing pipeline configuration") -@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") - -public class Pipeline extends Versioned { - @JsonProperty("processors") - @Valid - private List processors = new ArrayList<>(); - - public Pipeline processors(List processors) { - this.processors = processors; - return this; - } - - public Pipeline addProcessorsItem(Processor processorsItem) { - if (this.processors == null) { - this.processors = new ArrayList(); - } - this.processors.add(processorsItem); - return this; - } - - /** - * Get processors - * - * @return processors - **/ - @ApiModelProperty(value = "") - - @Valid - - public List getProcessors() { - return processors; - } - - public void setProcessors(List processors) { - this.processors = processors; - } - - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - Pipeline pipeline = (Pipeline) o; - return Objects.equals(this.processors, pipeline.processors) && - super.equals(o); - } - - @Override - public int hashCode() { - return Objects.hash(processors, super.hashCode()); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("class Pipeline {\n"); - sb.append(" ").append(toIndentedString(super.toString())).append("\n"); - sb.append(" processors: ").append(toIndentedString(processors)).append("\n"); - sb.append("}"); - return sb.toString(); - } - - /** - * Convert the given object to string with each line indented by 4 spaces - * (except the first line). - */ - private String toIndentedString(Object o) { - if (o == null) { - return "null"; - } - return o.toString().replace("\n", "\n "); - } -} - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Processor.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Processor.java deleted file mode 100755 index 1350a44c5..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Processor.java +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.engine.spark.remote.model; - -import io.swagger.annotations.ApiModel; - -import java.util.Objects; - -/** - * A logisland 'processor'. - */ -@ApiModel(description = "A logisland 'processor'.") -@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") - -public class Processor extends Component { - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - return true; - } - - @Override - public int hashCode() { - return Objects.hash(super.hashCode()); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("class Processor {\n"); - sb.append(" ").append(toIndentedString(super.toString())).append("\n"); - sb.append("}"); - return sb.toString(); - } - - /** - * Convert the given object to string with each line indented by 4 spaces - * (except the first line). - */ - private String toIndentedString(Object o) { - if (o == null) { - return "null"; - } - return o.toString().replace("\n", "\n "); - } -} - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Property.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Property.java deleted file mode 100755 index 739cc1707..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Property.java +++ /dev/null @@ -1,147 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.engine.spark.remote.model; - -import com.fasterxml.jackson.annotation.JsonProperty; -import io.swagger.annotations.ApiModelProperty; - -import javax.validation.constraints.NotNull; -import java.util.Objects; - -/** - * Property - */ -@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") - -public class Property { - @JsonProperty("key") - private String key = null; - - @JsonProperty("type") - private String type = "string"; - - @JsonProperty("value") - private String value = null; - - public Property key(String key) { - this.key = key; - return this; - } - - /** - * Get key - * - * @return key - **/ - @ApiModelProperty(required = true, value = "") - @NotNull - - - public String getKey() { - return key; - } - - public void setKey(String key) { - this.key = key; - } - - public Property type(String type) { - this.type = type; - return this; - } - - /** - * Get type - * - * @return type - **/ - @ApiModelProperty(value = "") - - - public String getType() { - return type; - } - - public void setType(String type) { - this.type = type; - } - - public Property value(String value) { - this.value = value; - return this; - } - - /** - * Get value - * - * @return value - **/ - @ApiModelProperty(required = true, value = "") - @NotNull - - - public String getValue() { - return value; - } - - public void setValue(String value) { - this.value = value; - } - - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - Property property = (Property) o; - return Objects.equals(this.key, property.key) && - Objects.equals(this.type, property.type) && - Objects.equals(this.value, property.value); - } - - @Override - public int hashCode() { - return Objects.hash(key, type, value); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("class Property {\n"); - - sb.append(" key: ").append(toIndentedString(key)).append("\n"); - sb.append(" type: ").append(toIndentedString(type)).append("\n"); - sb.append(" value: ").append(toIndentedString(value)).append("\n"); - sb.append("}"); - return sb.toString(); - } - - /** - * Convert the given object to string with each line indented by 4 spaces - * (except the first line). - */ - private String toIndentedString(Object o) { - if (o == null) { - return "null"; - } - return o.toString().replace("\n", "\n "); - } -} - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Service.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Service.java deleted file mode 100755 index a4834f747..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Service.java +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.engine.spark.remote.model; - -import io.swagger.annotations.ApiModel; - -import java.util.Objects; - -/** - * A logisland 'controller service'. - */ -@ApiModel(description = "A logisland 'controller service'.") -@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") - -public class Service extends Component { - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - return true; - } - - @Override - public int hashCode() { - return Objects.hash(super.hashCode()); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("class Service {\n"); - sb.append(" ").append(toIndentedString(super.toString())).append("\n"); - sb.append("}"); - return sb.toString(); - } - - /** - * Convert the given object to string with each line indented by 4 spaces - * (except the first line). - */ - private String toIndentedString(Object o) { - if (o == null) { - return "null"; - } - return o.toString().replace("\n", "\n "); - } -} - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Stream.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Stream.java deleted file mode 100755 index 8dc999e90..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Stream.java +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.engine.spark.remote.model; - -import com.fasterxml.jackson.annotation.JsonProperty; -import io.swagger.annotations.ApiModelProperty; - -import javax.validation.Valid; -import java.util.Objects; - -/** - * Stream - */ -@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") -public class Stream extends Component { - @JsonProperty("pipeline") - private Pipeline pipeline = null; - - public Stream pipeline(Pipeline pipeline) { - this.pipeline = pipeline; - return this; - } - - /** - * Get pipeline - * - * @return pipeline - **/ - @ApiModelProperty(value = "") - - @Valid - - public Pipeline getPipeline() { - return pipeline; - } - - public void setPipeline(Pipeline pipeline) { - this.pipeline = pipeline; - } - - - @Override - public boolean equals(java.lang.Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - Stream stream = (Stream) o; - return Objects.equals(this.pipeline, stream.pipeline) && - super.equals(o); - } - - @Override - public int hashCode() { - return Objects.hash(pipeline, super.hashCode()); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("class Stream {\n"); - sb.append(" ").append(toIndentedString(super.toString())).append("\n"); - sb.append(" pipeline: ").append(toIndentedString(pipeline)).append("\n"); - sb.append("}"); - return sb.toString(); - } - - /** - * Convert the given object to string with each line indented by 4 spaces - * (except the first line). - */ - private String toIndentedString(java.lang.Object o) { - if (o == null) { - return "null"; - } - return o.toString().replace("\n", "\n "); - } -} - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Versioned.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Versioned.java deleted file mode 100755 index fbd57fc65..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/engine/spark/remote/model/Versioned.java +++ /dev/null @@ -1,125 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
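The model classes above (Component, Property, Pipeline, Processor, Service, Stream, DataFlow) are plain Swagger-generated beans, so a dataflow description is either deserialized from JSON or assembled by hand. A hypothetical construction sketch using only the setters and fluent helpers shown in the removed code (all names, keys and component classes are placeholders):

    Processor parser = new Processor();
    parser.setName("parser");                                  // hypothetical
    parser.setComponent("com.example.MyParserProcessor");      // hypothetical component class
    parser.addConfigItem(new Property().key("some.key").value("some value"));

    Stream stream = new Stream();
    stream.setName("events");                                  // hypothetical
    stream.setComponent("com.example.MyRecordStream");         // hypothetical stream class
    stream.setPipeline(new Pipeline().addProcessorsItem(parser));

    DataFlow dataflow = new DataFlow().addStreamsItem(stream);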
- */ -package com.hurence.logisland.engine.spark.remote.model; - -import com.fasterxml.jackson.annotation.JsonProperty; -import io.swagger.annotations.ApiModel; -import io.swagger.annotations.ApiModelProperty; - -import javax.validation.Valid; -import javax.validation.constraints.NotNull; -import java.time.OffsetDateTime; -import java.util.Objects; - -/** - * a versioned component - */ -@ApiModel(description = "a versioned component") -@javax.annotation.Generated(value = "io.swagger.codegen.languages.SpringCodegen", date = "2018-06-03T13:00:49.942Z") - -public class Versioned { - @JsonProperty("lastModified") - private OffsetDateTime lastModified = null; - - @JsonProperty("modificationReason") - private String modificationReason = null; - - public Versioned lastModified(OffsetDateTime lastModified) { - this.lastModified = lastModified; - return this; - } - - /** - * the last modified timestamp of this pipeline (used to trigger changes). - * - * @return lastModified - **/ - @ApiModelProperty(required = true, value = "the last modified timestamp of this pipeline (used to trigger changes).") - @NotNull - - @Valid - - public OffsetDateTime getLastModified() { - return lastModified; - } - - public void setLastModified(OffsetDateTime lastModified) { - this.lastModified = lastModified; - } - - public Versioned modificationReason(String modificationReason) { - this.modificationReason = modificationReason; - return this; - } - - /** - * Can be used to document latest changeset. - * - * @return modificationReason - **/ - @ApiModelProperty(value = "Can be used to document latest changeset.") - - - public String getModificationReason() { - return modificationReason; - } - - public void setModificationReason(String modificationReason) { - this.modificationReason = modificationReason; - } - - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - Versioned versioned = (Versioned) o; - return Objects.equals(this.lastModified, versioned.lastModified) && - Objects.equals(this.modificationReason, versioned.modificationReason); - } - - @Override - public int hashCode() { - return Objects.hash(lastModified, modificationReason); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("class Versioned {\n"); - - sb.append(" lastModified: ").append(toIndentedString(lastModified)).append("\n"); - sb.append(" modificationReason: ").append(toIndentedString(modificationReason)).append("\n"); - sb.append("}"); - return sb.toString(); - } - - /** - * Convert the given object to string with each line indented by 4 spaces - * (except the first line). 
- */ - private String toIndentedString(Object o) { - if (o == null) { - return "null"; - } - return o.toString().replace("\n", "\n "); - } -} - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProcessorMetrics.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProcessorMetrics.java deleted file mode 100644 index cfb5cddad..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProcessorMetrics.java +++ /dev/null @@ -1,123 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.util.spark; - - -import com.hurence.logisland.metrics.Names; -import com.hurence.logisland.record.FieldDictionary; -import com.hurence.logisland.record.Record; -import org.apache.spark.groupon.metrics.UserMetricsSystem; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - - -/** - * Created by tom on 09/09/16. 
- */ -public class ProcessorMetrics { - private static Logger logger = LoggerFactory.getLogger(ProcessorMetrics.class.getName()); - - public synchronized static void resetMetrics(final String metricPrefix) { - UserMetricsSystem.gauge(metricPrefix + Names.INCOMING_MESSAGES).set(0); - UserMetricsSystem.gauge(metricPrefix + Names.INCOMING_RECORDS).set(0); - UserMetricsSystem.gauge(metricPrefix + Names.OUTGOING_RECORDS).set(0); - UserMetricsSystem.gauge(metricPrefix + Names.ERRORS).set(0); - UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_FIELD_AVERAGE).set(0); - UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_RECORD_AVERAGE).set(0); - UserMetricsSystem.gauge(metricPrefix + Names.RECORDS_PER_SECOND_AVERAGE).set(0); - UserMetricsSystem.gauge(metricPrefix + Names.PROCESSED_BYTES).set(0); - UserMetricsSystem.gauge(metricPrefix + Names.PROCESSED_FIELDS).set(0); - UserMetricsSystem.gauge(metricPrefix + Names.ERROR_PERCENTAGE).set(0); - UserMetricsSystem.gauge(metricPrefix + Names.FIELDS_PER_RECORD_AVERAGE).set(0); - UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_SECOND_AVERAGE).set(0); - //UserMetricsSystem.gauge(metricPrefix + "processing_time_ms").set(0); - } - - - /** - * publish - * - * @param metricPrefix - * @param incomingEvents - * @param outgoingEvents - * @param fromOffset - * @param untilOffset - * @param processingDurationInMillis - */ - public synchronized static void computeMetrics( - final String metricPrefix, - final Collection incomingEvents, - final Collection outgoingEvents, - final long fromOffset, - final long untilOffset, - final long processingDurationInMillis) { - - - if ((outgoingEvents != null) && (outgoingEvents.size() != 0)) { - - UserMetricsSystem.gauge(metricPrefix + Names.INCOMING_MESSAGES).set(untilOffset - fromOffset); - UserMetricsSystem.gauge(metricPrefix + Names.INCOMING_RECORDS).set(incomingEvents.size()); - UserMetricsSystem.gauge(metricPrefix + Names.OUTGOING_RECORDS).set(outgoingEvents.size()); - - long errorCount = outgoingEvents.stream().filter(r -> r != null && r.hasField(FieldDictionary.RECORD_ERRORS)).count(); - UserMetricsSystem.gauge(metricPrefix + "errors").set(errorCount); - if (outgoingEvents.size() != 0) { - final List recordSizesInBytes = new ArrayList<>(); - final List recordNumberOfFields = new ArrayList<>(); - - outgoingEvents.forEach(record -> { - recordSizesInBytes.add(record.sizeInBytes()); - recordNumberOfFields.add(record.size()); - }); - - final int numberOfProcessedBytes = recordSizesInBytes.stream().mapToInt(Integer::intValue).sum(); - final int numberOfProcessedFields = recordNumberOfFields.stream().mapToInt(Integer::intValue).sum(); - - if (numberOfProcessedFields != 0) { - UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_FIELD_AVERAGE).set(numberOfProcessedBytes / numberOfProcessedFields); - } else { - UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_FIELD_AVERAGE).set(0); - } - if (processingDurationInMillis != 0) { - UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_SECOND_AVERAGE).set(numberOfProcessedBytes * 1000 / processingDurationInMillis); - UserMetricsSystem.gauge(metricPrefix + Names.RECORDS_PER_SECOND_AVERAGE).set(outgoingEvents.size() * 1000 / processingDurationInMillis); - } else { - UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_SECOND_AVERAGE).set(0); - UserMetricsSystem.gauge(metricPrefix + Names.RECORDS_PER_SECOND_AVERAGE).set(0); - } - - - UserMetricsSystem.gauge(metricPrefix + Names.PROCESSED_BYTES).set(numberOfProcessedBytes); - UserMetricsSystem.gauge(metricPrefix + 
Names.PROCESSED_FIELDS).set(numberOfProcessedFields); - - UserMetricsSystem.gauge(metricPrefix + Names.ERROR_PERCENTAGE).set((long) (100.0f * errorCount / outgoingEvents.size())); - UserMetricsSystem.gauge(metricPrefix + Names.FIELDS_PER_RECORD_AVERAGE).set(numberOfProcessedFields / outgoingEvents.size()); - UserMetricsSystem.gauge(metricPrefix + Names.BYTES_PER_RECORD_AVERAGE).set(numberOfProcessedBytes / outgoingEvents.size()); - } else if (errorCount > 0) - UserMetricsSystem.gauge(metricPrefix + Names.ERROR_PERCENTAGE).set(100L); - else - UserMetricsSystem.gauge(metricPrefix + Names.ERROR_PERCENTAGE).set(0L); - - - // UserMetricsSystem.gauge(metricPrefix + "processing_time_ms").set(processingDurationInMillis); - - } - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProtoBufRegistrator.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProtoBufRegistrator.java deleted file mode 100644 index 62e0159a2..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/ProtoBufRegistrator.java +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
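ProcessorMetrics.computeMetrics above derives all gauges from a single batch: record counts, error ratio, and bytes/fields per record and per second. A hypothetical call site, assuming a processing step that returns the outgoing records and the Kafka offset bounds of the batch (the metric prefix and the process(...) call are illustrative, not the engine's verbatim code):

    long start = System.currentTimeMillis();
    Collection<Record> outgoing = processor.process(processContext, incoming);   // assumed processor API
    ProcessorMetrics.computeMetrics(
            "logisland.parsing.",                    // hypothetical metric prefix
            incoming, outgoing,
            fromOffset, untilOffset,                 // Kafka offsets bounding the batch
            System.currentTimeMillis() - start);     // processing duration in milliseconds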
- */ -package com.hurence.logisland.util.spark; - -import com.esotericsoftware.kryo.Kryo; -import com.esotericsoftware.kryo.serializers.FieldSerializer; -import org.apache.spark.serializer.KryoRegistrator; -import org.eclipse.kura.core.message.protobuf.KuraPayloadProto; - - -public class ProtoBufRegistrator implements KryoRegistrator { - @Override - public void registerClasses(Kryo kryo) { - kryo.register(KuraPayloadProto.KuraPayload.class, new FieldSerializer(kryo, KuraPayloadProto.KuraPayload.class)); - } -} - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkConfigReader.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkConfigReader.java deleted file mode 100644 index 1f059e7e6..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkConfigReader.java +++ /dev/null @@ -1,59 +0,0 @@ -package com.hurence.logisland.util.spark; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; -import com.hurence.logisland.config.LogislandConfiguration; -import com.hurence.logisland.util.string.StringUtils; -import org.apache.spark.SparkContext; -import org.apache.spark.rdd.RDD; - -import java.util.Arrays; - -import static com.hurence.logisland.config.ConfigReader.checkLogislandConf; - -/** - * This configuration reader depends on spark. We do not want to place methods in this class in the - * com.hurence.logisland.config.ConfigReader class where the loadConfig (from local filesystem) method - * resides, as it would introduce a spark dependency in the logisland-framework module. Only the spark - * engine should have a spark dependency. So this class should be loaded from the StreamProcessingRunner - * and this will succeed only in environments where a spark 2 engine is available and used, otherwise it - * will fail to load. This will for instance be successful in the databricks environment, which is by the - * way the first purpose for which this class is being introduced. - */ -public class SparkConfigReader { - - /** - * Loads a YAML config file using (file located in the shared filesystem) - * - * @param configFilePath the path of the config file - * @return a LogislandSessionConfiguration - * @throws Exception - */ - public static LogislandConfiguration loadConfigFromSharedFS(String configFilePath) throws Exception { - ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); - - /** - * In Databricks, developers should utilize the shared SparkContext instead of creating one using the constructor. - * When running a job, you can access the shared context by calling SparkContext.getOrCreate(). 
- * - * Also in databricks, a path like /path/to/a/file will be loaded from DBFS so will be interpreted like - * dbfs:/path/to/a/file - */ - - SparkContext sparkContext = SparkContext.getOrCreate(); - - RDD configRdd = sparkContext.textFile(configFilePath, 1); - String[] configStringArray = (String[])configRdd.collect(); - String configString = String.join("\n", Arrays.asList(configStringArray)); - - // replace all host from environment variables - String fileContent = StringUtils.resolveEnvVars(configString, "localhost"); - - System.out.println("Configuration:\n" + fileContent); - - LogislandConfiguration logislandConf = mapper.readValue(fileContent, LogislandConfiguration.class); - checkLogislandConf(logislandConf); - - return logislandConf; - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkPlatform.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkPlatform.java deleted file mode 100644 index 32124abb3..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/java/com/hurence/logisland/util/spark/SparkPlatform.java +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
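The only entry point of SparkConfigReader is loadConfigFromSharedFS, which, per the class comment, is meant to be invoked from the StreamProcessingRunner when a Spark engine is available. A minimal, hypothetical invocation (the DBFS path is a placeholder):

    // Resolved as dbfs:/logisland/conf/my-job.yml when running on Databricks, per the comment above.
    LogislandConfiguration conf = SparkConfigReader.loadConfigFromSharedFS("/logisland/conf/my-job.yml");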
- */ -package com.hurence.logisland.util.spark; - -import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructType; - -public interface SparkPlatform { - Dataset createStreamingDataFrame(SQLContext sqlContext, RDD catalystRows, StructType schema); -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/resources/META-INF/services/org.apache.spark.metrics.sink.KafkaSink b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/resources/META-INF/services/org.apache.spark.metrics.sink.KafkaSink deleted file mode 100644 index ecb6d54f3..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/resources/META-INF/services/org.apache.spark.metrics.sink.KafkaSink +++ /dev/null @@ -1 +0,0 @@ -org.apache.spark.metrics.sink.KafkaSink \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/KafkaStreamProcessingEngine.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/KafkaStreamProcessingEngine.scala deleted file mode 100644 index c7ee30d8c..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/KafkaStreamProcessingEngine.scala +++ /dev/null @@ -1,659 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.hurence.logisland.engine.spark - - -import java.util -import java.util.concurrent.Executors -import java.util.regex.Pattern -import java.util.{Collections, UUID} - -import com.hurence.logisland.component.{AllowableValue, ComponentContext, PropertyDescriptor} -import com.hurence.logisland.engine.spark.remote.PipelineConfigurationBroadcastWrapper -import com.hurence.logisland.engine.{AbstractProcessingEngine, EngineContext} -import com.hurence.logisland.stream.spark.{AbstractKafkaRecordStream, SparkRecordStream} -import com.hurence.logisland.validator.StandardValidators -import org.apache.spark.groupon.metrics.UserMetricsSystem -import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.streaming.StreamingQueryListener -import org.apache.spark.streaming.{Milliseconds, StreamingContext} -import org.apache.spark.{SparkConf, SparkContext, SparkEnv} -import org.slf4j.LoggerFactory - -import scala.collection.JavaConversions._ - - -object KafkaStreamProcessingEngine { - - - val SPARK_PROPERTIES_FILE_PATH: PropertyDescriptor = new PropertyDescriptor.Builder()//Not used in code but in logisland.sh script. Si it must be present ! - .name("spark.properties.file.path") - .description("for using --properties-file option while submitting spark job") - .required(false) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - - val SPARK_MONITORING_DRIVER_PORT: PropertyDescriptor = new PropertyDescriptor.Builder()//Not used in code but in logisland.sh script. Si it must be present ! - .name("spark.monitoring.driver.port") - .description("The port for exposing monitoring metrics") - .required(false) - .addValidator(StandardValidators.POSITIVE_LONG_VALIDATOR) - .build - - val SPARK_MASTER = new PropertyDescriptor.Builder() - .name("spark.master") - .description("The url to Spark Master") - .required(true) - // The regex allows "local[K]" with K as an integer, "local[*]", "yarn", "yarn-client", "yarn-cluster" and "spark://HOST[:PORT]" - // there is NO support for "mesos://HOST:PORT" - .addValidator(StandardValidators.createRegexMatchingValidator(Pattern.compile( - "^(yarn|" + - "local(\\[([0-9]+|\\*)(,[0-9]+)?\\])?|" + - "spark:\\/\\/[a-z0-9\\.\\-]+(:[0-9]+)?(,[a-z0-9\\.\\-]+(:[0-9]+)?)*|" + - "mesos:\\/\\/((zk:\\/\\/[a-z0-9\\.\\-]+:[0-9]+(,[a-z0-9\\.\\-]+:[0-9]+)*\\/mesos)|(([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+|[a-z][a-z0-9\\.\\-]+)(:[0-9]+)?))|" + - "k8s://.+)$"))) - .defaultValue("local[2]") - .build - - val SPARK_APP_NAME = new PropertyDescriptor.Builder() - .name("spark.app.name") - .description("Tha application name") - .required(true) - .addValidator(StandardValidators.createRegexMatchingValidator(Pattern.compile("^[a-zA-z0-9-_\\.]+$"))) - .defaultValue("logisland") - .build - - val SPARK_STREAMING_BATCH_DURATION = new PropertyDescriptor.Builder() - .name("spark.streaming.batchDuration") - .description("") - .required(true) - .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR) - .defaultValue("2000") - .build - - val SPARK_YARN_DEPLOYMODE = new PropertyDescriptor.Builder() - .name("spark.yarn.deploy-mode") - .description("The yarn deploy mode") - .required(false) - // .allowableValues("client", "cluster") - .build - - val SPARK_YARN_QUEUE = new PropertyDescriptor.Builder() - .name("spark.yarn.queue") - .description("The name of the YARN queue") - .required(false) - // .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .defaultValue("default") - .build - - val memorySizePattern = Pattern.compile("^[0-9]+[mMgG]$"); - val SPARK_DRIVER_MEMORY = new 
PropertyDescriptor.Builder() - .name("spark.driver.memory") - .description("The memory size for Spark driver") - .required(false) - .addValidator(StandardValidators.createRegexMatchingValidator(memorySizePattern)) - .defaultValue("512m") - .build - - val SPARK_EXECUTOR_MEMORY = new PropertyDescriptor.Builder() - .name("spark.executor.memory") - .description("The memory size for Spark executors") - .required(false) - .addValidator(StandardValidators.createRegexMatchingValidator(memorySizePattern)) - .defaultValue("1g") - .build - - val SPARK_DRIVER_CORES = new PropertyDescriptor.Builder() - .name("spark.driver.cores") - .description("The number of cores for Spark driver") - .required(false) - .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR) - .defaultValue("4") - .build - - val SPARK_EXECUTOR_CORES = new PropertyDescriptor.Builder() - .name("spark.executor.cores") - .description("The number of cores for Spark driver") - .required(false) - .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR) - .defaultValue("1") - .build - - val SPARK_EXECUTOR_INSTANCES = new PropertyDescriptor.Builder() - .name("spark.executor.instances") - .description("The number of instances for Spark app") - .required(false) - .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR) - .build - - val SPARK_SERIALIZER = new PropertyDescriptor.Builder() - .name("spark.serializer") - .description("Class to use for serializing objects that will be sent over the network " + - "or need to be cached in serialized form") - .required(false) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .defaultValue("org.apache.spark.serializer.KryoSerializer") - .build - - val SPARK_STREAMING_BLOCK_INTERVAL = new PropertyDescriptor.Builder() - .name("spark.streaming.blockInterval") - .description("Interval at which data received by Spark Streaming receivers is chunked into blocks " + - "of data before storing them in Spark. Minimum recommended - 50 ms") - .required(false) - .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR) - .defaultValue("350") - .build - - val SPARK_STREAMING_KAFKA_MAX_RATE_PER_PARTITION = new PropertyDescriptor.Builder() - .name("spark.streaming.kafka.maxRatePerPartition") - .description("Maximum rate (number of records per second) at which data will be read from each Kafka partition") - .required(false) - .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR) - .defaultValue("5000") - .build - - val SPARK_STREAMING_BACKPRESSURE_ENABLED = new PropertyDescriptor.Builder() - .name("spark.streaming.backpressure.enabled") - .description("This enables the Spark Streaming to control the receiving rate based on " + - "the current batch scheduling delays and processing times so that the system " + - "receives only as fast as the system can process.") - .required(false) - .addValidator(StandardValidators.BOOLEAN_VALIDATOR) - .defaultValue("false") - .build - - val SPARK_STREAMING_UNPERSIST = new PropertyDescriptor.Builder() - .name("spark.streaming.unpersist") - .description("Force RDDs generated and persisted by Spark Streaming to be automatically unpersisted " + - "from Spark's memory. The raw input data received by Spark Streaming is also automatically cleared." + - " Setting this to false will allow the raw data and persisted RDDs to be accessible outside " + - "the streaming application as they will not be cleared automatically. 
" + - "But it comes at the cost of higher memory usage in Spark.") - .required(false) - .addValidator(StandardValidators.BOOLEAN_VALIDATOR) - .defaultValue("false") - .build - - val SPARK_UI_PORT = new PropertyDescriptor.Builder() - .name("spark.ui.port") - .description("") - .required(false) - .addValidator(StandardValidators.PORT_VALIDATOR) - .defaultValue("4050") - .build - - val SPARK_STREAMING_TIMEOUT = new PropertyDescriptor.Builder() - .name("spark.streaming.timeout") - .description("") - .required(false) - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("-1") - .build - - val SPARK_STREAMING_KAFKA_MAXRETRIES = new PropertyDescriptor.Builder() - .name("spark.streaming.kafka.maxRetries") - .description("Maximum rate (number of records per second) at which data will be read from each Kafka partition") - .required(false) - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("3") - .build - - val SPARK_STREAMING_UI_RETAINED_BATCHES = new PropertyDescriptor.Builder() - .name("spark.streaming.ui.retainedBatches") - .description("How many batches the Spark Streaming UI and status APIs remember before garbage collecting.") - .required(false) - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("200") - .build - - val SPARK_STREAMING_RECEIVER_WAL_ENABLE = new PropertyDescriptor.Builder() - .name("spark.streaming.receiver.writeAheadLog.enable") - .description("Enable write ahead logs for receivers. " + - "All the input data received through receivers will be saved to write ahead logs " + - "that will allow it to be recovered after driver failures.") - .required(false) - .addValidator(StandardValidators.BOOLEAN_VALIDATOR) - .defaultValue("false") - .build - - - val SPARK_YARN_MAX_APP_ATTEMPTS = new PropertyDescriptor.Builder() - .name("spark.yarn.maxAppAttempts") - .description("Because Spark driver and Application Master share a single JVM," + - " any error in Spark driver stops our long-running job. " + - "Fortunately it is possible to configure maximum number of attempts " + - "that will be made to re-run the application. " + - "It is reasonable to set higher value than default 2 " + - "(derived from YARN cluster property yarn.resourcemanager.am.max-attempts). " + - "4 works quite well, higher value may cause unnecessary restarts" + - " even if the reason of the failure is permanent.") - .required(false) - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("4") - .build - - - val SPARK_YARN_AM_ATTEMPT_FAILURES_VALIDITY_INTERVAL = new PropertyDescriptor.Builder() - .name("spark.yarn.am.attemptFailuresValidityInterval") - .description("If the application runs for days or weeks without restart " + - "or redeployment on highly utilized cluster, " + - "4 attempts could be exhausted in few hours. " + - "To avoid this situation, the attempt counter should be reset on every hour of so.") - .required(false) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .defaultValue("1h") - .build - - val SPARK_YARN_MAX_EXECUTOR_FAILURES = new PropertyDescriptor.Builder() - .name("spark.yarn.max.executor.failures") - .description("a maximum number of executor failures before the application fails. " + - "By default it is max(2 * num executors, 3), " + - "well suited for batch jobs but not for long-running jobs." + - " The property comes with corresponding validity interval which also should be set." 
+ - "8 * num_executors") - .required(false) - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("20") - .build - - - val SPARK_YARN_EXECUTOR_FAILURES_VALIDITY_INTERVAL = new PropertyDescriptor.Builder() - .name("spark.yarn.executor.failuresValidityInterval") - .description("If the application runs for days or weeks without restart " + - "or redeployment on highly utilized cluster, " + - "x attempts could be exhausted in few hours. " + - "To avoid this situation, the attempt counter should be reset on every hour of so.") - .required(false) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .defaultValue("1h") - .build - - val SPARK_TASK_MAX_FAILURES = new PropertyDescriptor.Builder() - .name("spark.task.maxFailures") - .description("For long-running jobs you could also consider to boost maximum" + - " number of task failures before giving up the job. " + - "By default tasks will be retried 4 times and then job fails.") - .required(false) - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("8") - .build - - val SPARK_MEMORY_STORAGE_FRACTION = new PropertyDescriptor.Builder() - .name("spark.memory.storageFraction") - .description("expresses the size of R as a fraction of M (default 0.5). " + - "R is the storage space within M where cached blocks immune to being evicted by execution.") - .required(false) - .addValidator(StandardValidators.FLOAT_VALIDATOR) - .defaultValue("0.5") - .build - - val SPARK_MEMORY_FRACTION = new PropertyDescriptor.Builder() - .name("spark.memory.fraction") - .description("expresses the size of M as a fraction of the (JVM heap space - 300MB) (default 0.75). " + - "The rest of the space (25%) is reserved for user data structures, internal metadata in Spark, " + - "and safeguarding against OOM errors in the case of sparse and unusually large records.") - .required(false) - .addValidator(StandardValidators.FLOAT_VALIDATOR) - .defaultValue("0.6") - .build - - val FAIR = new AllowableValue("FAIR", "FAIR", "fair sharing") - val FIFO = new AllowableValue("FIFO", "FIFO", "queueing jobs one after another") - - val SPARK_SCHEDULER_MODE = new PropertyDescriptor.Builder() - .name("spark.scheduler.mode") - .description("The scheduling mode between jobs submitted to the same SparkContext. " + - "Can be set to FAIR to use fair sharing instead of queueing jobs one after another. 
" + - "Useful for multi-user services.") - .required(false) - .allowableValues(FAIR, FIFO) - .defaultValue(FAIR.getValue) - .build - - val JAVA_MESOS_LIBRARY_PATH = new PropertyDescriptor.Builder() - .name("java.library.path") - .description("The java library path to use with mesos.") - .required(false) - .build - - val SPARK_MESOS_CORE_MAX = new PropertyDescriptor.Builder() - .name("spark.cores.max") - .description("The maximum number of total executor core with mesos.") - .required(false) - .build - -} - - -class KafkaStreamProcessingEngine extends AbstractProcessingEngine { - - private val logger = LoggerFactory.getLogger(classOf[KafkaStreamProcessingEngine]) - private val conf = new SparkConf() - private var running = false - protected var batchDurationMs: Int = 1000 - - - /** - * Provides subclasses the ability to perform initialization logic - */ - override def init(context: ComponentContext): Unit = { - super.init(context) - val engineContext = context.asInstanceOf[EngineContext] - val sparkMaster = engineContext.getPropertyValue(KafkaStreamProcessingEngine.SPARK_MASTER).asString - val appName = engineContext.getPropertyValue(KafkaStreamProcessingEngine.SPARK_APP_NAME).asString - batchDurationMs = engineContext.getPropertyValue(KafkaStreamProcessingEngine.SPARK_STREAMING_BATCH_DURATION).asInteger().intValue() - - /** - * job configuration - */ - - - conf.setAppName(appName) - conf.setMaster(sparkMaster) - - def setConfProperty(conf: SparkConf, engineContext: EngineContext, propertyDescriptor: PropertyDescriptor) = { - - // Need to check if the properties are set because those properties are not "requires" - if (engineContext.getPropertyValue(propertyDescriptor).isSet) { - conf.set(propertyDescriptor.getName, engineContext.getPropertyValue(propertyDescriptor).asString) - } - } - - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_UI_RETAINED_BATCHES) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_RECEIVER_WAL_ENABLE) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_KAFKA_MAXRETRIES) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_UI_PORT) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_UNPERSIST) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_BACKPRESSURE_ENABLED) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_BLOCK_INTERVAL) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_STREAMING_KAFKA_MAX_RATE_PER_PARTITION) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_SERIALIZER) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_DRIVER_MEMORY) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_EXECUTOR_MEMORY) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_DRIVER_CORES) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_EXECUTOR_CORES) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_EXECUTOR_INSTANCES) - - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_YARN_MAX_APP_ATTEMPTS) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_YARN_AM_ATTEMPT_FAILURES_VALIDITY_INTERVAL) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_YARN_MAX_EXECUTOR_FAILURES) - setConfProperty(conf, engineContext, 
KafkaStreamProcessingEngine.SPARK_YARN_EXECUTOR_FAILURES_VALIDITY_INTERVAL) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_TASK_MAX_FAILURES) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_MEMORY_FRACTION) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_MEMORY_STORAGE_FRACTION) - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_SCHEDULER_MODE) - - conf.set("spark.kryo.registrator", "com.hurence.logisland.util.spark.ProtoBufRegistrator") - - if (sparkMaster startsWith "yarn") { - // Note that SPARK_YARN_DEPLOYMODE is not used by spark itself but only by spark-submit CLI - // That's why we do not need to propagate it here - setConfProperty(conf, engineContext, KafkaStreamProcessingEngine.SPARK_YARN_QUEUE) - } - - @transient val sparkContext = getCurrentSparkContext() - - UserMetricsSystem.initialize(sparkContext, "LogislandMetrics") - - - - - /** - * shutdown context gracefully - */ - sys.ShutdownHookThread { - logger.info("Gracefully stopping Spark Streaming Application") - shutdown(engineContext) - logger.info("Application stopped") - } - - - PipelineConfigurationBroadcastWrapper.getInstance().refresh(engineContext, sparkContext) - - - SQLContext.getOrCreate(getCurrentSparkContext()).streams.addListener(new StreamingQueryListener { - - val runMap = scala.collection.mutable.Map[UUID, String]() - val executor = Executors.newSingleThreadExecutor() - //force early initialization of this pool - executor.submit(new Runnable { - override def run(): Unit = {} - }) - - override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = { - logger.info(s"Streaming query for stream ${event.name} has been started") - runMap.put(event.id, event.name) - } - - override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = { - } - - override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = { - if (event.exception.isDefined && !getCurrentSparkContext().isStopped) { - val currentStreamId = runMap.get(event.id) - logger.warn(s"Streaming query for stream $currentStreamId terminated with exception ${event.exception}. 
" + - s"The engine will be reset") - - executor.submit(new Runnable { - override def run(): Unit = { - Thread.sleep(1000); - engineContext.getEngine.reset(engineContext) - } - }) - } - } - }) - - running = true - - logger.info(s"spark context initialized with master:$sparkMaster, " + - s"appName:$appName, " + - s"batchDuration:$batchDurationMs ") - logger.info(s"conf : ${conf.toDebugString}") - } - - override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { - val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] - descriptors.add(KafkaStreamProcessingEngine.SPARK_APP_NAME) - descriptors.add(KafkaStreamProcessingEngine.SPARK_MASTER) - descriptors.add(KafkaStreamProcessingEngine.SPARK_MONITORING_DRIVER_PORT) - descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_DEPLOYMODE) - descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_QUEUE) - descriptors.add(KafkaStreamProcessingEngine.SPARK_DRIVER_MEMORY) - descriptors.add(KafkaStreamProcessingEngine.SPARK_EXECUTOR_MEMORY) - descriptors.add(KafkaStreamProcessingEngine.SPARK_DRIVER_CORES) - descriptors.add(KafkaStreamProcessingEngine.SPARK_EXECUTOR_CORES) - descriptors.add(KafkaStreamProcessingEngine.SPARK_EXECUTOR_INSTANCES) - descriptors.add(KafkaStreamProcessingEngine.SPARK_SERIALIZER) - descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_BLOCK_INTERVAL) - descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_KAFKA_MAX_RATE_PER_PARTITION) - descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_BATCH_DURATION) - descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_BACKPRESSURE_ENABLED) - descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_UNPERSIST) - descriptors.add(KafkaStreamProcessingEngine.SPARK_UI_PORT) - descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_TIMEOUT) - descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_KAFKA_MAXRETRIES) - descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_UI_RETAINED_BATCHES) - descriptors.add(KafkaStreamProcessingEngine.SPARK_STREAMING_RECEIVER_WAL_ENABLE) - descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_MAX_APP_ATTEMPTS) - descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_AM_ATTEMPT_FAILURES_VALIDITY_INTERVAL) - descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_MAX_EXECUTOR_FAILURES) - descriptors.add(KafkaStreamProcessingEngine.SPARK_YARN_EXECUTOR_FAILURES_VALIDITY_INTERVAL) - descriptors.add(KafkaStreamProcessingEngine.SPARK_TASK_MAX_FAILURES) - descriptors.add(KafkaStreamProcessingEngine.SPARK_MEMORY_FRACTION) - descriptors.add(KafkaStreamProcessingEngine.SPARK_MEMORY_STORAGE_FRACTION) - descriptors.add(KafkaStreamProcessingEngine.SPARK_SCHEDULER_MODE) - descriptors.add(KafkaStreamProcessingEngine.SPARK_PROPERTIES_FILE_PATH) - descriptors.add(KafkaStreamProcessingEngine.JAVA_MESOS_LIBRARY_PATH) - descriptors.add(KafkaStreamProcessingEngine.SPARK_MESOS_CORE_MAX) - - Collections.unmodifiableList(descriptors) - } - - - /** - * start the engine - * - * @param engineContext - */ - override def start(engineContext: EngineContext) = { - logger.info("starting Spark Engine") - val streamingContext = createStreamingContext(engineContext) - if (!engineContext.getStreamContexts.map(p => p.getStream).filter(p => p.isInstanceOf[AbstractKafkaRecordStream]).isEmpty) { - streamingContext.start() - } - - } - - protected def getCurrentSparkStreamingContext(sparkContext: SparkContext): StreamingContext = { - return StreamingContext.getActiveOrCreate(() => - return new 
StreamingContext(sparkContext, Milliseconds(batchDurationMs)) - ) - } - - protected def getCurrentSparkContext(): SparkContext = { - return SparkContext.getOrCreate(conf) - } - - - def createStreamingContext(engineContext: EngineContext): StreamingContext = { - - - @transient val sc = getCurrentSparkContext() - @transient val ssc = getCurrentSparkStreamingContext(sc) - val appName = sc.appName; - - - /** - * loop over processContext - */ - engineContext.getStreamContexts.foreach(streamingContext => { - try { - val kafkaStream = streamingContext.getStream.asInstanceOf[SparkRecordStream] - - kafkaStream.setup(appName, ssc, streamingContext, engineContext) - kafkaStream.start() - } catch { - case ex: Exception => - throw new IllegalStateException("something bad happened, please check Kafka or cluster health", ex) - } - - }) - ssc - } - - - override def shutdown(engineContext: EngineContext) = { - if (running) { - running = false - logger.info(s"shutting down Spark engine") - stop(engineContext, true) - } - } - - def stop(engineContext: EngineContext, doStopSparkContext: Boolean) = { - synchronized { - val sc = getCurrentSparkContext(); - if (!sc.isStopped) { - - engineContext.getStreamContexts.foreach(streamingContext => { - try { - val kafkaStream = streamingContext.getStream.asInstanceOf[SparkRecordStream] - kafkaStream.stop() - } catch { - case ex: Exception => - logger.error("something bad happened, please check Kafka or cluster health : {}", ex.getMessage) - } - }) - - try { - if (!sc.isStopped) { - val ssc = getCurrentSparkStreamingContext(sc); - ssc.stop(stopSparkContext = false, stopGracefully = true) - } - - } finally { - if (doStopSparkContext && !sc.isStopped) { - try { - sc.stop(); - } catch { - case ex: Exception => - logger.error("something bad while stopping the spark context. Please check cluster health : {}", ex.getMessage) - } - } - } - - } - } - } - - override def onPropertyModified(descriptor: PropertyDescriptor, oldValue: String, newValue: String) = { - logger.info(s"property ${ - descriptor.getName - } value changed from $oldValue to $newValue") - } - - /** - * Await for termination. - * - */ - override def awaitTermination(engineContext: EngineContext): Unit = { - var timeout = engineContext.getPropertyValue(KafkaStreamProcessingEngine.SPARK_STREAMING_TIMEOUT) - .asInteger().toInt - val sc = getCurrentSparkContext() - - while (!sc.isStopped) { - try { - if (timeout < 0) { - Thread.sleep(200) - } else { - val toSleep = Math.min(200, timeout); - Thread.sleep(toSleep) - timeout -= toSleep - } - } catch { - case e: InterruptedException => return - case unknown: Throwable => throw unknown - } - } - } - - - /** - * Reset the engine by stopping the streaming context. 
- */ - override def reset(engineContext: EngineContext): Unit = { - shutdown(engineContext) - } - - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/RemoteApiStreamProcessingEngine.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/RemoteApiStreamProcessingEngine.scala deleted file mode 100644 index f6ef5ce60..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/engine/spark/RemoteApiStreamProcessingEngine.scala +++ /dev/null @@ -1,198 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.engine.spark - -import java.time.Duration -import java.util -import java.util.Collections -import java.util.concurrent.{Executors, TimeUnit} - -import com.hurence.logisland.component.PropertyDescriptor -import com.hurence.logisland.engine.EngineContext -import com.hurence.logisland.engine.spark.remote.model.DataFlow -import com.hurence.logisland.engine.spark.remote.{RemoteApiClient, RemoteApiComponentFactory} -import com.hurence.logisland.stream.StandardStreamContext -import com.hurence.logisland.stream.spark.DummyRecordStream -import com.hurence.logisland.validator.StandardValidators -import org.apache.spark.streaming.dstream.DStream -import org.slf4j.LoggerFactory - -object RemoteApiStreamProcessingEngine { - val REMOTE_API_BASE_URL = new PropertyDescriptor.Builder() - .name("remote.api.baseUrl") - .description("The base URL of the remote server providing logisland configuration") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - - val REMOTE_API_POLLING_RATE = new PropertyDescriptor.Builder() - .name("remote.api.polling.rate") - .description("Remote api polling rate in milliseconds") - .required(true) - .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) - .build - - val REMOTE_API_CONFIG_PUSH_RATE = new PropertyDescriptor.Builder() - .name("remote.api.push.rate") - .description("Remote api configuration push rate in milliseconds") - .required(true) - .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) - .build - - val REMOTE_API_CONNECT_TIMEOUT = new PropertyDescriptor.Builder() - .name("remote.api.timeouts.connect") - .description("Remote api connection timeout in milliseconds") - .required(false) - .defaultValue("10000") - .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) - .build - - val REMOTE_API_SOCKET_TIMEOUT = new PropertyDescriptor.Builder() - .name("remote.api.timeouts.socket") - .description("Remote api default read/write socket timeout in milliseconds") - .required(false) - .defaultValue("10000") - 
.addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) - .build - - val REMOTE_API_USER = new PropertyDescriptor.Builder() - .name("remote.api.auth.user") - .description("The basic authentication user for the remote api endpoint.") - .required(false) - .build - - val REMOTE_API_PASSWORD = new PropertyDescriptor.Builder() - .name("remote.api.auth.password") - .description("The basic authentication password for the remote api endpoint.") - .required(false) - .build -} - -class RemoteApiStreamProcessingEngine extends KafkaStreamProcessingEngine { - - private val logger = LoggerFactory.getLogger(classOf[RemoteApiStreamProcessingEngine]) - private var initialized = false - - - override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { - val ret = new util.ArrayList(super.getSupportedPropertyDescriptors) - ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_BASE_URL) - ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_POLLING_RATE) - ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_CONFIG_PUSH_RATE) - ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_CONNECT_TIMEOUT) - ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_USER) - ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_PASSWORD) - ret.add(RemoteApiStreamProcessingEngine.REMOTE_API_SOCKET_TIMEOUT) - return Collections.unmodifiableList(ret) - } - - - /** - * start the engine - * - * @param engineContext - */ - override def start(engineContext: EngineContext): Unit = { - // engineContext.addStreamContext(new StandardStreamContext(new DummyRecordStream(), "busybox")) - - if (!initialized) { - initialized = true - val remoteApiClient = new RemoteApiClient(new RemoteApiClient.ConnectionSettings( - engineContext.getProperty(RemoteApiStreamProcessingEngine.REMOTE_API_BASE_URL), - Duration.ofMillis(engineContext.getPropertyValue(RemoteApiStreamProcessingEngine.REMOTE_API_SOCKET_TIMEOUT).asLong()), - Duration.ofMillis(engineContext.getPropertyValue(RemoteApiStreamProcessingEngine.REMOTE_API_CONNECT_TIMEOUT).asLong()), - engineContext.getProperty(RemoteApiStreamProcessingEngine.REMOTE_API_USER), - engineContext.getProperty(RemoteApiStreamProcessingEngine.REMOTE_API_PASSWORD))) - - - val appName = getCurrentSparkContext().appName - var currentDataflow: DataFlow = null - - //schedule dataflow refresh - @transient lazy val executor = Executors.newSingleThreadScheduledExecutor(); - @transient lazy val remoteApiComponentFactory = new RemoteApiComponentFactory - - - executor.scheduleWithFixedDelay(new Runnable { - val state = new RemoteApiClient.State - - override def run(): Unit = { - var changed = false - try { - val dataflow = remoteApiClient.fetchDataflow(appName, state) - if (dataflow.isPresent) { - changed = true - if (remoteApiComponentFactory.updateEngineContext(getCurrentSparkContext(), engineContext, dataflow.get, currentDataflow)) { - currentDataflow = dataflow.get() - } - } - } catch { - case default: Throwable => { - currentDataflow = null - logger.warn("Unexpected exception while trying to poll for new dataflow configuration", default) - reset(engineContext) - } - } finally { - if (changed) { - try { - remoteApiClient.pushDataFlow(appName, currentDataflow); - } catch { - case default: Throwable => logger.warn("Unexpected exception while trying to push configuration to remote server", default) - } - } - } - } - }, 0, engineContext.getProperty(RemoteApiStreamProcessingEngine.REMOTE_API_POLLING_RATE).toInt, TimeUnit.MILLISECONDS - ) - - executor.scheduleWithFixedDelay(new Runnable { - - override def run(): 
Unit = { - try { - remoteApiClient.pushDataFlow(appName, currentDataflow) - } catch { - case default: Throwable => logger.warn("Unexpected exception while trying to push configuration to remote server", default) - } - } - }, 0, engineContext.getProperty(RemoteApiStreamProcessingEngine.REMOTE_API_CONFIG_PUSH_RATE).toInt, TimeUnit.MILLISECONDS - ) - - - } - - - super.start(engineContext) - } - - - override def shutdown(engineContext: EngineContext): Unit = { - super.shutdown(engineContext) - } - - /** - * Reset the engine by stopping the streaming context. - */ - override def reset(engineContext: EngineContext): Unit = { - logger.info(s"Resetting engine ${ - engineContext.getIdentifier - }") - super.stop(engineContext, false) - engineContext.getStreamContexts.clear() - engineContext.getControllerServiceConfigurations.clear() - } - - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/AbstractKafkaRecordStream.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/AbstractKafkaRecordStream.scala deleted file mode 100644 index 1ba79f9f1..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/AbstractKafkaRecordStream.scala +++ /dev/null @@ -1,344 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.hurence.logisland.stream.spark - -import java.io.ByteArrayInputStream -import java.util -import java.util.Collections - -import com.hurence.logisland.component.PropertyDescriptor -import com.hurence.logisland.engine.EngineContext -import com.hurence.logisland.engine.spark.remote.PipelineConfigurationBroadcastWrapper -import com.hurence.logisland.record.Record -import com.hurence.logisland.serializer._ -import com.hurence.logisland.stream.StreamProperties._ -import com.hurence.logisland.stream.{AbstractRecordStream, StreamContext} -import com.hurence.logisland.util.kafka.KafkaSink -import com.hurence.logisland.util.spark._ -import kafka.zk.AdminZkClient -import kafka.zk.KafkaZkClient -import kafka.zookeeper.ZooKeeperClient -import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, OffsetAndMetadata, OffsetCommitCallback} -import org.apache.kafka.clients.producer.ProducerConfig -import org.apache.kafka.common.TopicPartition -import org.apache.kafka.common.security.JaasUtils -import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySerializer} -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.groupon.metrics.UserMetricsSystem -import org.apache.spark.rdd.RDD -//import org.apache.spark.streaming.kafka.KafkaUtils; -//import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe -//import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent -//import org.apache.spark.streaming.kafka010.{CanCommitOffsets, KafkaUtils, OffsetRange} -import org.apache.spark.streaming.{Seconds, StreamingContext} -import org.slf4j.LoggerFactory - -import scala.collection.JavaConversions._ - - -abstract class AbstractKafkaRecordStream extends AbstractRecordStream with SparkRecordStream { - - - val NONE_TOPIC: String = "none" - private val logger = LoggerFactory.getLogger(this.getClass) - protected var kafkaSink: Broadcast[KafkaSink] = null - protected var appName: String = "" - @transient protected var ssc: StreamingContext = null - protected var streamContext: StreamContext = null - protected var engineContext: EngineContext = null - protected var controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink] = null - protected var needMetricsReset = false - -// override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { -// val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] -// descriptors.add(ERROR_TOPICS) -// descriptors.add(INPUT_TOPICS) -// descriptors.add(OUTPUT_TOPICS) -// descriptors.add(AVRO_INPUT_SCHEMA) -// descriptors.add(AVRO_OUTPUT_SCHEMA) -// descriptors.add(INPUT_SERIALIZER) -// descriptors.add(OUTPUT_SERIALIZER) -// descriptors.add(ERROR_SERIALIZER) -// descriptors.add(KAFKA_TOPIC_AUTOCREATE) -// descriptors.add(KAFKA_TOPIC_DEFAULT_PARTITIONS) -// descriptors.add(KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR) -// descriptors.add(KAFKA_METADATA_BROKER_LIST) -// descriptors.add(KAFKA_ZOOKEEPER_QUORUM) -// descriptors.add(KAFKA_MANUAL_OFFSET_RESET) -// descriptors.add(KAFKA_BATCH_SIZE) -// descriptors.add(KAFKA_LINGER_MS) -// descriptors.add(KAFKA_ACKS) -// descriptors.add(WINDOW_DURATION) -// descriptors.add(SLIDE_DURATION) -// Collections.unmodifiableList(descriptors) -// } -// -// -// override def setup(appName: String, ssc: StreamingContext, streamContext: StreamContext, engineContext: EngineContext) = { -// this.appName = appName -// this.ssc = ssc -// this.streamContext = streamContext -// this.engineContext = engineContext -// -// } -// 
-// override def getStreamContext(): StreamingContext = this.ssc -// -// override def start() = { -// if (ssc == null) -// throw new IllegalStateException("stream not initialized") -// -// try { -// -// // Define the Kafka parameters, broker list must be specified -// val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString.split(",").toSet -// val outputTopics = streamContext.getPropertyValue(OUTPUT_TOPICS).asString.split(",").toSet -// val errorTopics = streamContext.getPropertyValue(ERROR_TOPICS).asString.split(",").toSet -// val metricsTopics = DEFAULT_METRICS_TOPIC.getValue.split(",").toSet -// -// val topicAutocreate = streamContext.getPropertyValue(KAFKA_TOPIC_AUTOCREATE).asBoolean().booleanValue() -// val topicDefaultPartitions = streamContext.getPropertyValue(KAFKA_TOPIC_DEFAULT_PARTITIONS).asInteger().intValue() -// val topicDefaultReplicationFactor = streamContext.getPropertyValue(KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR).asInteger().intValue() -// val brokerList = streamContext.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString -// val zkQuorum = streamContext.getPropertyValue(KAFKA_ZOOKEEPER_QUORUM).asString -// -// val kafkaBatchSize = streamContext.getPropertyValue(KAFKA_BATCH_SIZE).asString -// val kafkaLingerMs = streamContext.getPropertyValue(KAFKA_LINGER_MS).asString -// val kafkaAcks = streamContext.getPropertyValue(KAFKA_ACKS).asString -// val kafkaOffset = streamContext.getPropertyValue(KAFKA_MANUAL_OFFSET_RESET).asString -// -// -// val kafkaSinkParams = Map( -// ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokerList, -// ProducerConfig.CLIENT_ID_CONFIG -> appName, -// ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getCanonicalName, -// ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, -// ProducerConfig.ACKS_CONFIG -> kafkaAcks, -// ProducerConfig.RETRIES_CONFIG -> "3", -// ProducerConfig.LINGER_MS_CONFIG -> kafkaLingerMs, -// ProducerConfig.BATCH_SIZE_CONFIG -> kafkaBatchSize, -// ProducerConfig.RETRY_BACKOFF_MS_CONFIG -> "1000", -// ProducerConfig.RECONNECT_BACKOFF_MS_CONFIG -> "1000") -// -// kafkaSink = ssc.sparkContext.broadcast(KafkaSink(kafkaSinkParams)) -// controllerServiceLookupSink = ssc.sparkContext.broadcast( -// ControllerServiceLookupSink(engineContext.getControllerServiceConfigurations) -// ) -// -// // TODO deprecate topic creation here (must be done through the agent) -//// if (topicAutocreate) { -////// val zkUtils = ZkUtils.apply(zkQuorum, 10000, 10000, JaasUtils.isZkSecurityEnabled) -////// createTopicsIfNeeded(zkUtils, inputTopics, topicDefaultPartitions, topicDefaultReplicationFactor) -////// createTopicsIfNeeded(zkUtils, outputTopics, topicDefaultPartitions, topicDefaultReplicationFactor) -////// createTopicsIfNeeded(zkUtils, errorTopics, topicDefaultPartitions, topicDefaultReplicationFactor) -////// createTopicsIfNeeded(zkUtils, metricsTopics, 1, 1) -//// zkQuorum -//// val zooKeeperClient : ZooKeeperClient = new ZooKeeperClient(zkQuorum, -//// 1000, -//// 1000: Int, -//// 10: Int, -//// , -//// metricGroup: String, -//// metricType: String) ) -//// val kafkaZkClient : KafkaZkClient = new KafkaZkClient -//// val adminZkClient : AdminZkClient = new AdminZkClient[kafkaZkClient] -//// } -// -// -// val kafkaParams = Map[String, Object]( -// ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokerList, -// ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer], -// ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer], -// 
ConsumerConfig.GROUP_ID_CONFIG -> appName, -// ConsumerConfig.RECONNECT_BACKOFF_MS_CONFIG -> "50", -// ConsumerConfig.RETRY_BACKOFF_MS_CONFIG -> "100", -// ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> kafkaOffset, -// ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false", -// ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG -> "30000" -// /*, -// ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG -> "5000"*/ -// ) -// -// -// logger.info(s"starting Kafka direct stream on topics $inputTopics from $kafkaOffset offsets") -// @transient val kafkaStream = KafkaUtils.createDirectStream[Array[Byte], Array[Byte]]( -// ssc, -// PreferConsistent, -// Subscribe[Array[Byte], Array[Byte]](inputTopics, kafkaParams) -// ) -// -// // do the parallel processing -// -// val stream = if (streamContext.getPropertyValue(WINDOW_DURATION).isSet) { -// if (streamContext.getPropertyValue(SLIDE_DURATION).isSet) -// kafkaStream.window( -// Seconds(streamContext.getPropertyValue(WINDOW_DURATION).asLong()), -// Seconds(streamContext.getPropertyValue(SLIDE_DURATION).asLong()) -// ) -// else -// kafkaStream.window(Seconds(streamContext.getPropertyValue(WINDOW_DURATION).asLong())) -// -// } else kafkaStream -// -// -// stream -// .foreachRDD(rdd => { -// -// this.streamContext.getProcessContexts().clear(); -// this.streamContext.getProcessContexts().addAll( -// PipelineConfigurationBroadcastWrapper.getInstance().get(this.streamContext.getIdentifier)) -// -// if (!rdd.isEmpty()) { -// -// -// val offsetRanges = process(rdd) -// // some time later, after outputs have completed -// if (offsetRanges.nonEmpty) { -// // kafkaStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges.get) -// -// -// kafkaStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges.get, new OffsetCommitCallback() { -// def onComplete(m: java.util.Map[TopicPartition, OffsetAndMetadata], e: Exception) { -// if (null != e) { -// logger.error("error commiting offsets", e) -// } -// } -// }) -// -// -// needMetricsReset = true -// } -// else if (needMetricsReset) { -// try { -// -// for (partitionId <- 0 to rdd.getNumPartitions) { -// val pipelineMetricPrefix = streamContext.getIdentifier + "." + -// "partition" + partitionId + "." 
-// val pipelineTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + "Pipeline.processing_time_ms").time() -// -// streamContext.getProcessContexts.foreach(processorContext => { -// UserMetricsSystem.timer(pipelineMetricPrefix + processorContext.getIdentifier + ".processing_time_ms") -// .time() -// .stop() -// -// ProcessorMetrics.resetMetrics(pipelineMetricPrefix + processorContext.getIdentifier + ".") -// }) -// pipelineTimerContext.stop() -// } -// } catch { -// case ex: Throwable => -// logger.error(s"exception : ${ex.toString}") -// None -// } finally { -// needMetricsReset = false -// } -// } -// } -// -// }) -// } catch { -// case ex: Throwable => -// ex.printStackTrace() -// logger.error("something bad happened, please check Kafka or Zookeeper health : {}", ex) -// } -// } -// -// -// /** -// * to be overriden by subclasses -// * -// * @param rdd -// */ -// def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] - - - /** - * build a serializer - * - * @param inSerializerClass the serializer type - * @param schemaContent an Avro schema - * @return the serializer - */ - def getSerializer(inSerializerClass: String, schemaContent: String): RecordSerializer = { - SerializerProvider.getSerializer(inSerializerClass, schemaContent) - } - - /** - * - * @param partition - * @param serializer - * @return - */ - def deserializeRecords(partition: Iterator[ConsumerRecord[Array[Byte], Array[Byte]]], serializer: RecordSerializer): List[Record] = { - partition.flatMap(rawEvent => { - try { - val bais = new ByteArrayInputStream(rawEvent.value()) - val deserialized = serializer.deserialize(bais) - bais.close() - - Some(deserialized) - } catch { - case t: Throwable => - logger.error(s"exception while deserializing events ${t.getMessage}") - None - } - }).toList - } - - -// /** -// * Topic creation -// * -// * @param zkUtils -// * @param topics -// * @param topicDefaultPartitions -// * @param topicDefaultReplicationFactor -// */ -// def createTopicsIfNeeded(zkUtils: ZkUtils, -// topics: Set[String], -// topicDefaultPartitions: Int, -// topicDefaultReplicationFactor: Int): Unit = { -// -// topics.foreach(topic => { -// -// if (!topic.equals(NONE_TOPIC) && !AdminUtils.topicExists(zkUtils, topic)) { -// AdminUtils.createTopic(zkUtils, topic, topicDefaultPartitions, topicDefaultReplicationFactor) -// Thread.sleep(1000) -// logger.info(s"created topic $topic with" + -// s" $topicDefaultPartitions partitions and" + -// s" $topicDefaultReplicationFactor replicas") -// } -// }) -// } -} - - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/DummyRecordStream.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/DummyRecordStream.scala deleted file mode 100644 index fec79e1cd..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/DummyRecordStream.scala +++ /dev/null @@ -1,68 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.stream.spark - -import java.util - -import com.hurence.logisland.component.PropertyDescriptor -import com.hurence.logisland.engine.EngineContext -import com.hurence.logisland.stream.{AbstractRecordStream, StreamContext} -import com.hurence.logisland.util.spark.SparkUtils -import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming.StreamingContext -import org.apache.spark.streaming.receiver.Receiver - -class DummyRecordStream extends AbstractRecordStream with SparkRecordStream { - - @transient private var streamingContext: StreamingContext = _ - - /** - * Allows subclasses to register which property descriptor objects are - * supported. - * - * @return PropertyDescriptor objects this processor currently supports - */ - override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { - return new util.ArrayList[PropertyDescriptor]() - } - - override def start(): Unit = { - val stream = streamingContext.receiverStream(new Receiver[Long](StorageLevel.NONE) { - override def onStart(): Unit = {} - - override def onStop(): Unit = {} - }) - stream.foreachRDD(rdd => { - //do nothing :) - }) - stream.start() - - } - - /** - * setup the stream with spark app properties - * - * @param appName - * @param ssc - * @param streamContext - */ - override def setup(appName: String, ssc: StreamingContext, streamContext: StreamContext, engineContext: EngineContext): Unit = { - streamingContext = ssc - - } - - override def getStreamContext(): StreamingContext = streamingContext -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamDebugger.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamDebugger.scala deleted file mode 100644 index 4165e4e8b..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamDebugger.scala +++ /dev/null @@ -1,191 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (bailet.thomas@gmail.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.stream.spark - -import java.util -import java.util.Collections - -import com.hurence.logisland.record.{FieldDictionary, Record, RecordUtils} -import com.hurence.logisland.util.record.RecordSchemaUtil -import com.hurence.logisland.util.spark.ProcessorMetrics -import org.apache.avro.Schema -import org.apache.kafka.clients.consumer.ConsumerRecord -import org.apache.spark.TaskContext -import org.apache.spark.rdd.RDD -//import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange} -import org.slf4j.LoggerFactory - -import scala.collection.JavaConversions._ -import com.hurence.logisland.stream.StreamProperties._ - -//class KafkaRecordStreamDebugger extends AbstractKafkaRecordStream { -// val logger = LoggerFactory.getLogger(this.getClass.getName) -// -// -// /** -// * launch the chain of processing for each partition of the RDD in parallel -// * -// * @param rdd -// */ -// override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { -// if (!rdd.isEmpty()) { -// // Cast the rdd to an interface that lets us get an array of OffsetRange -// val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges -// -// val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString -// val outputTopics = streamContext.getPropertyValue(OUTPUT_TOPICS).asString -// val errorTopics = streamContext.getPropertyValue(ERROR_TOPICS).asString -// val brokerList = streamContext.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString -// -// -// rdd.foreachPartition(partition => { -// if (partition.nonEmpty) { -// /** -// * index to get the correct offset range for the rdd partition we're working on -// * This is safe because we haven't shuffled or otherwise disrupted partitioning, -// * and the original input rdd partitions were 1:1 with kafka partitions -// */ -// val partitionId = TaskContext.get.partitionId() -// val offsetRange = offsetRanges(TaskContext.get.partitionId) -// -// /** -// * create serializers -// */ -// val deserializer = getSerializer( -// streamContext.getPropertyValue(INPUT_SERIALIZER).asString, -// streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) -// val serializer = getSerializer( -// streamContext.getPropertyValue(OUTPUT_SERIALIZER).asString, -// streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) -// val errorSerializer = getSerializer( -// streamContext.getPropertyValue(ERROR_SERIALIZER).asString, -// streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) -// -// /** -// * process events by chaining output records -// */ -// var firstPass = true -// var incomingEvents: util.Collection[Record] = Collections.emptyList() -// var outgoingEvents: util.Collection[Record] = Collections.emptyList() -// val processingMetrics: util.Collection[Record] = new util.ArrayList[Record]() -// logger.info("start processing") -// -// streamContext.getProcessContexts.foreach(processorContext => { -// val startTime = System.currentTimeMillis() -// val processor = processorContext.getProcessor -// -// -// if (firstPass) { -// /** -// * convert incoming Kafka messages into 
Records -// * if there's no serializer we assume that we need to compute a Record from K/V -// */ -// incomingEvents = if ( -// streamContext.getPropertyValue(INPUT_SERIALIZER).asString -// == NO_SERIALIZER.getValue) { -// // parser -// partition.map(rawMessage => { -// val key = if (rawMessage.key() != null) new String(rawMessage.key()) else "" -// val value = if (rawMessage.value() != null) new String(rawMessage.value()) else "" -// RecordUtils.getKeyValueRecord(key, value) -// }).toList -// } else { -// // processor -// deserializeRecords(partition, deserializer) -// } -// -// firstPass = false -// } else { -// incomingEvents = outgoingEvents -// } -// -// /** -// * process incoming events -// */ -// outgoingEvents = processor.process(processorContext, incomingEvents) -// -// -// }) -// -// -// /** -// * Do we make records compliant with a given Avro schema ? -// */ -// if (streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).isSet) { -// try { -// val strSchema = streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString() -// val schema = RecordSchemaUtil.compileSchema(strSchema) -// -// -// outgoingEvents = outgoingEvents.map(record => RecordSchemaUtil.convertToValidRecord(record, schema)) -// } catch { -// case t: Throwable => -// logger.warn("something wrong while converting records " + -// "to valid accordingly to provide Avro schema " + t.getMessage) -// } -// -// } -// -// -// logger.info("sending to kafka") -// -// /** -// * push outgoing events and errors to Kafka -// */ -// kafkaSink.value.produce( -// streamContext.getPropertyValue(OUTPUT_TOPICS).asString, -// outgoingEvents.toList, -// serializer -// ) -// -// kafkaSink.value.produce( -// streamContext.getPropertyValue(ERROR_TOPICS).asString, -// outgoingEvents.filter(r => r.hasField(FieldDictionary.RECORD_ERRORS)).toList, -// errorSerializer -// ) -// -// logger.info("saving offsets") -// -// /** -// * save latest offset to Zookeeper -// */ -// // zkSink.value.saveOffsetRangesToZookeeper(appName, offsetRange) -// logger.info("processed " + outgoingEvents.size() + " messages") -// } -// }) -// -// return Some(offsetRanges) -// } -// None -// } -//} - - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamHDFSBurner.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamHDFSBurner.scala deleted file mode 100644 index 832639e1e..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamHDFSBurner.scala +++ /dev/null @@ -1,229 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (bailet.thomas@gmail.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.stream.spark - -import java.text.SimpleDateFormat -import java.util -import java.util.Collections - -import com.hurence.logisland.component.PropertyDescriptor -import com.hurence.logisland.record.{FieldDictionary, FieldType} -import com.hurence.logisland.stream.StreamProperties._ -import com.hurence.logisland.util.spark.SparkUtils -import org.apache.kafka.clients.consumer.ConsumerRecord -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession} -//import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange} -import org.slf4j.LoggerFactory - - -//class KafkaRecordStreamHDFSBurner extends AbstractKafkaRecordStream { -// -// -// private val logger = LoggerFactory.getLogger(classOf[KafkaRecordStreamHDFSBurner]) -// -// -// override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { -// val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] -// -// descriptors.addAll(super.getSupportedPropertyDescriptors()) -// -// descriptors.add(OUTPUT_FOLDER_PATH) -// descriptors.add(OUTPUT_FORMAT) -// descriptors.add(RECORD_TYPE) -// descriptors.add(NUM_PARTITIONS) -// descriptors.add(EXCLUDE_ERRORS) -// descriptors.add(DATE_FORMAT) -// descriptors.add(INPUT_FORMAT) -// Collections.unmodifiableList(descriptors) -// } -// -// private def sanitizeSchema(dataType: DataType): DataType = { -// dataType match { -// case structType: StructType => -// DataTypes.createStructType(structType.fields.map(f => -// DataTypes.createStructField(f.name.replaceAll("[:,-]", "_"), sanitizeSchema(f.dataType), f.nullable, f.metadata) -// )) -// case arrayType: ArrayType => -// DataTypes.createArrayType(sanitizeSchema(arrayType.elementType), arrayType.containsNull) -// case mapType: MapType => -// DataTypes.createMapType(sanitizeSchema(mapType.keyType), sanitizeSchema(mapType.valueType), mapType.valueContainsNull) -// case other => other -// } -// -// -// } -// -// override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { -// if (!rdd.isEmpty()) { -// // Cast the rdd to an interface that lets us get an array of OffsetRange -// val offsetRanges = 
rdd.asInstanceOf[HasOffsetRanges].offsetRanges -// -// // Get the singleton instance of SQLContext -// val sqlContext = SparkSession -// .builder() -// .appName(appName) -// .config(ssc.sparkContext.getConf) -// .getOrCreate() -// -// -// // this is used to implicitly convert an RDD to a DataFrame. -// -// val deserializer = getSerializer( -// streamContext.getPropertyValue(INPUT_SERIALIZER).asString, -// streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) -// -// -// val records = rdd.mapPartitions(p => deserializeRecords(p, deserializer).iterator) -// -// -// if (!records.isEmpty()) { -// -// -// val sdf = new SimpleDateFormat(streamContext.getPropertyValue(DATE_FORMAT).asString) -// -// -// val numPartitions = streamContext.getPropertyValue(NUM_PARTITIONS).asInteger() -// val outputFormat = streamContext.getPropertyValue(OUTPUT_FORMAT).asString() -// val doExcludeErrors = streamContext.getPropertyValue(EXCLUDE_ERRORS).asBoolean() -// val recordType = streamContext.getPropertyValue(RECORD_TYPE).asString() -// val outPath = streamContext.getPropertyValue(OUTPUT_FOLDER_PATH).asString() -// -// val records = rdd.mapPartitions(p => deserializeRecords(p, deserializer).iterator) -// .filter(r => -// r.hasField(FieldDictionary.RECORD_TYPE) && -// r.getField(FieldDictionary.RECORD_TYPE).asString() == recordType) -// .map(r => { -// try { -// if (r.hasField(FieldDictionary.RECORD_DAYTIME)) -// r -// else -// r.setField(FieldDictionary.RECORD_DAYTIME, FieldType.STRING, sdf.format(r.getTime)) -// } -// catch { -// case ex: Throwable => r -// } -// }) -// -// -// if (!records.isEmpty()) { -// var df: DataFrame = null; -// val inputFormat = streamContext.getPropertyValue(INPUT_FORMAT).asString() -// if (inputFormat.isEmpty) { -// -// val schema = SparkUtils.convertFieldsNameToSchema(records.take(1)(0)) -// val rows = if (doExcludeErrors) { -// records -// .filter(r => !r.hasField(FieldDictionary.RECORD_ERRORS)) -// .map(r => SparkUtils.convertToRow(r, schema)) -// } else { -// records.map(r => SparkUtils.convertToRow(r, schema)) -// } -// -// -// logger.info(schema.toString()) -// df = sqlContext.createDataFrame(rows, schema) -// } else { -// if ("json".equals(inputFormat)) { -// import sqlContext.implicits._ -// val rdf = records.map(record => (record.getType, record.getField(FieldDictionary.RECORD_DAYTIME).asString)) -// .toDF(FieldDictionary.RECORD_TYPE, FieldDictionary.RECORD_DAYTIME) -// val json = sqlContext.read.json(records.map(record => record.getField(FieldDictionary.RECORD_VALUE).asString())) -// val merged = rdf.rdd.zip(json.rdd) -// .map { -// case (rowLeft, rowRight) => Row.fromSeq(rowLeft.toSeq ++ rowRight.toSeq) -// } -// df = sqlContext.createDataFrame(merged, StructType(rdf.schema.fields ++ sanitizeSchema(json.schema).asInstanceOf[StructType].fields)) -// } else { -// throw new IllegalArgumentException(s"Input format $inputFormat is not supported") -// } -// } -// -// outputFormat match { -// case FILE_FORMAT_PARQUET => -// df.repartition(numPartitions) -// .write -// .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) -// .mode(SaveMode.Append) -// .parquet(outPath) -// case FILE_FORMAT_JSON => -// df.repartition(numPartitions) -// .write -// .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) -// .mode(SaveMode.Append) -// .json(outPath) -// case FILE_FORMAT_ORC => -// df.repartition(numPartitions) -// .write -// .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) -// .mode(SaveMode.Append) -// .orc(outPath) 
-// case FILE_FORMAT_TXT => -// df.repartition(numPartitions) -// .write -// .partitionBy(FieldDictionary.RECORD_DAYTIME, FieldDictionary.RECORD_TYPE) -// .mode(SaveMode.Append) -// .text(outPath) -// case _ => -// throw new IllegalArgumentException(s"$outputFormat not supported yet") -// } -// -// /** -// * save latest offset to Zookeeper -// */ -// // offsetRanges.foreach(offsetRange => zkSink.value.saveOffsetRangesToZookeeper(appName, offsetRange)) -// } -// -// } -// -// return Some(offsetRanges) -// } -// None -// } -//} - - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamParallelProcessing.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamParallelProcessing.scala deleted file mode 100644 index 7cede3283..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamParallelProcessing.scala +++ /dev/null @@ -1,226 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.stream.spark - -import java.util -import java.util.Collections - -import com.hurence.logisland.component.PropertyDescriptor -import com.hurence.logisland.record.{FieldDictionary, Record, RecordUtils} -import com.hurence.logisland.util.record.RecordSchemaUtil -import com.hurence.logisland.util.spark.ProcessorMetrics -import org.apache.avro.Schema -import org.apache.kafka.clients.consumer.ConsumerRecord -import org.apache.kafka.common.errors.OffsetOutOfRangeException -import org.apache.spark.TaskContext -import org.apache.spark.groupon.metrics.{SparkMeter, UserMetricsSystem} -import org.apache.spark.rdd.RDD -//import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, OffsetRange} -import org.slf4j.LoggerFactory - -import scala.collection.JavaConversions._ -import com.hurence.logisland.stream.StreamProperties._ - - -//class KafkaRecordStreamParallelProcessing extends AbstractKafkaRecordStream { -// val logger = LoggerFactory.getLogger(this.getClass) -// -// override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { -// val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] -// -// descriptors.addAll(super.getSupportedPropertyDescriptors()) -// Collections.unmodifiableList(descriptors) -// } -// -// /** -// * launch the chain of processing for each partition of the RDD in parallel -// * -// * @param rdd -// */ -// override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { -// if (!rdd.isEmpty()) { -// // Cast the rdd to an interface that lets us get an array of OffsetRange -// val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges -// -// rdd.foreachPartition(partition => { -// try { -// if (partition.nonEmpty) { -// /** -// * index to get the correct offset range for the rdd partition we're working on -// * This is safe because we haven't shuffled or otherwise disrupted partitioning, -// * and the original input rdd partitions were 1:1 with kafka partitions -// */ -// val partitionId = TaskContext.get.partitionId() -// val offsetRange = offsetRanges(TaskContext.get.partitionId) -// -// val pipelineMetricPrefix = streamContext.getIdentifier + "." + -// "partition" + partitionId + "." 
-// val pipelineTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + "Pipeline.processing_time_ms" ).time() -// -// -// /** -// * create serializers -// */ -// val deserializer = getSerializer( -// streamContext.getPropertyValue(INPUT_SERIALIZER).asString, -// streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) -// val serializer = getSerializer( -// streamContext.getPropertyValue(OUTPUT_SERIALIZER).asString, -// streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) -// val errorSerializer = getSerializer( -// streamContext.getPropertyValue(ERROR_SERIALIZER).asString, -// streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) -// -// /** -// * process events by chaining output records -// */ -// var firstPass = true -// var incomingEvents: util.Collection[Record] = Collections.emptyList() -// var outgoingEvents: util.Collection[Record] = Collections.emptyList() -// -// streamContext.getProcessContexts.foreach(processorContext => { -// val startTime = System.currentTimeMillis() -// val processor = processorContext.getProcessor -// -// val processorTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + -// processorContext.getIdentifier + ".processing_time_ms").time() -// /** -// * convert incoming Kafka messages into Records -// * if there's no serializer we assume that we need to compute a Record from K/V -// */ -// if (firstPass) { -// incomingEvents = if ( -// streamContext.getPropertyValue(INPUT_SERIALIZER).asString -// == NO_SERIALIZER.getValue) { -// // parser -// partition.map(rawMessage => { -// val key = if (rawMessage.key() != null) new String(rawMessage.key()) else "" -// val value = if (rawMessage.value() != null) new String(rawMessage.value()) else "" -// RecordUtils.getKeyValueRecord(key, value) -// }).toList -// } else { -// // processor -// deserializeRecords(partition, deserializer) -// } -// -// firstPass = false -// } else { -// incomingEvents = outgoingEvents -// } -// -// /** -// * process incoming events -// */ -// if (processor.hasControllerService) { -// val controllerServiceLookup = controllerServiceLookupSink.value.getControllerServiceLookup() -// processorContext.setControllerServiceLookup(controllerServiceLookup) -// } -// -// if (!processor.isInitialized) { -// processor.init(processorContext) -// } -// -// outgoingEvents = processor.process(processorContext, incomingEvents) -// -// /** -// * compute metrics -// */ -// ProcessorMetrics.computeMetrics( -// pipelineMetricPrefix + processorContext.getIdentifier + ".", -// incomingEvents, -// outgoingEvents, -// offsetRange.fromOffset, -// offsetRange.untilOffset, -// System.currentTimeMillis() - startTime) -// -// processorTimerContext.stop() -// }) -// -// -// /** -// * Do we make records compliant with a given Avro schema ? 
-// */ -// if (streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).isSet) { -// try { -// val strSchema = streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString() -// val schema = RecordSchemaUtil.compileSchema(strSchema) -// -// outgoingEvents = outgoingEvents.map(record => RecordSchemaUtil.convertToValidRecord(record, schema)) -// } catch { -// case t: Throwable => -// logger.warn("something wrong while converting records " + -// "to valid accordingly to provide Avro schema " + t.getMessage) -// } -// -// } -// -// /** -// * push outgoing events and errors to Kafka -// */ -// kafkaSink.value.produce( -// streamContext.getPropertyValue(OUTPUT_TOPICS).asString, -// outgoingEvents.toList, -// serializer -// ) -// -// kafkaSink.value.produce( -// streamContext.getPropertyValue(ERROR_TOPICS).asString, -// outgoingEvents.filter(r => r.hasField(FieldDictionary.RECORD_ERRORS)).toList, -// errorSerializer -// ) -// -// pipelineTimerContext.stop() -// } -// } -// catch { -// case ex: OffsetOutOfRangeException => -// val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString -// val brokerList = streamContext.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString -// /* val latestOffsetsString = zkSink.value.loadOffsetRangesFromZookeeper( -// brokerList, -// appName, -// inputTopics.split(",").toSet) -// .map(t => s"${t._1.topic}_${t._1.partition}:${t._2}") -// .mkString(", ") -// val offestsString = offsetRanges -// .map(o => s"${o.topic}_${o.partition}:${o.fromOffset}/${o.untilOffset}") -// .mkString(", ") -// logger.error(s"unable to process partition. current Offsets $offestsString latest offsets $latestOffsetsString")*/ -// logger.error(s"exception : ${ex.toString}") -// -// } -// }) -// Some(offsetRanges) -// } -// else None -// } -//} - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamSQLAggregator.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamSQLAggregator.scala deleted file mode 100644 index 64ce57caa..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/KafkaRecordStreamSQLAggregator.scala +++ /dev/null @@ -1,145 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (bailet.thomas@gmail.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.stream.spark - -import java.util -import java.util.Collections - -import com.hurence.logisland.annotation.documentation.{CapabilityDescription, Tags} -import com.hurence.logisland.component.PropertyDescriptor -import com.hurence.logisland.record.{FieldDictionary, Record} -import com.hurence.logisland.util.spark.{ProcessorMetrics, SparkUtils} -import com.hurence.logisland.validator.StandardValidators -import org.apache.avro.Schema -import org.apache.kafka.clients.consumer.ConsumerRecord -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession -//import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange} -import org.slf4j.LoggerFactory - -import scala.collection.JavaConversions._ -import com.hurence.logisland.stream.StreamProperties._ - - -//@Tags(Array("stream", "SQL", "query", "record")) -//@CapabilityDescription("This is a stream capable of SQL query interpretations.") -//class KafkaRecordStreamSQLAggregator extends AbstractKafkaRecordStream { -// -// private val logger = LoggerFactory.getLogger(classOf[KafkaRecordStreamSQLAggregator]) -// -// -// override def getSupportedPropertyDescriptors: util.List[PropertyDescriptor] = { -// val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] -// descriptors.addAll(super.getSupportedPropertyDescriptors()) -// descriptors.add(MAX_RESULTS_COUNT) -// descriptors.add(SQL_QUERY) -// descriptors.add(OUTPUT_RECORD_TYPE) -// Collections.unmodifiableList(descriptors) -// } -// -// override def process(rdd: RDD[ConsumerRecord[Array[Byte], Array[Byte]]]): Option[Array[OffsetRange]] = { -// if (!rdd.isEmpty()) { -// // Cast the rdd to an interface that lets us get an array of OffsetRange -// // val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges -// -// val sqlContext = SparkSession -// .builder() -// .appName(appName) -// .config(ssc.sparkContext.getConf) -// .getOrCreate() -// -// // this is used to implicitly convert an RDD to a DataFrame. -// @transient lazy val deserializer = getSerializer( -// streamContext.getPropertyValue(INPUT_SERIALIZER).asString, -// streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) -// -// val inputTopics = streamContext.getPropertyValue(INPUT_TOPICS).asString -// -// //here how to handle elements that are not successfully deserialized ??? -// //currently we lose them ! -// //I think we should create an ErrorRecord containing key, value. 
-// val records: RDD[Record] = rdd.mapPartitions(p => deserializeRecords(p, deserializer).iterator) -// -// /** -// * get a Dataframe schema (either from an Avro schema or from the first record) -// */ -// val schema = try { -// val parser = new Schema.Parser -// val schema = parser.parse(streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) -// SparkUtils.convertAvroSchemaToDataframeSchema(schema) -// } -// catch { -// case e: Exception => -// logger.error("unable to add schema :{}", e.getMessage) -// SparkUtils.convertFieldsNameToSchema(records.take(1)(0)) -// } -// -// if (!records.isEmpty()) { -// -// val rows = records.filter(r => !r.hasField(FieldDictionary.RECORD_ERRORS)) -// .map(r => SparkUtils.convertToRow(r, schema)) -// -// -// sqlContext.createDataFrame(rows, schema).createOrReplaceTempView(inputTopics) -// -// -// -// -// val query = streamContext.getPropertyValue(SQL_QUERY).asString() -// val outputRecordType = streamContext.getPropertyValue(OUTPUT_RECORD_TYPE).asString() -// -// sqlContext.sql(query).rdd -// .foreachPartition(rows => { -// val outgoingEvents = rows.map(row => SparkUtils.convertToRecord(row, outputRecordType)).toList -// /** -// * create serializers -// */ -// val serializer = getSerializer( -// streamContext.getPropertyValue(OUTPUT_SERIALIZER).asString, -// streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) -// val errorSerializer = getSerializer( -// streamContext.getPropertyValue(ERROR_SERIALIZER).asString, -// streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) -// -// -// -// -// /** -// * push outgoing events and errors to Kafka -// */ -// kafkaSink.value.produce( -// streamContext.getPropertyValue(OUTPUT_TOPICS).asString, -// outgoingEvents, -// serializer -// ) -// -// kafkaSink.value.produce( -// streamContext.getPropertyValue(ERROR_TOPICS).asString, -// outgoingEvents.filter(r => r.hasField(FieldDictionary.RECORD_ERRORS)), -// errorSerializer -// ) -// -// }) -// -// -// } -// return None //Some(offsetRanges) -// } -// None -// } -//} - - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/SparkRecordStream.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/SparkRecordStream.scala deleted file mode 100644 index 19cff7c7b..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/SparkRecordStream.scala +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.stream.spark - -import com.hurence.logisland.engine.EngineContext -import com.hurence.logisland.stream.{RecordStream, StreamContext} -import org.apache.spark.streaming.StreamingContext - - -trait SparkRecordStream extends RecordStream { - - /** - * ssetup the stream with spark app properties - * - * @param appName - * @param ssc - * @param streamContext - */ - def setup(appName: String,ssc: StreamingContext, streamContext: StreamContext, engineContext: EngineContext) - def getStreamContext() : StreamingContext -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala deleted file mode 100644 index de8cb9871..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/package.scala +++ /dev/null @@ -1,546 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.stream - -import com.hurence.logisland.component.{AllowableValue, PropertyDescriptor} -import com.hurence.logisland.serializer._ -import com.hurence.logisland.stream.spark.structured.provider.StructuredStreamProviderService -import com.hurence.logisland.validator.StandardValidators - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -object StreamProperties { - - val NONE_TOPIC = "none" - - val DEFAULT_RAW_TOPIC = new AllowableValue("_raw", "default raw topic", "the incoming non structured topic") - val DEFAULT_RECORDS_TOPIC = new AllowableValue("_records", "default events topic", "the outgoing structured topic") - val DEFAULT_ERRORS_TOPIC = new AllowableValue("_errors", "default raw topic", "the outgoing structured error topic") - val DEFAULT_METRICS_TOPIC = new AllowableValue("_metrics", "default metrics topic", "the topic to place processing metrics") - - val INPUT_TOPICS: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.input.topics") - .description("Sets the input Kafka topic name") - .required(true) - .defaultValue(DEFAULT_RAW_TOPIC.getValue) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - - val OUTPUT_TOPICS: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.output.topics") - .description("Sets the output Kafka topic name") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .defaultValue(DEFAULT_RECORDS_TOPIC.getValue) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - - val ERROR_TOPICS: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.error.topics") - .description("Sets the error topics Kafka topic name") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .defaultValue(DEFAULT_ERRORS_TOPIC.getValue) - .build - - val INPUT_TOPICS_PARTITIONS: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.input.topics.partitions") - .description("if autoCreate is set to true, this will set the number of partition at topic creation time") - .required(false) - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("20") - .build - - val OUTPUT_TOPICS_PARTITIONS: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.output.topics.partitions") - .description("if autoCreate is set to true, this will set the number of partition at topic creation time") - .required(false) - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("20") - .build - - val AVRO_INPUT_SCHEMA: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("avro.input.schema") - .description("the avro schema definition") - .required(false) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - - val AVRO_OUTPUT_SCHEMA: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("avro.output.schema") - .description("the avro schema definition for the output serialization") - .required(false) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - - val AVRO_SERIALIZER = new AllowableValue(classOf[AvroSerializer].getName, - "avro serialization", "serialize events as avro blocs") - val JSON_SERIALIZER = new AllowableValue(classOf[JsonSerializer].getName, - "json serialization", "serialize events as json blocs") - val EXTENDED_JSON_SERIALIZER = new AllowableValue(classOf[ExtendedJsonSerializer].getName, - "extended json serialization", "serialize events as json blocs supporting nested objects/arrays") - val KRYO_SERIALIZER = new AllowableValue(classOf[KryoSerializer].getName, - "kryo serialization", "serialize events as binary blocs") - val STRING_SERIALIZER = new AllowableValue(classOf[StringSerializer].getName, - "string serialization", "serialize events as string") - val BYTESARRAY_SERIALIZER = new AllowableValue(classOf[BytesArraySerializer].getName, - "byte array serialization", "serialize events as byte arrays") - val 
KURA_PROTOCOL_BUFFER_SERIALIZER = new AllowableValue(classOf[KuraProtobufSerializer].getName, - "Kura Protobuf serialization", "serialize events as Kura protocol buffer") - val NO_SERIALIZER = new AllowableValue("none", "no serialization", "send events as bytes") - - val INPUT_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.input.topics.serializer") - .description("") - .required(false) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER) - .defaultValue(KRYO_SERIALIZER.getValue) - .build - - val OUTPUT_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.output.topics.serializer") - .description("") - .required(false) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER) - .defaultValue(KRYO_SERIALIZER.getValue) - .build - - val ERROR_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.error.topics.serializer") - .description("") - .required(false) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .defaultValue(JSON_SERIALIZER.getValue) - .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER) - .build - - - val KAFKA_TOPIC_AUTOCREATE: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.topic.autoCreate") - .description("define wether a topic should be created automatically if not already exists") - .required(false) - .addValidator(StandardValidators.BOOLEAN_VALIDATOR) - .defaultValue("true") - .build - - val KAFKA_TOPIC_DEFAULT_PARTITIONS: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.topic.default.partitions") - .description("if autoCreate is set to true, this will set the number of partition at topic creation time") - .required(false) - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("20") - .build - - val KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.topic.default.replicationFactor") - .description("if autoCreate is set to true, this will set the number of replica for each partition at topic creation time") - .required(false) - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("3") - .build - - val KAFKA_METADATA_BROKER_LIST: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.metadata.broker.list") - .description("a comma separated list of host:port brokers") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .defaultValue("sandbox:9092") - .build - - val KAFKA_ZOOKEEPER_QUORUM: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.zookeeper.quorum") - .description("") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .defaultValue("sandbox:2181") - .build - - val LATEST_OFFSET = new AllowableValue("latest", "latest", "the offset to the latest offset") - val EARLIEST_OFFSET = new AllowableValue("earliest", "earliest offset", "the offset to the earliest offset") - val NONE_OFFSET = new AllowableValue("none", "none offset", "the latest saved offset") - - val KAFKA_MANUAL_OFFSET_RESET: PropertyDescriptor = new PropertyDescriptor.Builder() - 
.name("kafka.manual.offset.reset") - .description("What to do when there is no initial offset in Kafka or if the current offset does not exist " + - "any more on the server (e.g. because that data has been deleted):\n" + - "earliest: automatically reset the offset to the earliest offset\n" + - "latest: automatically reset the offset to the latest offset\n" + - "none: throw exception to the consumer if no previous offset is found for the consumer's group\n" + - "anything else: throw exception to the consumer.") - .required(false) - .allowableValues(LATEST_OFFSET, EARLIEST_OFFSET, NONE_OFFSET) - .defaultValue(EARLIEST_OFFSET.getValue) - .build - - - val KAFKA_BATCH_SIZE: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.batch.size") - .description("measures batch size in total bytes instead of the number of messages. " + - "It controls how many bytes of data to collect before sending messages to the Kafka broker. " + - "Set this as high as possible, without exceeding available memory. The default value is 16384.\n\n" + - "If you increase the size of your buffer, it might never get full." + - "The Producer sends the information eventually, based on other triggers, such as linger time in milliseconds. " + - "Although you can impair memory usage by setting the buffer batch size too high, " + - "this does not impact latency.\n\n" + - "If your producer is sending all the time, " + - "you are probably getting the best throughput possible. If the producer is often idle, " + - "you might not be writing enough data to warrant the current allocation of resources.") - .required(false) - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("16384") - .build - - - val KAFKA_LINGER_MS: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.linger.ms") - .description("linger.ms sets the maximum time to buffer data in asynchronous mode. " + - "For example, a setting of 100 batches 100ms of messages to send at once. " + - "This improves throughput, but the buffering adds message delivery latency.\n\n" + - "By default, the producer does not wait. It sends the buffer any time data is available.\n\n" + - "Instead of sending immediately, you can set linger.ms to 5 and send more messages in one batch." + - " This would reduce the number of requests sent, but would add up to 5 milliseconds of latency to records " + - "sent, even if the load on the system does not warrant the delay.\n\n" + - "The farther away the broker is from the producer, the more overhead required to send messages. " + - "Increase linger.ms for higher latency and higher throughput in your producer.") - .required(false) - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("5") - .build - - val KAFKA_ACKS: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("kafka.acks") - .description("The number of acknowledgments the producer requires the leader to have received before considering a request complete. This controls the " - + " durability of records that are sent. The following settings are common: " - + "
      " - + "
    • acks=0 If set to zero then the producer will not wait for any acknowledgment from the" - + " server at all. The record will be immediately added to the socket buffer and considered sent. No guarantee can be" - + " made that the server has received the record in this case, and the retries configuration will not" - + " take effect (as the client won't generally know of any failures). The offset given back for each record will" - + " always be set to -1." - + "
    • acks=1 This will mean the leader will write the record to its local log but will respond" - + " without awaiting full acknowledgement from all followers. In this case should the leader fail immediately after" - + " acknowledging the record but before the followers have replicated it then the record will be lost." - + "
    • acks=all This means the leader will wait for the full set of in-sync replicas to" - + " acknowledge the record. This guarantees that the record will not be lost as long as at least one in-sync replica" - + " remains alive. This is the strongest available guarantee.") - .required(false) - .defaultValue("all") - .build - - - val WINDOW_DURATION: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("window.duration") - .description("all the elements in seen in a sliding window of time over. windowDuration = width of the window; must be a multiple of batching interval") - .addValidator(StandardValidators.LONG_VALIDATOR) - .required(false) - .build - - val SLIDE_DURATION: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("slide.duration") - .description("sliding interval of the window (i.e., the interval after which the new DStream will generate RDDs); must be a multiple of batching interval") - .addValidator(StandardValidators.LONG_VALIDATOR) - .required(false) - .build - - val GROUPBY: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("groupby") - .description("comma separated list of fields to group the partition by") - .addValidator(StandardValidators.COMMA_SEPARATED_LIST_VALIDATOR) - .required(false) - .build - - val STATE_TIMEOUT_MS: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("state.timeout.ms") - .description("the time in ms before we invalidate the microbatch state") - .addValidator(StandardValidators.LONG_VALIDATOR) - .required(false) - .defaultValue("2000") - .build - - val CHUNK_SIZE: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("chunk.size") - .description("the number of records to group into chunks") - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .required(false) - .defaultValue("100") - .build - ////////////////////////////////////// - // MQTT options - ////////////////////////////////////// - - val MQTT_BROKER_URL: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("mqtt.broker.url") - .description("brokerUrl A url MqttClient connects to. Set this or path as the url of the Mqtt Server. e.g. tcp://localhost:1883") - .addValidator(StandardValidators.URL_VALIDATOR) - .defaultValue("tcp://localhost:1883") - .required(false) - .build - - val MQTT_PERSISTENCE: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("mqtt.persistence") - .description("persistence By default it is used for storing incoming messages on disk. " + - "If memory is provided as value for this option, then recovery on restart is not supported.") - .defaultValue("memory") - .required(false) - .build - - val MQTT_TOPIC: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("mqtt.topic") - .description("Topic MqttClient subscribes to.") - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .required(true) - .build - - val MQTT_CLIENTID: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("mqtt.client.id") - .description("clientID this client is associated. 
Provide the same value to recover a stopped client.") - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .required(true) - .build - - val MQTT_QOS: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("mqtt.qos") - .description(" QoS The maximum quality of service to subscribe each topic at.Messages published at a lower " + - "quality of service will be received at the published QoS.Messages published at a higher quality of " + - "service will be received using the QoS specified on the subscribe") - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("0") - .required(false) - .build - - val MQTT_USERNAME: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("mqtt.username") - .description(" username Sets the user name to use for the connection to Mqtt Server. " + - "Do not set it, if server does not need this. Setting it empty will lead to errors.") - .required(false) - .build - - val MQTT_PASSWORD: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("mqtt.password") - .description("password Sets the password to use for the connection") - .required(false) - .build - - val MQTT_CLEAN_SESSION: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("mqtt.clean.session") - .description("cleanSession Setting it true starts a clean session, removes all checkpointed messages by " + - "a previous run of this source. This is set to false by default.") - .addValidator(StandardValidators.BOOLEAN_VALIDATOR) - .defaultValue("true") - .required(false) - .build - - val MQTT_CONNECTION_TIMEOUT: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("mqtt.connection.timeout") - .description("connectionTimeout Sets the connection timeout, a value of 0 is interpreted as " + - "wait until client connects. 
See MqttConnectOptions.setConnectionTimeout for more information") - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("5000") - .required(false) - .build - - val MQTT_KEEP_ALIVE: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("mqtt.keep.alive") - .description("keepAlive Same as MqttConnectOptions.setKeepAliveInterval.") - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("5000") - .required(false) - .build - - - val MQTT_VERSION: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("mqtt.version") - .description("mqttVersion Same as MqttConnectOptions.setMqttVersion") - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("5000") - .required(false) - .build - - val READ_TOPICS: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("read.topics") - .description("the input path for any topic to be read from") - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .required(true) - .build - - val READ_TOPICS_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("read.topics.serializer") - .description("the serializer to use") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER, KURA_PROTOCOL_BUFFER_SERIALIZER) - .defaultValue(NO_SERIALIZER.getValue) - .build - - val READ_TOPICS_KEY_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("read.topics.key.serializer") - .description("The key serializer to use") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, KURA_PROTOCOL_BUFFER_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER) - .defaultValue(NO_SERIALIZER.getValue) - .build - - val READ_STREAM_SERVICE_PROVIDER: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("read.stream.service.provider") - .description("the controller service that gives connection information") - .required(true) - .identifiesControllerService(classOf[StructuredStreamProviderService]) - .build - - - val WRITE_TOPICS: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("write.topics") - .description("the input path for any topic to be written to") - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .required(true) - .build - - val WRITE_TOPICS_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("write.topics.serializer") - .description("the serializer to use") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER, KURA_PROTOCOL_BUFFER_SERIALIZER) - .defaultValue(NO_SERIALIZER.getValue) - .build - - val WRITE_TOPICS_KEY_SERIALIZER: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("write.topics.key.serializer") - .description("The key serializer to use") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .allowableValues(KRYO_SERIALIZER, JSON_SERIALIZER, EXTENDED_JSON_SERIALIZER, AVRO_SERIALIZER, BYTESARRAY_SERIALIZER, STRING_SERIALIZER, NO_SERIALIZER, KURA_PROTOCOL_BUFFER_SERIALIZER) - .defaultValue(NO_SERIALIZER.getValue) - .build - - val WRITE_STREAM_SERVICE_PROVIDER: PropertyDescriptor = new PropertyDescriptor.Builder() - 
.name("write.stream.service.provider") - .description("the controller service that gives connection information") - .required(true) - .identifiesControllerService(classOf[StructuredStreamProviderService]) - .build - - - ////////////////////////////////////// - // HDFS options - ////////////////////////////////////// - val FILE_FORMAT_PARQUET = "parquet" - val FILE_FORMAT_ORC = "orc" - val FILE_FORMAT_JSON = "json" - val FILE_FORMAT_TXT = "txt" - - val OUTPUT_FOLDER_PATH = new PropertyDescriptor.Builder() - .name("output.folder.path") - .description("the location where to put files : file:///tmp/out") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - - - val INPUT_FORMAT = new PropertyDescriptor.Builder() - .name("input.format") - .description("Used to load data from a raw record_value. Only json supported") - .required(false) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .defaultValue("") - .build - - val OUTPUT_FORMAT = new PropertyDescriptor.Builder() - .name("output.format") - .description("can be parquet, orc csv") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .allowableValues(FILE_FORMAT_PARQUET, FILE_FORMAT_TXT, FILE_FORMAT_JSON, FILE_FORMAT_JSON) - .build - - val RECORD_TYPE = new PropertyDescriptor.Builder() - .name("record.type") - .description("the type of event to filter") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - - val NUM_PARTITIONS = new PropertyDescriptor.Builder() - .name("num.partitions") - .description("the numbers of physical files on HDFS") - .required(false) - .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR) - .defaultValue("4") - .build - - val EXCLUDE_ERRORS = new PropertyDescriptor.Builder() - .name("exclude.errors") - .description("do we include records with errors ?") - .required(false) - .addValidator(StandardValidators.BOOLEAN_VALIDATOR) - .defaultValue("true") - .build - - val DATE_FORMAT = new PropertyDescriptor.Builder() - .name("date.format") - .description("The format of the date for the partition") - .required(false) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .defaultValue("yyyy-MM-dd") - .build - - - ////////////////////////////////////// - // SQL options - ////////////////////////////////////// - val SQL_QUERY = new PropertyDescriptor.Builder() - .name("sql.query") - .description("The SQL query to execute, " + - "please note that the table name must exists in input topics names") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - - val MAX_RESULTS_COUNT = new PropertyDescriptor.Builder() - .name("max.results.count") - .description("the max number of rows to output. 
(-1 for no limit)") - .required(false) - .addValidator(StandardValidators.INTEGER_VALIDATOR) - .defaultValue("-1") - .build - - val OUTPUT_RECORD_TYPE = new PropertyDescriptor.Builder() - .name("output.record.type") - .description("the output type of the record") - .required(false) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .defaultValue("aggregation") - .build - - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectBaseProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectBaseProviderService.scala deleted file mode 100644 index 61d3d2592..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectBaseProviderService.scala +++ /dev/null @@ -1,112 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.stream.spark.provider - -import java.util -import java.util.Collections - -import com.hurence.logisland.annotation.lifecycle.OnEnabled -import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} -import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} -import com.hurence.logisland.record.Record -import com.hurence.logisland.stream.StreamContext -import com.hurence.logisland.stream.spark.structured.provider.StructuredStreamProviderService -import com.hurence.logisland.util.spark.ControllerServiceLookupSink -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql.streaming.DataStreamWriter -import org.apache.spark.sql.{Dataset, SparkSession} - -class KafkaConnectBaseProviderService extends AbstractControllerService with StructuredStreamProviderService { - - var connectorProperties = "" - var keyConverter = "" - var valueConverter = "" - var keyConverterProperties = "" - var valueConverterProperties = "" - var maxConfigurations = 1 - var delegateConnectorClass = "" - var offsetBackingStore = "" - var offsetBackingStoreProperties = "" - - @OnEnabled - @throws[InitializationException] - override def init(context: ControllerServiceInitializationContext): Unit = { - super.init(context) - this.synchronized { - try { - delegateConnectorClass = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_CONNECTOR_CLASS).asString() - connectorProperties = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_CONNECTOR_PROPERTIES).asString() - valueConverter = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER).asString() - valueConverterProperties = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER_PROPERTIES).asString() - 
keyConverter = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER).asString() - keyConverterProperties = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER_PROPERTIES).asString() - maxConfigurations = (context getPropertyValue StreamOptions.KAFKA_CONNECT_MAX_TASKS).asInteger() - offsetBackingStore = (context getPropertyValue StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE).asString() - offsetBackingStoreProperties = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE_PROPERTIES).asString() - } catch { - case e: Exception => - throw new InitializationException(e) - } - } - } - - - /** - * Allows subclasses to register which property descriptor objects are - * supported. - * - * @return PropertyDescriptor objects this processor currently supports - */ - override def getSupportedPropertyDescriptors() = { - val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] - descriptors.add(StreamOptions.KAFKA_CONNECT_CONNECTOR_CLASS) - descriptors.add(StreamOptions.KAFKA_CONNECT_CONNECTOR_PROPERTIES) - descriptors.add(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER) - descriptors.add(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER_PROPERTIES) - descriptors.add(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER) - descriptors.add(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER_PROPERTIES) - descriptors.add(StreamOptions.KAFKA_CONNECT_MAX_TASKS) - descriptors.add(StreamOptions.KAFKA_CONNECT_MAX_PARTITIONS) - descriptors.add(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE) - descriptors.add(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE_PROPERTIES) - Collections.unmodifiableList(descriptors) - } - - - /** - * create a streaming DataFrame that represents data received - * - * @param spark - * @param streamContext - * @return DataFrame currently loaded - */ - override def read(spark: SparkSession, streamContext: StreamContext): Dataset[Record] = { - throw new UnsupportedOperationException("Operation not supported. Please be sure to use the right component") - } - - - /** - * create a streaming DataFrame that represents data to be written - * - * @param streamContext - * @return DataFrame currently loaded - */ - override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[_] = { - throw new UnsupportedOperationException("Operation not supported. Please be sure to use the right component") - } - - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSinkProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSinkProviderService.scala deleted file mode 100644 index 066a41711..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSinkProviderService.scala +++ /dev/null @@ -1,122 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.stream.spark.provider - -import com.hurence.logisland.annotation.lifecycle.OnEnabled -import com.hurence.logisland.component.InitializationException -import com.hurence.logisland.connect.Utils -import com.hurence.logisland.connect.sink.KafkaConnectStreamSink -import com.hurence.logisland.controller.ControllerServiceInitializationContext -import com.hurence.logisland.record.{FieldDictionary, Record} -import com.hurence.logisland.stream.{StreamContext, StreamProperties} -import com.hurence.logisland.util.spark.ControllerServiceLookupSink -import org.apache.kafka.connect.sink.SinkConnector -import org.apache.spark.TaskContext -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql._ -import org.apache.spark.sql.streaming.DataStreamWriter - -class KafkaConnectStructuredSinkProviderService extends KafkaConnectBaseProviderService { - - - var maxPartitions = 1 - @transient var writer: KafkaConnectStreamSink = null - - @OnEnabled - @throws[InitializationException] - override def init(context: ControllerServiceInitializationContext): Unit = { - super.init(context) - this.synchronized { - try { - maxPartitions = maxConfigurations - if (context.getPropertyValue(StreamOptions.KAFKA_CONNECT_MAX_PARTITIONS).isSet) { - maxPartitions = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_MAX_PARTITIONS).asInteger() - } - } catch { - case e: Exception => - throw new InitializationException(e) - } - } - } - - - /** - * create a streaming DataFrame that represents data to be written - * - * @param streamContext - * @return DataFrame currently loaded - */ - override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[_] = { - implicit val encoder = Encoders.tuple(Encoders.BINARY, Encoders.BINARY) - val df2 = df - .mapPartitions(record => record.map(record => (record.getField(FieldDictionary.RECORD_KEY).getRawValue().asInstanceOf[Array[Byte]], - record.getField(FieldDictionary.RECORD_VALUE).getRawValue().asInstanceOf[Array[Byte]]))) - .toDF("key", "value") - - val topicName = streamContext.getPropertyValue(StreamProperties.WRITE_TOPICS).asString().split(",")(0).trim - - def writer() = controllerServiceLookupSink.value.getControllerService(getIdentifier).asInstanceOf[KafkaConnectStructuredSinkProviderService] - .createWriter(SparkSession.builder().getOrCreate().sqlContext, streamContext, topicName) - - df2/*.repartition(maxPartitions, df2.col("key"))*/ - .writeStream - .foreach(new ForeachWriter[Row] { - - override def process(value: Row): Unit = { - writer().enqueueOnPartition(TaskContext.getPartitionId(), value.getAs(0), value.getAs(1)) - } - - override def close(errorOrNull: Throwable): Unit = { - if (errorOrNull != null) { - logger.error("Error while storing data", errorOrNull) - } - writer().flushPartition(TaskContext.getPartitionId()) - } - - override def open(partitionId: Long, version: Long): Boolean = { - writer().openPartition(partitionId.intValue()) - } - }) - } - - - def createWriter(sqlContext: SQLContext, streamContext: 
StreamContext, topic: String): KafkaConnectStreamSink = - synchronized { - - if (writer == null) { - val keyConverterInstance = Utils.createConverter(keyConverter, keyConverterProperties, true) - val valueConverterInstance = Utils.createConverter(valueConverter, valueConverterProperties, false) - //create the right backing store - val offsetBackingStoreInstance = Utils.createOffsetBackingStore(offsetBackingStore, Utils.propertiesToMap(offsetBackingStoreProperties)) - - writer = new KafkaConnectStreamSink( - sqlContext, - Utils.propertiesToMap(connectorProperties), - keyConverterInstance, - valueConverterInstance, - offsetBackingStoreInstance, - maxConfigurations, - topic, - delegateConnectorClass, - streamContext.getIdentifier) - writer.start() - } - - writer - } - - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSourceProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSourceProviderService.scala deleted file mode 100644 index 97b30d87b..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/KafkaConnectStructuredSourceProviderService.scala +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.hurence.logisland.stream.spark.provider - -import com.hurence.logisland.annotation.lifecycle.OnEnabled -import com.hurence.logisland.component.InitializationException -import com.hurence.logisland.controller.ControllerServiceInitializationContext -import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord} -import com.hurence.logisland.stream.StreamContext -import org.apache.spark.SparkContext -import org.apache.spark.sql.{Dataset, SparkSession} - -class KafkaConnectStructuredSourceProviderService extends KafkaConnectBaseProviderService { - - var maxPartitions = 1 - - @OnEnabled - @throws[InitializationException] - override def init(context: ControllerServiceInitializationContext): Unit = { - super.init(context) - this.synchronized { - try { - maxPartitions = SparkContext.getOrCreate().defaultParallelism - if (context.getPropertyValue(StreamOptions.KAFKA_CONNECT_MAX_PARTITIONS).isSet) { - maxPartitions = context.getPropertyValue(StreamOptions.KAFKA_CONNECT_MAX_PARTITIONS).asInteger() - } - } catch { - case e: Exception => - throw new InitializationException(e) - } - } - } - - - /** - * create a streaming DataFrame that represents data received - * - * @param spark - * @param streamContext - * @return DataFrame currently loaded - */ - override def read(spark: SparkSession, streamContext: StreamContext): Dataset[Record] = { - import spark.implicits._ - implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] - - getLogger.info(s"Connecting kafka-connect source $delegateConnectorClass") - spark.readStream - .format("com.hurence.logisland.connect.source.KafkaConnectStreamSourceProvider") - .option(StreamOptions.KAFKA_CONNECT_CONNECTOR_PROPERTIES.getName, connectorProperties) - .option(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER.getName, keyConverter) - .option(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER_PROPERTIES.getName, keyConverterProperties) - .option(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER.getName, valueConverter) - .option(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER_PROPERTIES.getName, valueConverterProperties) - .option(StreamOptions.KAFKA_CONNECT_MAX_TASKS.getName, maxConfigurations) - .option(StreamOptions.KAFKA_CONNECT_CONNECTOR_CLASS.getName, delegateConnectorClass) - .option(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE.getName, offsetBackingStore) - .option(StreamOptions.KAFKA_CONNECT_OFFSET_BACKING_STORE_PROPERTIES.getName, offsetBackingStoreProperties) - .load(streamContext.getIdentifier) - //Topic, Partition, Key, Value - .as[(String, String, String, Array[Byte], Array[Byte])] - .map(r => - new StandardRecord("kafka_connect") - .setField(FieldDictionary.RECORD_KEY, FieldType.BYTES, r._4) - .setField(FieldDictionary.RECORD_VALUE, FieldType.BYTES, r._5)) - .coalesce(maxPartitions) - } - - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/package.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/package.scala deleted file mode 100644 index 0c7ff6c9f..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/provider/package.scala +++ /dev/null @@ -1,126 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - 
* Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.hurence.logisland.stream.spark.provider - -import com.hurence.logisland.component.{AllowableValue, PropertyDescriptor} -import com.hurence.logisland.validator.StandardValidators -import org.apache.kafka.connect.connector.Connector -import org.apache.kafka.connect.runtime.standalone.StandaloneConfig -import org.apache.kafka.connect.storage.Converter - -object StreamOptions { - - val MEMORY_BACKING_STORE = new AllowableValue("memory", "In memory backing store", - "Standalone in memory offset backing store. Not suitable for clustered deployments unless source is unique or stateless") - - val FILE_BACKING_STORE = new AllowableValue("file", "File based backing store", - "Standalone filesystem based offset backing store. " + - "You have to specify the property " + StandaloneConfig.OFFSET_STORAGE_FILE_FILENAME_CONFIG + " for the file path." + - "Not suitable for clustered deployments unless source is unique or standalone") - - val KAFKA_BACKING_STORE = new AllowableValue("kafka", "Kafka topic based backing store", - "Distributed kafka topic based offset backing store. " + - "See the javadoc of class org.apache.kafka.connect.storage.KafkaOffsetBackingStore for the configuration options." + - "This backing store is well suited for distributed deployments.") - - - ////////////////////////////////////// - // Kafka Connect options - ////////////////////////////////////// - - - val KAFKA_CONNECT_CONNECTOR_CLASS = new PropertyDescriptor.Builder() - .name("kc.connector.class") - .description("The class canonical name of the kafka connector to use.") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - - val KAFKA_CONNECT_CONNECTOR_PROPERTIES = new PropertyDescriptor.Builder() - .name("kc.connector.properties") - .description("The properties (key=value) for the connector.") - .required(false) - .defaultValue("") - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - - val KAFKA_CONNECT_MAX_TASKS = new PropertyDescriptor.Builder() - .name("kc.worker.tasks.max") - .description("Max number of threads for this connector") - .required(true) - .defaultValue("1") - .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) - .build - - val KAFKA_CONNECT_MAX_PARTITIONS = new PropertyDescriptor.Builder() - .name("kc.partitions.max") - .description("Max number of partitions for this connector.") - .required(false) - .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) - .build - - val KAFKA_CONNECT_KEY_CONVERTER = new PropertyDescriptor.Builder() - .name("kc.data.key.converter") - .description("Key converter class") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .addValidator(StandardValidators.TYPE_VALIDATOR(classOf[Converter])) - .build - - val KAFKA_CONNECT_VALUE_CONVERTER = new PropertyDescriptor.Builder() - .name("kc.data.value.converter") - .description("Value converter class") - .required(true) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) 
- .addValidator(StandardValidators.TYPE_VALIDATOR(classOf[Converter])) - .build - - val KAFKA_CONNECT_KEY_CONVERTER_PROPERTIES = new PropertyDescriptor.Builder() - .name("kc.data.key.converter.properties") - .description("Key converter properties") - .required(false) - .defaultValue("") - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - - val KAFKA_CONNECT_VALUE_CONVERTER_PROPERTIES = new PropertyDescriptor.Builder() - .name("kc.data.value.converter.properties") - .description("Value converter properties") - .required(false) - .defaultValue("") - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - - - val KAFKA_CONNECT_OFFSET_BACKING_STORE = new PropertyDescriptor.Builder() - .name("kc.connector.offset.backing.store") - .required(false) - .description("The underlying backing store to be used.") - .defaultValue(MEMORY_BACKING_STORE.getValue) - .allowableValues(MEMORY_BACKING_STORE, FILE_BACKING_STORE, KAFKA_BACKING_STORE) - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build() - - val KAFKA_CONNECT_OFFSET_BACKING_STORE_PROPERTIES = new PropertyDescriptor.Builder() - .name("kc.connector.offset.backing.store.properties") - .description("Properties to configure the offset backing store") - .required(false) - .defaultValue("") - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala deleted file mode 100644 index d5ebc9427..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala +++ /dev/null @@ -1,184 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.stream.spark.structured - -import java.util -import java.util.Collections - -import com.hurence.logisland.component.PropertyDescriptor -import com.hurence.logisland.engine.EngineContext -import com.hurence.logisland.engine.spark.remote.PipelineConfigurationBroadcastWrapper -import com.hurence.logisland.stream.StreamProperties._ -import com.hurence.logisland.stream.spark.SparkRecordStream -import com.hurence.logisland.stream.spark.structured.provider.StructuredStreamProviderService -import com.hurence.logisland.stream.{AbstractRecordStream, StreamContext} -import com.hurence.logisland.util.spark._ -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.groupon.metrics.UserMetricsSystem -import org.apache.spark.sql.{Dataset, SQLContext, SparkSession} -import org.apache.spark.streaming.StreamingContext -import org.slf4j.LoggerFactory - - -class StructuredStream extends AbstractRecordStream with SparkRecordStream { - - - protected var provider: StructuredStreamProviderService = _ - - - protected var appName: String = "" - @transient protected var ssc: StreamingContext = _ - @transient protected var streamContext: StreamContext = _ - protected var engineContext: EngineContext = _ - protected var controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink] = _ - protected var needMetricsReset = false - - - private val logger = LoggerFactory.getLogger(this.getClass) - - override def getSupportedPropertyDescriptors() = { - val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] - - descriptors.add(READ_STREAM_SERVICE_PROVIDER) - descriptors.add(READ_TOPICS_SERIALIZER) - descriptors.add(READ_TOPICS_KEY_SERIALIZER) - descriptors.add(WRITE_STREAM_SERVICE_PROVIDER) - descriptors.add(WRITE_TOPICS_SERIALIZER) - descriptors.add(WRITE_TOPICS_KEY_SERIALIZER) - descriptors.add(GROUPBY) - descriptors.add(STATE_TIMEOUT_MS) - descriptors.add(CHUNK_SIZE) - - Collections.unmodifiableList(descriptors) - } - - override def setup(appName: String, ssc: StreamingContext, streamContext: StreamContext, engineContext: EngineContext) = { - this.appName = appName - this.ssc = ssc - this.streamContext = streamContext - this.engineContext = engineContext - } - - override def getStreamContext(): StreamingContext = this.ssc - - override def start() = { - if (ssc == null) - throw new IllegalStateException("stream not initialized") - - try { - - val pipelineMetricPrefix = streamContext.getIdentifier /*+ ".partition" + partitionId*/ + "." 
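// A condensed map of what the rest of this start() method does, drawn only from the
// statements that follow in this hunk: broadcast a ControllerServiceLookupSink built from
// the engine's controller service configurations, obtain the shared SparkSession from the
// StreamingContext's SparkConf (shuffle partitions are hard-coded to 4 and flagged TODO),
// wire the controller service lookup into the stream context, refresh the process contexts
// from PipelineConfigurationBroadcastWrapper, then resolve the READ_STREAM_SERVICE_PROVIDER
// and WRITE_STREAM_SERVICE_PROVIDER controller services and chain them, roughly:
//
//   val readDF = readStreamService.load(spark, controllerServiceLookupSink, streamContext)
//   writeStreamService.save(readDF, controllerServiceLookupSink, streamContext)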
- val pipelineTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + "Pipeline.processing_time_ms").time() - - controllerServiceLookupSink = ssc.sparkContext.broadcast( - ControllerServiceLookupSink(engineContext.getControllerServiceConfigurations) - ) - val spark = SparkSession.builder() - .config(this.ssc.sparkContext.getConf) - .getOrCreate() - - spark.sqlContext.setConf("spark.sql.shuffle.partitions", "4")//TODO make this configurable - - - val controllerServiceLookup = controllerServiceLookupSink.value.getControllerServiceLookup() - streamContext.setControllerServiceLookup(controllerServiceLookup) - - - val readStreamService = streamContext.getPropertyValue(READ_STREAM_SERVICE_PROVIDER) - .asControllerService() - .asInstanceOf[StructuredStreamProviderService] - - //TODO stange way to update streamcontext, should'nt it be broadcasted ? - // moreover the streamcontext should always be the last updated one in this function for me. - // If driver wants to change it, it should call setup which would use a broadcast value for example ? - // Unfortunately we should not attempt changes before having good unit test so that we do not broke streams - // while cleaning streams code... Indeed I am afraid the remote api engines use this strange behaviour here - // to change config on the fly when it should use the setup method (maybe using broadcast as well). - // In this method start, the config should be considered already up to date in my opinion. - streamContext.getProcessContexts.clear() - streamContext.getProcessContexts.addAll( - PipelineConfigurationBroadcastWrapper.getInstance().get(streamContext.getIdentifier)) - - val readDF = readStreamService.load(spark, controllerServiceLookupSink, streamContext) - - val writeStreamService = streamContext.getPropertyValue(WRITE_STREAM_SERVICE_PROVIDER) - .asControllerService() - .asInstanceOf[StructuredStreamProviderService] - - // Write key-value data from a DataFrame to a specific Kafka topic specified in an option - val ds = writeStreamService.save(readDF, controllerServiceLookupSink, streamContext) - pipelineTimerContext.stop() - - } - catch { - case ex: Throwable => - logger.error("Error while processing the streaming query. 
", ex) - throw new IllegalStateException("Error while processing the streaming query", ex) - } - } - - override def stop(): Unit - - = { - super.stop() - //stop the source - val thisStream = SQLContext.getOrCreate(getStreamContext().sparkContext).streams.active.find(stream => streamContext.getIdentifier.equals(stream.name)); - if (thisStream.isDefined) { - if (!getStreamContext().sparkContext.isStopped && thisStream.get.isActive) { - try { - thisStream.get.stop() - thisStream.get.awaitTermination() - } catch { - case ex: Throwable => logger.warn(s"Stream ${streamContext.getIdentifier} may not have been correctly stopped") - } - } - } else { - logger.warn(s"Unable to find an active streaming query for stream ${streamContext.getIdentifier}") - } - } -} - - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/ConsoleStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/ConsoleStructuredStreamProviderService.scala deleted file mode 100644 index af129a23d..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/ConsoleStructuredStreamProviderService.scala +++ /dev/null @@ -1,183 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.stream.spark.structured.provider - -import java.util -import java.util.Collections - -import com.hurence.logisland.annotation.documentation.CapabilityDescription -import com.hurence.logisland.annotation.lifecycle.OnEnabled -import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} -import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} -import com.hurence.logisland.record.Record -import com.hurence.logisland.serializer.SerializerProvider -import com.hurence.logisland.stream.StreamContext -import com.hurence.logisland.stream.StreamProperties.{AVRO_OUTPUT_SCHEMA, WRITE_TOPICS_KEY_SERIALIZER, WRITE_TOPICS_SERIALIZER} -import com.hurence.logisland.util.spark.ControllerServiceLookupSink -import com.hurence.logisland.validator.StandardValidators -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql.streaming.{DataStreamWriter, OutputMode, StreamingQuery} -import org.apache.spark.sql.{Dataset, ForeachWriter, SparkSession} - -@CapabilityDescription("Provide a ways to print output in console in a StructuredStream streams") -class ConsoleStructuredStreamProviderService extends AbstractControllerService with StructuredStreamProviderService { - - val NUM_ROWS_TO_SHOW: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("rows") - .description("Number of rows to print every trigger (default: 20 see spark documentation)") - .addValidator(StandardValidators.POSITIVE_LONG_VALIDATOR) - .required(true) - .build - - val TRUNCATE_OUTPUT: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("truncate") - .description("Whether to truncate the output if too long (default: true see spark documentation) ") - .addValidator(StandardValidators.BOOLEAN_VALIDATOR) - .required(false) - .build - - var numRows: Option[Long] = _ - var truncate: Option[Boolean] = _ - - @OnEnabled - @throws[InitializationException] - override def init(context: ControllerServiceInitializationContext): Unit = { - super.init(context) - this.synchronized { - try { - if (context.getPropertyValue(NUM_ROWS_TO_SHOW).isSet) { - numRows = Some(context.getPropertyValue(NUM_ROWS_TO_SHOW).asLong()) - } else { - numRows = None - } - if (context.getPropertyValue(TRUNCATE_OUTPUT).isSet) { - truncate = Some(context.getPropertyValue(TRUNCATE_OUTPUT).asBoolean()) - } else { - truncate = None - } - } catch { - case e: Exception => - throw new InitializationException(e) - } - } - } - - /** - * Allows subclasses to register which property descriptor objects are - * supported. 
- * - * @return PropertyDescriptor objects this processor currently supports - */ - override def getSupportedPropertyDescriptors() = { - val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] - - Collections.unmodifiableList(descriptors) - } - - /** - * create a streaming DataFrame that represents data received - * - * @param spark - * @param streamContext - * @return DataFrame currently loaded - */ - override def read(spark: SparkSession, streamContext: StreamContext) = { - throw new IllegalArgumentException("ConsoleStructuredStreamProviderService class does not support read operation yet"); - } - - /** - * create a streaming DataFrame that represents data received - * - * @param streamContext - * @return DataFrame currently loaded - */ - override def save(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): StreamingQuery = { - - implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] - - // make sure controller service lookup won't be serialized !! - streamContext.setControllerServiceLookup(null) - - // create serializer - val serializer = SerializerProvider.getSerializer( - streamContext.getPropertyValue(WRITE_TOPICS_SERIALIZER).asString, - streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) - - // create serializer - val keySerializer = SerializerProvider.getSerializer( - streamContext.getPropertyValue(WRITE_TOPICS_KEY_SERIALIZER).asString, null) - - // do the parallel processing - val df2 = df.mapPartitions(record => record.map(record => serializeRecords(serializer, keySerializer, record))) - - - write(df2, controllerServiceLookupSink, streamContext) - .queryName(streamContext.getIdentifier) - // .outputMode("update") - .foreach(new ForeachWriter[Record] { - def open(partitionId: Long, version: Long): Boolean = { - // open connection - true - } - - def process(record: Record) = { - println(record) - // write string to connection - } - - def close(errorOrNull: Throwable): Unit = { - // close the connection - } - }).start() - - // .processAllAvailable() - - } - /** - * create a streaming DataFrame that represents data received - * - * @param streamContext - * @return DataFrame currently loaded - */ - override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[Record] = { -// implicit val myObjEncoder = org.apache.spark.sql.Encoders.kryo[Record] - - val dataStreamWriter = df.writeStream - .format("console") - if (numRows.isDefined) { - dataStreamWriter.option("numRows", numRows.get) - } - if (truncate.isDefined) { - dataStreamWriter.option("truncate", truncate.get) - } - dataStreamWriter - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/KafkaStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/KafkaStructuredStreamProviderService.scala deleted file mode 100644 index fa55cd6d9..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/KafkaStructuredStreamProviderService.scala +++ /dev/null @@ -1,280 +0,0 
@@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.stream.spark.structured.provider - -import java.util -import java.util.Collections - -import com.hurence.logisland.annotation.documentation.CapabilityDescription -import com.hurence.logisland.annotation.lifecycle.OnEnabled -import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} -import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} -import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord} -import com.hurence.logisland.runner.GlobalOptions -import com.hurence.logisland.stream.StreamContext -import com.hurence.logisland.stream.StreamProperties._ -import com.hurence.logisland.util.kafka.KafkaSink -import com.hurence.logisland.util.spark.ControllerServiceLookupSink -import kafka.admin.AdminUtils -import org.apache.kafka.clients.consumer.ConsumerConfig -import org.apache.kafka.clients.producer.ProducerConfig -import org.apache.kafka.common.security.JaasUtils -import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySerializer} -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql.{Dataset, SparkSession} - -/** - * Compatible with kafka 2.4 or higher - */ -@CapabilityDescription("Provide a ways to use kafka as input or output in StructuredStream streams") -class KafkaStructuredStreamProviderService() extends AbstractControllerService with StructuredStreamProviderService { - - // private val logger = LoggerFactory.getLogger(this.getClass) - - - var appName = "" - var kafkaSinkParams: Map[String, Object] = _ - var kafkaParams: Map[String, Object] = _ - // Define the Kafka parameters, broker list must be specified - var inputTopics = Set[String]() - var outputTopics = Set[String]() - var errorTopics = Set[String]() - var metricsTopics = Set[String]() - var topicAutocreate = true - var topicDefaultPartitions = 3 - var topicDefaultReplicationFactor = 1 - var brokerList = "" - var zkQuorum = "" - var kafkaBatchSize = "16384" - var kafkaLingerMs = "5" - var kafkaAcks = "0" - var kafkaOffset = "latest" - var inputSerializerType = "" - var outputSerializerType = "" - - @OnEnabled - @throws[InitializationException] - override def init(context: ControllerServiceInitializationContext): 
Unit = { - super.init(context) - this.synchronized { - try { - - // Define the Kafka parameters, broker list must be specified - inputTopics = context.getPropertyValue(INPUT_TOPICS).asString.split(",").toSet - outputTopics = context.getPropertyValue(OUTPUT_TOPICS).asString.split(",").toSet - errorTopics = context.getPropertyValue(ERROR_TOPICS).asString.split(",").toSet - metricsTopics = DEFAULT_METRICS_TOPIC.getValue.split(",").toSet - - inputSerializerType = context.getPropertyValue(INPUT_SERIALIZER).asString() - outputSerializerType = context.getPropertyValue(OUTPUT_SERIALIZER).asString() - - topicAutocreate = context.getPropertyValue(KAFKA_TOPIC_AUTOCREATE).asBoolean().booleanValue() - topicDefaultPartitions = context.getPropertyValue(KAFKA_TOPIC_DEFAULT_PARTITIONS).asInteger().intValue() - topicDefaultReplicationFactor = context.getPropertyValue(KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR).asInteger().intValue() - brokerList = context.getPropertyValue(KAFKA_METADATA_BROKER_LIST).asString - zkQuorum = context.getPropertyValue(KAFKA_ZOOKEEPER_QUORUM).asString - - - kafkaBatchSize = context.getPropertyValue(KAFKA_BATCH_SIZE).asString - kafkaLingerMs = context.getPropertyValue(KAFKA_LINGER_MS).asString - kafkaAcks = context.getPropertyValue(KAFKA_ACKS).asString - kafkaOffset = context.getPropertyValue(KAFKA_MANUAL_OFFSET_RESET).asString - - - kafkaSinkParams = Map( - ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokerList, - ProducerConfig.CLIENT_ID_CONFIG -> appName, - ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getCanonicalName, - ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, - ProducerConfig.ACKS_CONFIG -> kafkaAcks, - ProducerConfig.RETRIES_CONFIG -> "3", - ProducerConfig.LINGER_MS_CONFIG -> kafkaLingerMs, - ProducerConfig.BATCH_SIZE_CONFIG -> kafkaBatchSize, - ProducerConfig.RETRY_BACKOFF_MS_CONFIG -> "1000", - ProducerConfig.RECONNECT_BACKOFF_MS_CONFIG -> "1000") - - -// // TODO deprecate topic creation here (must be done through the agent) -// if (topicAutocreate) { -// val zkUtils = ZkUtils.apply(zkQuorum, 10000, 10000, JaasUtils.isZkSecurityEnabled) -// createTopicsIfNeeded(zkUtils, inputTopics, topicDefaultPartitions, topicDefaultReplicationFactor) -// createTopicsIfNeeded(zkUtils, outputTopics, topicDefaultPartitions, topicDefaultReplicationFactor) -// createTopicsIfNeeded(zkUtils, errorTopics, topicDefaultPartitions, topicDefaultReplicationFactor) -// createTopicsIfNeeded(zkUtils, metricsTopics, 1, 1) -// } - - - kafkaParams = Map[String, Object]( - ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokerList, - ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer], - ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer], - ConsumerConfig.GROUP_ID_CONFIG -> appName, - ConsumerConfig.RECONNECT_BACKOFF_MS_CONFIG -> "50", - ConsumerConfig.RETRY_BACKOFF_MS_CONFIG -> "100", - ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> kafkaOffset, - ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false", - ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG -> "30000" - /*, - ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG -> "5000"*/ - ) - - } catch { - case e: Exception => - throw new InitializationException(e) - } - } - } - - /** - * create a streaming DataFrame that represents data received - * - * @param spark - * @param streamContext - * @return DataFrame currently loaded - */ - override def read(spark: SparkSession, streamContext: StreamContext) = { - import spark.implicits._ - implicit val 
recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] - - logger.info(s"starting Kafka direct stream on topics $inputTopics from $kafkaOffset offsets") - val df = spark.readStream - .format("kafka") - .option("kafka.bootstrap.servers", brokerList) - .option("subscribe", inputTopics.mkString(",")) - .load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS BINARY)") - // .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, Array[Byte])] - .map(r => { - new StandardRecord(inputTopics.head) - .setField(FieldDictionary.RECORD_KEY, FieldType.STRING, r._1) - .setField(FieldDictionary.RECORD_VALUE, FieldType.BYTES, r._2) - }) - - df - } - - /** - * Allows subclasses to register which property descriptor objects are - * supported. - * - * @return PropertyDescriptor objects this processor currently supports - */ - override def getSupportedPropertyDescriptors() = { - val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] - descriptors.add(ERROR_TOPICS) - descriptors.add(INPUT_TOPICS) - descriptors.add(OUTPUT_TOPICS) - descriptors.add(AVRO_INPUT_SCHEMA) - descriptors.add(AVRO_OUTPUT_SCHEMA) - descriptors.add(INPUT_SERIALIZER) - descriptors.add(OUTPUT_SERIALIZER) - descriptors.add(ERROR_SERIALIZER) - descriptors.add(KAFKA_TOPIC_AUTOCREATE) - descriptors.add(KAFKA_TOPIC_DEFAULT_PARTITIONS) - descriptors.add(KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR) - descriptors.add(KAFKA_METADATA_BROKER_LIST) - descriptors.add(KAFKA_ZOOKEEPER_QUORUM) - descriptors.add(KAFKA_MANUAL_OFFSET_RESET) - descriptors.add(KAFKA_BATCH_SIZE) - descriptors.add(KAFKA_LINGER_MS) - descriptors.add(KAFKA_ACKS) - descriptors.add(WINDOW_DURATION) - descriptors.add(SLIDE_DURATION) - Collections.unmodifiableList(descriptors) - } - -// /** -// * Topic creation -// * -// * @param zkUtils -// * @param topics -// * @param topicDefaultPartitions -// * @param topicDefaultReplicationFactor -// */ -// def createTopicsIfNeeded(zkUtils: ZkUtils, -// topics: Set[String], -// topicDefaultPartitions: Int, -// topicDefaultReplicationFactor: Int): Unit = { -// -// topics.foreach(topic => { -// -// if (!topic.equals(NONE_TOPIC) && !AdminUtils.topicExists(zkUtils, topic)) { -// AdminUtils.createTopic(zkUtils, topic, topicDefaultPartitions, topicDefaultReplicationFactor) -// Thread.sleep(1000) -// logger.info(s"created topic $topic with" + -// s" $topicDefaultPartitions partitions and" + -// s" $topicDefaultReplicationFactor replicas") -// } -// }) -// } - - case class RecordWrapper(record:Record) - - /** - * create a streaming DataFrame that represents data received - * - * @param streamContext - * @return DataFrame currently loaded - */ - override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext) = { - val sender = df.sparkSession.sparkContext.broadcast(KafkaSink(kafkaSinkParams)) - - import df.sparkSession.implicits._ - - var checkpointLocation = "checkpoints" - if (GlobalOptions.checkpointLocation != null) { - checkpointLocation = GlobalOptions.checkpointLocation - } - - // Write key-value data from a DataFrame to a specific Kafka topic specified in an option - df .map(r => { - (r.getField(FieldDictionary.RECORD_KEY).asString(), r.getField(FieldDictionary.RECORD_VALUE).asBytes()) - }) - .as[(String, Array[Byte])] - .toDF("key","value") - .writeStream - .format("kafka") - .option("kafka.bootstrap.servers", brokerList) - .option("topic", outputTopics.mkString(",")) - .option("checkpointLocation", checkpointLocation) 
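// Note that the chained DataStreamWriter built above is the return value of this write(...)
// and is not started here. Elsewhere in this same diff, the console provider's save(...)
// shows the pattern a caller applies to such a writer, roughly:
//
//   write(df, controllerServiceLookupSink, streamContext)
//     .queryName(streamContext.getIdentifier)
//     .start()
//
// (illustrative sketch only; the actual call site for this Kafka provider is the shared
// StructuredStreamProviderService trait, whose save(...) lies outside this hunk)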
- } - - private def getOrElse[T](record: Record, field: String, defaultValue: T): T = { - val value = record.getField(field) - if (value != null && value.isSet) { - return value.getRawValue.asInstanceOf[T] - } - defaultValue - } - - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderService.scala deleted file mode 100644 index feb83d33d..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderService.scala +++ /dev/null @@ -1,167 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.stream.spark.structured.provider - -import java.util -import java.util.Collections - -import com.hurence.logisland.annotation.documentation.CapabilityDescription -import com.hurence.logisland.annotation.lifecycle.OnEnabled -import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} -import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} -import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord} -import com.hurence.logisland.stream.StreamContext -import com.hurence.logisland.util.spark.ControllerServiceLookupSink -import com.hurence.logisland.validator.StandardValidators -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql.streaming.DataStreamWriter -import org.apache.spark.sql.{Dataset, SparkSession} - -/** - * You can look at spark documentation for detail on some options : - * @author bailett - */ -@CapabilityDescription("Provide a way to read a local file as input in StructuredStream streams") -class LocalFileStructuredStreamProviderService extends AbstractControllerService with StructuredStreamProviderService { - - - val LOCAL_INPUT_PATH: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("local.input.path") - .description("the location of the directory of files to be loaded. All files inside the directory will be taked as input") - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)//TODO directory validator - .required(true) - .build - - val MAX_FILES_PER_TRIGGER: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("max.files.per.trigger") - .description(" maximum number of new files to be considered in every trigger (default: no max) ") - .addValidator(StandardValidators.POSITIVE_LONG_VALIDATOR) - .required(false) - .build - - val LATEST_FIRST: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("latest.first") - .description("whether to processs the latest new files first, useful when there is a large backlog of files (default: false)") - .addValidator(StandardValidators.BOOLEAN_VALIDATOR) - .required(false) - .build - - val FILENAME_ONLY: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("filename.only") - .description("whether to check new files based on only the filename instead of on the full path (default: false). 
" + - "With this set to `true`, the following files would be considered as the same file, because their filenames, \"dataset.txt\", " + - "are the same:\n\"file:///dataset.txt\"\n\"s3://a/dataset.txt\"\n\"s3n://a/b/dataset.txt\"\n\"s3a://a/b/c/dataset.txt\"") - .addValidator(StandardValidators.BOOLEAN_VALIDATOR) - .required(false) - .build - - var path: String = _ - var maxFilesPerTrigger: Option[Long] = _ - var latestFirst: Option[Boolean] = _ - var fileNameOnly: Option[Boolean] = _ - - @OnEnabled - @throws[InitializationException] - override def init(context: ControllerServiceInitializationContext): Unit = { - super.init(context) - path = context.getPropertyValue(LOCAL_INPUT_PATH).asString() - if (context.getPropertyValue(MAX_FILES_PER_TRIGGER).isSet) { - maxFilesPerTrigger = Some(context.getPropertyValue(MAX_FILES_PER_TRIGGER).asLong()) - } else { - maxFilesPerTrigger = None - } - if (context.getPropertyValue(LATEST_FIRST).isSet) { - latestFirst = Some(context.getPropertyValue(LATEST_FIRST).asBoolean()) - } else { - latestFirst = None - } - if (context.getPropertyValue(FILENAME_ONLY).isSet) { - fileNameOnly = Some(context.getPropertyValue(FILENAME_ONLY).asBoolean()) - } else { - fileNameOnly = None - } - } - - /** - * Allows subclasses to register which property descriptor objects are - * supported. - * - * @return PropertyDescriptor objects this processor currently supports - */ - override def getSupportedPropertyDescriptors() = { - val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] - descriptors.add(LOCAL_INPUT_PATH) - descriptors.add(MAX_FILES_PER_TRIGGER) - descriptors.add(LATEST_FIRST) - descriptors.add(FILENAME_ONLY) - Collections.unmodifiableList(descriptors) - } - - /** - * create a streaming DataFrame that represents data received - * - * @param spark - * @param streamContext - * @return DataFrame currently loaded - */ - override def read(spark: SparkSession, streamContext: StreamContext) = { - import spark.implicits._ - implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] - - val dataStreamReader = spark.readStream - .format("text") - if (maxFilesPerTrigger.isDefined) { - dataStreamReader.option("maxFilesPerTrigger", maxFilesPerTrigger.get) - } - if (latestFirst.isDefined) { - dataStreamReader.option("latestFirst", latestFirst.get) - } - if (fileNameOnly.isDefined) { - dataStreamReader.option("fileNameOnly", fileNameOnly.get) - } - dataStreamReader.load(path) - .as[String] - .map(r => { - new StandardRecord("line") - .setField(FieldDictionary.RECORD_VALUE, FieldType.STRING, r) - }) - } - - /** - * create a streaming DataFrame that represents data received - * - * @param streamContext - * @return DataFrame currently loaded - */ - override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[_] = { - throw new IllegalArgumentException("LocalFileStructuredStreamProviderService class does not support write operation yet") - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/MQTTStructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/MQTTStructuredStreamProviderService.scala deleted file mode 100644 index 
a0440517f..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/MQTTStructuredStreamProviderService.scala +++ /dev/null @@ -1,174 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.stream.spark.structured.provider - - -import java.sql.Timestamp -import java.util -import java.util.Collections - -import com.hurence.logisland.annotation.documentation.CapabilityDescription -import com.hurence.logisland.annotation.lifecycle.OnEnabled -import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} -import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} -import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord} -import com.hurence.logisland.stream.StreamContext -import com.hurence.logisland.stream.StreamProperties._ -import com.hurence.logisland.util.spark.ControllerServiceLookupSink -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql.streaming.DataStreamWriter -import org.apache.spark.sql.{Dataset, SparkSession} - -@CapabilityDescription("Provide a ways to use Mqtt a input or output in StructuredStream streams") -class MQTTStructuredStreamProviderService extends AbstractControllerService with StructuredStreamProviderService { - - - var brokerUrl = "" - var persistence = "" - var clientId = "" - var QoS = 0 - var username = "" - var password = "" - var cleanSession = true - var connectionTimeout = 5000 - var keepAlive = 30000 - var mqttVersion = "3.1.1" - var topic = "" - - @OnEnabled - @throws[InitializationException] - override def init(context: ControllerServiceInitializationContext): Unit = { - super.init(context) - this.synchronized { - try { - - // Define the MQTT parameters, broker list must be specified - brokerUrl = context.getPropertyValue(MQTT_BROKER_URL).asString - persistence = context.getPropertyValue(MQTT_PERSISTENCE).asString - clientId = context.getPropertyValue(MQTT_CLIENTID).asString - QoS = context.getPropertyValue(MQTT_QOS).asInteger().intValue() - username = context.getPropertyValue(MQTT_USERNAME).asString - password = context.getPropertyValue(MQTT_PASSWORD).asString - cleanSession 
= context.getPropertyValue(MQTT_CLEAN_SESSION).asBoolean().booleanValue() - connectionTimeout = context.getPropertyValue(MQTT_CONNECTION_TIMEOUT).asInteger().intValue() - keepAlive = context.getPropertyValue(MQTT_KEEP_ALIVE).asInteger().intValue() - mqttVersion = context.getPropertyValue(MQTT_VERSION).asString - topic = context.getPropertyValue(MQTT_TOPIC).asString - } catch { - case e: Exception => - throw new InitializationException(e) - } - } - } - - /** - * Allows subclasses to register which property descriptor objects are - * supported. - * - * @return PropertyDescriptor objects this processor currently supports - */ - override def getSupportedPropertyDescriptors() = { - val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] - descriptors.add(MQTT_BROKER_URL) - descriptors.add(MQTT_CLEAN_SESSION) - descriptors.add(MQTT_CLIENTID) - descriptors.add(MQTT_CONNECTION_TIMEOUT) - descriptors.add(MQTT_KEEP_ALIVE) - descriptors.add(MQTT_PASSWORD) - descriptors.add(MQTT_PERSISTENCE) - descriptors.add(MQTT_VERSION) - descriptors.add(MQTT_USERNAME) - descriptors.add(MQTT_QOS) - descriptors.add(MQTT_TOPIC) - Collections.unmodifiableList(descriptors) - } - - /** - * create a streaming DataFrame that represents data received - * - * @param spark - * @param streamContext - * @return DataFrame currently loaded - */ - override def read(spark: SparkSession, streamContext: StreamContext) = { - import spark.implicits._ - implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] - - getLogger.info("connecting to MQTT") - spark.readStream - .format("com.hurence.logisland.util.mqtt.MQTTStreamSourceProvider") - .option("topic", topic) - .option("persistence", persistence) - .option("clientId", clientId) - .option("QoS", QoS) - .option("username", username) - .option("password", password) - .option("cleanSession", cleanSession) - .option("connectionTimeout", connectionTimeout) - .option("keepAlive", keepAlive) - .option("mqttVersion", mqttVersion) - .load(brokerUrl) - .as[(String, Array[Byte], Timestamp)] - .map(r => { - new StandardRecord("kura_metric") - .setTime(r._3) - .setField(FieldDictionary.RECORD_VALUE, FieldType.BYTES, r._2) - .setField(FieldDictionary.RECORD_NAME, FieldType.STRING, r._1) - }) - - } - - - /** - * create a streaming DataFrame that represents data received - * - * @param streamContext - * @return DataFrame currently loaded - */ - override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[_] = { - - - // Create DataFrame representing the stream of input lines from connection to mqtt server - df.writeStream - .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider") - .option("topic", topic) - .option("persistence", persistence) - .option("clientId", clientId) - .option("QoS", QoS) - .option("username", username) - .option("password", password) - .option("cleanSession", cleanSession) - .option("connectionTimeout", connectionTimeout) - .option("keepAlive", keepAlive) - .option("mqttVersion", mqttVersion) - - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/RateStructuredStreamProviderService.scala 
b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/RateStructuredStreamProviderService.scala deleted file mode 100644 index ebb36107c..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/RateStructuredStreamProviderService.scala +++ /dev/null @@ -1,202 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.stream.spark.structured.provider - -import java.io.{File, FileReader} -import java.util -import java.util.Collections - -import com.hurence.logisland.annotation.documentation.CapabilityDescription -import com.hurence.logisland.annotation.lifecycle.OnEnabled -import com.hurence.logisland.component.{InitializationException, PropertyDescriptor} -import com.hurence.logisland.controller.{AbstractControllerService, ControllerServiceInitializationContext} -import com.hurence.logisland.record.{FieldDictionary, FieldType, Record, StandardRecord} -import com.hurence.logisland.stream.StreamContext -import com.hurence.logisland.util.spark.ControllerServiceLookupSink -import com.hurence.logisland.validator.StandardValidators -import org.apache.commons.csv.CSVFormat -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql.streaming.DataStreamWriter -import org.apache.spark.sql.{Dataset, SparkSession} - -import scala.collection.JavaConversions._ - -/** - * - * @author bailett - */ - -@CapabilityDescription("Generates data at the specified number of rows per second, each output row contains a timestamp and value. " + - "Where timestamp is a Timestamp type containing the time of message dispatch, and value is of Long type containing the message count, " + - "starting from 0 as the first row. This source is intended for testing and benchmarking. Used in StructuredStream streams.") -class RateStructuredStreamProviderService extends AbstractControllerService with StructuredStreamProviderService { - - - val LOCAL_FILE_INPUT_PATH: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("local.file.input.path") - .description("the location of the file to be loaded") - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .required(true) - .build - - val HAS_CSV_HEADER: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("has.csv.header") - .description("Is this a csv file with the first line as a header") - .addValidator(StandardValidators.BOOLEAN_VALIDATOR) - .required(false) - .defaultValue("true") - .build - - val CSV_DELIMITER: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("csv.delimiter") - .description("the delimiter") - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .required(false) - .defaultValue(",") - .build - - val LOCAL_FILE_OUTPUT_PATH: PropertyDescriptor = new PropertyDescriptor.Builder() - .name("local.file.output.path") - .description("the location of the file to be writen") - .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .required(false) - .build - - - var recordSeq:Seq[Record] = _ - - @OnEnabled - @throws[InitializationException] - override def init(context: ControllerServiceInitializationContext): Unit = { - super.init(context) - this.synchronized { - try { - - val delimiter = context.getPropertyValue(CSV_DELIMITER).asString() - val path = context.getPropertyValue(LOCAL_FILE_INPUT_PATH).asString() - val f = new File(path) - - if (f.exists && !f.isDirectory) { - val in = new FileReader(path) - val csv = CSVFormat.DEFAULT.withDelimiter(delimiter.charAt(0)).withFirstRecordAsHeader - val records = csv.withHeader().withSkipHeaderRecord(false).parse(in) - recordSeq = records.map(record => { - val logislandRecord:Record = new StandardRecord() - .setField(FieldDictionary.RECORD_VALUE, FieldType.STRING, record.toString) - for (columnName <- record.toMap) { - logislandRecord.setField(columnName._1, FieldType.STRING, columnName._2) - } - logislandRecord - }).toSeq - }else{ - val resourcePath = 
classOf[RateStructuredStreamProviderService].getResource(path).getPath - val in = new FileReader(resourcePath) - val csv = CSVFormat.DEFAULT.withDelimiter(delimiter.charAt(0)).withFirstRecordAsHeader - val records = csv.withHeader().withSkipHeaderRecord(false).parse(in).getRecords - - recordSeq = records.map(record => { - val logislandRecord:Record = new StandardRecord() - .setField(FieldDictionary.RECORD_VALUE, FieldType.STRING, record.toString) - for (columnName <- record.toMap) { - logislandRecord.setField(columnName._1, FieldType.STRING, columnName._2) - } - logislandRecord - }).toSeq - } - - - - } catch { - case e: Exception => - throw new InitializationException(e) - } - } - } - - /** - * Allows subclasses to register which property descriptor objects are - * supported. - * - * @return PropertyDescriptor objects this processor currently supports - */ - override def getSupportedPropertyDescriptors() = { - val descriptors: util.List[PropertyDescriptor] = new util.ArrayList[PropertyDescriptor] - - descriptors.add(LOCAL_FILE_INPUT_PATH) - descriptors.add(LOCAL_FILE_OUTPUT_PATH) - descriptors.add(HAS_CSV_HEADER) - descriptors.add(CSV_DELIMITER) - Collections.unmodifiableList(descriptors) - } - - /** - * create a streaming DataFrame that represents data received - * - * @param spark - * @param streamContext - * @return DataFrame currently loaded - */ - override def read(spark: SparkSession, streamContext: StreamContext) = { - import spark.implicits._ - implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] - - // val headers = records.iterator.next.toMap.keySet - - - - recordSeq.toDS() - } - - /** - * create a streaming DataFrame that represents data received - * - * @param streamContext - * @return DataFrame currently loaded - */ - override def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[_] = { - throw new IllegalArgumentException("RateStructuredStreamProviderService class does not support write operation"); - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala deleted file mode 100644 index 47721a9ac..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/stream/spark/structured/provider/StructuredStreamProviderService.scala +++ /dev/null @@ -1,398 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.stream.spark.structured.provider - -import java.io.{ByteArrayInputStream, ByteArrayOutputStream} -import java.util -import java.util.Date - -import com.hurence.logisland.controller.ControllerService -import com.hurence.logisland.record._ -import com.hurence.logisland.runner.GlobalOptions -import com.hurence.logisland.serializer.{JsonSerializer, NoopSerializer, RecordSerializer, SerializerProvider} -import com.hurence.logisland.stream.StreamContext -import com.hurence.logisland.stream.StreamProperties._ -import com.hurence.logisland.util.spark.{ControllerServiceLookupSink, ProcessorMetrics} -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.groupon.metrics.UserMetricsSystem -import org.apache.spark.sql.streaming._ -import org.apache.spark.sql.{Dataset, SparkSession} -import org.slf4j.LoggerFactory - -import scala.collection.JavaConversions._ -import scala.collection.JavaConverters._ - - -trait StructuredStreamProviderService extends ControllerService { - - val logger = LoggerFactory.getLogger(this.getClass) - - - /** - * create a streaming DataFrame that represents data received - * - * @param spark - * @param streamContext - * @return DataFrame currently loaded - */ - protected def read(spark: SparkSession, streamContext: StreamContext): Dataset[Record] - - /** - * create a streaming DataFrame that represents data received - * - * @param streamContext - * @return DataFrame currently loaded - */ - protected def write(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): DataStreamWriter[_] - - /** - * - * - * @param spark - * @param streamContext - * @return - */ - def load(spark: SparkSession, controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): Dataset[Record] = { - - import spark.implicits._ - implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] - - val df = read(spark, streamContext) - - /** - * create serializers - */ - val serializer = SerializerProvider.getSerializer( - streamContext.getPropertyValue(READ_TOPICS_SERIALIZER).asString, - streamContext.getPropertyValue(AVRO_INPUT_SCHEMA).asString) - - val keySerializer = SerializerProvider.getSerializer( - streamContext.getPropertyValue(READ_TOPICS_KEY_SERIALIZER).asString, - null) - - - // convert to logisland records - - val processingRecords: Dataset[Record] = df.flatMap(r => { - serializer match { - case sr: NoopSerializer => Some(r) - case _ => deserializeRecords(serializer, keySerializer, r) - } - }) - - - if (streamContext.getPropertyValue(GROUPBY).isSet) { - - val keys = streamContext.getPropertyValue(GROUPBY).asString() - val stateTimeoutDuration = streamContext.getPropertyValue(STATE_TIMEOUT_MS).asLong() - val chunkSize = streamContext.getPropertyValue(CHUNK_SIZE).asInteger() - - processingRecords - .filter(_.hasField(keys)) 
- .groupByKey(_.getField(keys).asString()) - .flatMapGroupsWithState(outputMode = OutputMode.Append, timeoutConf = GroupStateTimeout.ProcessingTimeTimeout())( - mappingFunction(controllerServiceLookupSink, streamContext, chunkSize, stateTimeoutDuration) - ) - - } else { - processingRecords.mapPartitions(iterator => { - executePipeline(controllerServiceLookupSink, streamContext, iterator) - }) - } - - - } - - val ALL_RECORDS = "all_records" - val CHUNK_CREATION_TS = "chunk_creation_ts" - - def mappingFunction(controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], - streamContext: StreamContext, - chunkSize: Int, - timeOutDuration: Long) - (key: String, - value: Iterator[Record], - state: GroupState[Record]): Iterator[Record] = { - - - val currentTimestamp = new Date().getTime - val inputRecords = value.toList - val allRecords = if (state.exists) state.get.getField(ALL_RECORDS).getRawValue.asInstanceOf[List[Record]] ++ inputRecords else inputRecords - val recordChunks = allRecords.grouped(chunkSize).toList - - - if (state.hasTimedOut || (state.exists && (currentTimestamp - state.get.getField(CHUNK_CREATION_TS).asLong()) >= timeOutDuration)) { - state.remove() - // logger.debug("TIMEOUT key " + key + ", flushing " + allRecords.size + " records in " + recordChunks.size + "chunks") - recordChunks - .flatMap(subset => executePipeline(controllerServiceLookupSink, streamContext, subset.iterator)) - .iterator - } - else if (recordChunks.last.size == chunkSize) { - state.remove() - //logger.debug("REMOVE key " + key + ", flushing " + allRecords.size + " records in " + recordChunks.size + "chunks") - recordChunks - .flatMap(subset => executePipeline(controllerServiceLookupSink, streamContext, subset.iterator)) - .iterator - } - else if (!state.exists) { - - val newChunk = new StandardRecord("chunk_record") //Chunk(key, recordChunks.last) - newChunk.setObjectField(ALL_RECORDS, recordChunks.last) - newChunk.setStringField(FieldDictionary.RECORD_KEY, key) - newChunk.setLongField(CHUNK_CREATION_TS, new Date().getTime ) - // logger.debug("CREATE key " + key + " new chunk with " + allRecords.size + " records") - - state.update(newChunk) - state.setTimeoutDuration(timeOutDuration) - - recordChunks - .slice(0, recordChunks.length - 1) - .flatMap(subset => executePipeline(controllerServiceLookupSink, streamContext, subset.iterator)) - .iterator - } - - - else { - val currentChunk = state.get - if (recordChunks.size == 1) { - currentChunk.setObjectField(ALL_RECORDS, allRecords) - state.update(currentChunk) - // logger.debug("UPDATE key " + key + ", allRecords " + allRecords.size + ", recordChunks " + recordChunks.size) - Iterator.empty - }else{ - currentChunk.setObjectField(ALL_RECORDS, recordChunks.last) - //logger.debug("UPDATE key " + key + ", allRecords " + allRecords.size + ", recordChunks " + recordChunks.size) - - state.update(currentChunk) - state.setTimeoutDuration(timeOutDuration) - - recordChunks - .slice(0, recordChunks.length - 1) - .flatMap(subset => executePipeline(controllerServiceLookupSink, streamContext, subset.iterator)) - .iterator - } - - } - - - } - - private def executePipeline(controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext, iterator: Iterator[Record]) - - = { - val controllerServiceLookup = controllerServiceLookupSink.value.getControllerServiceLookup() - - - // convert to logisland records - var processingRecords: util.Collection[Record] = iterator.toList - - val pipelineMetricPrefix = streamContext.getIdentifier + "." 
- // loop over processor chain - streamContext.getProcessContexts.foreach(processorContext => { - val startTime = System.currentTimeMillis() - val processor = processorContext.getProcessor - - val processorTimerContext = UserMetricsSystem.timer(pipelineMetricPrefix + - processorContext.getIdentifier + ".processing_time_ms").time() - - // injects controller service lookup into processor context - if (processor.hasControllerService) { - processorContext.setControllerServiceLookup(controllerServiceLookup) - } - - // processor setup (don't forget that) - if(!processor.isInitialized) - processor.init(processorContext) - - // do the actual processing - processingRecords = processor.process(processorContext, processingRecords) - - // compute metrics - ProcessorMetrics.computeMetrics( - pipelineMetricPrefix + processorContext.getIdentifier + ".", - processingRecords, - processingRecords, - 0, - processingRecords.size, - System.currentTimeMillis() - startTime) - - processorTimerContext.stop() - }) - - - processingRecords.asScala.iterator - } - - /** - * create a streaming DataFrame that represents data received - * - * @param streamContext - * @return DataFrame currently loaded - */ - def save(df: Dataset[Record], controllerServiceLookupSink: Broadcast[ControllerServiceLookupSink], streamContext: StreamContext): StreamingQuery = { - - - implicit val recordEncoder = org.apache.spark.sql.Encoders.kryo[Record] - - // make sure controller service lookup won't be serialized !! - streamContext.setControllerServiceLookup(null) - - // create serializer - val serializer = SerializerProvider.getSerializer( - streamContext.getPropertyValue(WRITE_TOPICS_SERIALIZER).asString, - streamContext.getPropertyValue(AVRO_OUTPUT_SCHEMA).asString) - - // create serializer - val keySerializer = SerializerProvider.getSerializer( - streamContext.getPropertyValue(WRITE_TOPICS_KEY_SERIALIZER).asString, null) - - // do the parallel processing - val df2 = df.mapPartitions(record => record.map(record => serializeRecords(serializer, keySerializer, record))) - - var checkpointLocation = "checkpoints/" + streamContext.getIdentifier - if (GlobalOptions.checkpointLocation != null) { - checkpointLocation = GlobalOptions.checkpointLocation - logger.info(s"Saving structured stream using checkpointLocation: $checkpointLocation") - } - - write(df2, controllerServiceLookupSink, streamContext) - .queryName(streamContext.getIdentifier) - // .outputMode("update") - .option("checkpointLocation", checkpointLocation) - .start() - // .processAllAvailable() - - } - - - protected def serializeRecords(valueSerializer: RecordSerializer, keySerializer: RecordSerializer, record: Record) - - = { - - try { - val ret = valueSerializer match { - case s: JsonSerializer => - new StandardRecord() - .setField(FieldDictionary.RECORD_VALUE, FieldType.STRING, doSerializeAsString(valueSerializer, record)) - case _ => - new StandardRecord() - .setField(FieldDictionary.RECORD_VALUE, FieldType.BYTES, doSerialize(valueSerializer, record)) - } - val fieldKey = record.getField(FieldDictionary.RECORD_KEY); - if (fieldKey != null) { - ret.setField(FieldDictionary.RECORD_KEY, FieldType.BYTES, doSerialize(keySerializer, new StandardRecord().setField(fieldKey))) - } else { - ret.setField(FieldDictionary.RECORD_KEY, FieldType.NULL, null) - - } - ret - - } catch { - case t: Throwable => - logger.error(s"exception while serializing events ${ - t.getMessage - }") - null - } - - - } - - private def doSerializeAsString(serializer: RecordSerializer, record: Record): String - - = { 
- val baos: ByteArrayOutputStream = new ByteArrayOutputStream - serializer.serialize(baos, record) - val bytes = baos.toByteArray - baos.close() - new String(bytes) - - - } - - private def doSerialize(serializer: RecordSerializer, record: Record): Array[Byte] - - = { - val baos: ByteArrayOutputStream = new ByteArrayOutputStream - serializer.serialize(baos, record) - val bytes = baos.toByteArray - baos.close() - bytes - - - } - - private def doDeserialize(serializer: RecordSerializer, field: Field): Record - - = { - val f = field.getRawValue - val s = if (f.isInstanceOf[String]) f.asInstanceOf[String].getBytes else f; - val bais = new ByteArrayInputStream(s.asInstanceOf[Array[Byte]]) - try { - serializer.deserialize(bais) - } finally { - bais.close() - } - } - - protected def deserializeRecords(serializer: RecordSerializer, keySerializer: RecordSerializer, r: Record) - - = { - try { - val deserialized = doDeserialize(serializer, r.getField(FieldDictionary.RECORD_VALUE)) - // copy root record field - if (r.hasField(FieldDictionary.RECORD_NAME)) - deserialized.setField(r.getField(FieldDictionary.RECORD_NAME)) - - if (r.hasField(FieldDictionary.RECORD_KEY) && r.getField(FieldDictionary.RECORD_KEY).getRawValue != null) { - val deserializedKey = doDeserialize(keySerializer, r.getField(FieldDictionary.RECORD_KEY)) - if (deserializedKey.hasField(FieldDictionary.RECORD_VALUE) && deserializedKey.getField(FieldDictionary.RECORD_VALUE).getRawValue != null) { - val f = deserializedKey.getField(FieldDictionary.RECORD_VALUE) - deserialized.setField(FieldDictionary.RECORD_KEY, f.getType, f.getRawValue) - } else { - logger.warn("Unable to serialize key for record $r with serializer $keySerializer") - } - } - - Some(deserialized) - - } catch { - case t: Throwable => - logger.error(s"exception while deserializing events ${ - t.getMessage - }") - None - } - } - - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaReporter.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaReporter.scala deleted file mode 100644 index accb2f1c6..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaReporter.scala +++ /dev/null @@ -1,224 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.util.kafka - -import java.util.Properties -import java.util.concurrent.TimeUnit - -import com.codahale.metrics._ -import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} -import org.json4s.JsonAST.JObject -import org.json4s.JsonDSL._ -import org.json4s.jackson.JsonMethods._ -import org.slf4j.{Logger, LoggerFactory} - -import scala.collection.JavaConverters._ -import scala.language.existentials -import scala.util.{Failure, Success, Try} - -class KafkaReporter( - registry: MetricRegistry, - kafkaEndpoint: String, - kafkaTopic: String, - properties: Properties) - extends ScheduledReporter( - registry, - "kafka-reporter", - MetricFilter.ALL, - TimeUnit.SECONDS, - TimeUnit.MILLISECONDS) { - - val logger: Logger = LoggerFactory.getLogger(this.getClass) - - var producer: Option[KafkaProducer[String, String]] = None - - // Any user properties set in the metrics config file - // prodconf_foo=this.setting.key=value - // prodconf_bar=this.setting.key2=value2 - private def setUserProperties(props: Properties) { - for { - entry <- properties.entrySet().asScala - if (entry.getKey().asInstanceOf[String].startsWith("prodconf_")) - } { - val kv = entry.getValue().asInstanceOf[String].split('=') - if (kv.length != 2) { - logger.error(s"Ignoring bad prodconf_* setting: ${entry.getValue()}") - } else { - props.put(kv(0), kv(1)) - } - } - } - - override def start(period: Long, unit: TimeUnit): Unit = { - super.start(period, unit) - val status = for { - kp <- Try { - logger.info(s"Opening Kafka endpoint $kafkaEndpoint") - val props = new Properties() - - // Set these, but may be overridden in setUserProperties - props.put("client.id", (s"KafkaReporter-$kafkaEndpoint-$kafkaTopic").replace(':', '-')) - - // load any KafkaProducer conf settings passed in from metrics config - setUserProperties(props) - - // Anything here takes precedence over user settings - props.put("bootstrap.servers", kafkaEndpoint) - props.put("key.serializer", - "org.apache.kafka.common.serialization.StringSerializer") - props.put("value.serializer", - "org.apache.kafka.common.serialization.StringSerializer") - - logger.info(s"Kafka producer properties:\n$props") - - new KafkaProducer[String, String](props) - } - } yield { - kp - } - status match { - case Success(kp) => { - logger.info(s"Kafka producer connected to $kafkaEndpoint") - producer = Some(kp) - } - case Failure(err) => { - logger.error(s"Failure opening Kafka endpoint $kafkaEndpoint:\n$err") - } - } - } - - override def stop(): Unit = { - logger.info(s"Stopping Kafka reporter at $kafkaEndpoint") - super.stop() - } - - def report( - gauges: java.util.SortedMap[String, Gauge[_]], - counters: java.util.SortedMap[String, Counter], - histograms: java.util.SortedMap[String, Histogram], - meters: java.util.SortedMap[String, Meter], - timers: java.util.SortedMap[String, Timer]): Unit = { - - if (producer.isEmpty) { - logger.error(s"Failed Kafka client for $kafkaEndpoint: metric output ignored") - } else { - // dump metric output to the kafka topic - val prod = producer.get - for {entry <- gauges.entrySet().asScala} { - gaugeJSON(entry.getValue()).foreach { jv => prod.send(metricRec(entry.getKey(), jv)) } - } - for {entry <- counters.entrySet().asScala} { - counterJSON(entry.getValue()).foreach { jv => prod.send(metricRec(entry.getKey(), jv)) } - } - for {entry <- histograms.entrySet().asScala} { - histJSON(entry.getValue()).foreach { jv => prod.send(metricRec(entry.getKey(), jv)) } - } - for {entry <- meters.entrySet().asScala} { - 
meterJSON(entry.getValue()).foreach { jv => prod.send(metricRec(entry.getKey(), jv)) } - } - for {entry <- timers.entrySet().asScala} { - timerJSON(entry.getValue()).foreach { jv => prod.send(metricRec(entry.getKey(), jv)) } - } - } - } - - private def metricRec(key: String, value: String) = - new ProducerRecord[String, String](kafkaTopic, key, value) - - private def gaugeJSON(gauge: Gauge[_]): Option[String] = { - val tpe = ("type" -> "gauge") - gauge.getValue() match { - case v: Int => Some(compact(render(tpe ~ ("value" -> v)))) - case v: Long => Some(compact(render(tpe ~ ("value" -> v)))) - case v: Float => Some(compact(render(tpe ~ ("value" -> v)))) - case v: Double => Some(compact(render(tpe ~ ("value" -> v)))) - case v => { - logger.error(s"Ignoring unexpected Gauge value: $v") - None - } - } - } - - private def counterJSON(counter: Counter): Option[String] = { - val tpe = ("type" -> "counter") - Some(compact(render(tpe ~ ("value" -> counter.getCount())))) - } - - private def histJSON(hist: Histogram): Option[String] = { - for { - hsub <- samplingAST(hist, "histquantiles") - nsub <- Some(("n" -> hist.getCount())) - } yield { - compact(render(("type" -> "histogram") ~ ("value" -> (nsub ~ hsub)))) - } - } - - private def meterJSON(meter: Meter): Option[String] = { - for { - - msub <- meteredAST(meter) - nsub <- Some(("n" -> meter.getCount())) - } yield { - compact(render(("type" -> "meter") ~ ("value" -> (nsub ~ msub)))) - } - } - - private def timerJSON(timer: Timer): Option[String] = { - for { - hsub <- samplingAST(timer, "timerquantiles") - msub <- meteredAST(timer) - nsub <- Some(("n" -> timer.getCount())) - } yield { - compact(render(("type" -> "timer") ~ ("value" -> (nsub ~ hsub ~ msub)))) - } - } - - private def samplingAST(hist: Sampling, qsetting: String): Option[JObject] = { - val snapshot = hist.getSnapshot() - Try { - val hqs = Option(properties.getProperty(qsetting)).getOrElse( - "0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0") - val q = hqs.split(",").map(_.toDouble).toVector - val x = q.map { z => snapshot.getValue(z) } - (q, x) - } match { - case Failure(_) => { - val hqs = properties.getProperty(qsetting) - logger.error(s"Bad quantile setting: $hqs\nIgnoring histogram metric output") - None - } - case Success((q, x)) => { - val hsub = - ("q" -> q) ~ - ("x" -> x) ~ - ("min" -> snapshot.getMin()) ~ - ("max" -> snapshot.getMax()) ~ - ("mean" -> snapshot.getMean()) ~ - ("stdv" -> snapshot.getStdDev()) - Some(hsub) - } - } - } - - private def meteredAST(meter: Metered): Option[JObject] = { - val msub = - ("rate1" -> meter.getOneMinuteRate()) ~ - ("rate5" -> meter.getFiveMinuteRate()) ~ - ("rate15" -> meter.getFifteenMinuteRate()) ~ - ("rateMean" -> meter.getMeanRate()) - Some(msub) - } -} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaSink.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaSink.scala deleted file mode 100644 index fa293413a..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/kafka/KafkaSink.scala +++ /dev/null @@ -1,76 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); 
- * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.util.kafka - -import java.io.ByteArrayOutputStream - -import com.hurence.logisland.record.{FieldDictionary, Record} -import com.hurence.logisland.serializer.RecordSerializer -import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} - -import scala.collection.JavaConversions._ - -class KafkaSink(createProducer: () => KafkaProducer[Array[Byte], Array[Byte]]) extends Serializable { - - lazy val producer = createProducer() - - - def send(topic: String, key: Array[Byte], value: Array[Byte]): Unit = - producer.send(new ProducerRecord(topic, value)) - - /** - * Send events to Kafka topics - * - * @param events - */ - def produce(topic: String, events: List[Record], serializer:RecordSerializer) = { - - // do nothing if topic name is 'none' - if (!topic.equals("none")) { - val messages = events.map(event => { - // messages are serialized with kryo first - val baos: ByteArrayOutputStream = new ByteArrayOutputStream - serializer.serialize(baos, event) - - // and then converted to KeyedMessage - val key = if (event.hasField(FieldDictionary.RECORD_ID)) - event.getField(FieldDictionary.RECORD_ID).asString() - else - "" - val message = new ProducerRecord(topic, key.getBytes(), baos.toByteArray) - baos.close() - - - producer.send(message) - }) - } - } -} - -object KafkaSink { - def apply(config: Map[String, Object]): KafkaSink = { - val f = () => { - val producer = new KafkaProducer[Array[Byte], Array[Byte]](config) - - /* sys.addShutdownHook { - producer.close() - } -*/ - producer - } - new KafkaSink(f) - } -} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MQTTStreamSource.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MQTTStreamSource.scala deleted file mode 100644 index f5c390822..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MQTTStreamSource.scala +++ /dev/null @@ -1,240 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.util.mqtt - -import java.nio.charset.Charset -import java.sql.Timestamp -import java.text.SimpleDateFormat -import java.util.Calendar -import java.util.concurrent.CountDownLatch - -import org.apache.bahir.utils.Logging -import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Source} -import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} -import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, SQLContext} -import org.eclipse.paho.client.mqttv3._ -import org.eclipse.paho.client.mqttv3.persist.{MemoryPersistence, MqttDefaultFilePersistence} - -import scala.collection.concurrent.TrieMap -import scala.collection.mutable.ArrayBuffer -import scala.util.{Failure, Success, Try} - - -object MQTTStreamConstants { - - val DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") - - val SCHEMA_DEFAULT = StructType(StructField("topic", StringType) - :: StructField("payload", BinaryType) - :: StructField("timestamp", TimestampType) :: Nil) -} - -/** - * A Text based mqtt stream source, it interprets the payload of each incoming message by converting - * the bytes to String using Charset.defaultCharset as charset. Each value is associated with a - * timestamp of arrival of the message on the source. It can be used to operate a window on the - * incoming stream. - * - * @param brokerUrl url MqttClient connects to. - * @param persistence an instance of MqttClientPersistence. By default it is used for storing - * incoming messages on disk. If memory is provided as option, then recovery on - * restart is not supported. - * @param topic topic MqttClient subscribes to. - * @param clientId clientId, this client is assoicated with. Provide the same value to recover - * a stopped client. - * @param messageParser parsing logic for processing incoming messages from Mqtt Server. - * @param sqlContext Spark provided, SqlContext. - * @param mqttConnectOptions an instance of MqttConnectOptions for this Source. - * @param qos the maximum quality of service to subscribe each topic at.Messages published at - * a lower quality of service will be received at the published QoS. Messages - * published at a higher quality of service will be received using the QoS specified - * on the subscribe. - */ -class MQTTTextStreamSource(brokerUrl: String, persistence: MqttClientPersistence, - topic: String, clientId: String, messageParser: (String, Array[Byte]) => (String, Array[Byte], Timestamp), - sqlContext: SQLContext, mqttConnectOptions: MqttConnectOptions, qos: Int) - extends Source with Logging { - - override def schema: StructType = MQTTStreamConstants.SCHEMA_DEFAULT - - private val store = new LocalMessageStore(persistence, sqlContext.sparkContext.getConf) - - private val messages = new TrieMap[Int, (String, Array[Byte], Timestamp)] - - private val initLock = new CountDownLatch(1) - - private var offset = 0 - - private var client: MqttClient = _ - - private def fetchLastProcessedOffset(): Int = { - Try(store.maxProcessedOffset) match { - case Success(x) => - log.info(s"Recovering from last stored offset $x") - x - case Failure(e) => 0 - } - } - - initialize() - private def initialize(): Unit = { - - client = new MqttClient(brokerUrl, clientId, persistence) - - val callback = new MqttCallbackExtended() { - - override def messageArrived(topic_ : String, message: MqttMessage): Unit = synchronized { - initLock.await() // Wait for initialization to complete. 
- val temp = offset + 1 - messages.put(temp, messageParser(topic_, message.getPayload)) - offset = temp - log.trace(s"Message arrived, $topic_ $message") - } - - override def deliveryComplete(token: IMqttDeliveryToken): Unit = { - } - - override def connectionLost(cause: Throwable): Unit = { - log.warn("Connection to mqtt server lost.", cause) - } - - override def connectComplete(reconnect: Boolean, serverURI: String): Unit = { - log.info(s"Connect complete $serverURI. Is it a reconnect?: $reconnect") - } - } - client.setCallback(callback) - client.connect(mqttConnectOptions) - client.subscribe(topic, qos) - // It is not possible to initialize offset without `client.connect` - offset = fetchLastProcessedOffset() - initLock.countDown() // Release. - } - - /** Stop this source and free any resources it has allocated. */ - override def stop(): Unit = { - client.disconnect() - persistence.close() - client.close() - } - - /** Returns the maximum available offset for this source. */ - override def getOffset: Option[Offset] = { - if (offset == 0) { - None - } else { - Some(LongOffset(offset)) - } - } - - /** - * Returns the data that is between the offsets (`start`, `end`]. When `start` is `None` then - * the batch should begin with the first available record. This method must always return the - * same data for a particular `start` and `end` pair. - */ - override def getBatch(start: Option[Offset], end: Offset): DataFrame = synchronized { - val startIndex = start.getOrElse(LongOffset(0L)).asInstanceOf[LongOffset].offset.toInt - val endIndex = end.asInstanceOf[LongOffset].offset.toInt - val data: ArrayBuffer[(String, Array[Byte], Timestamp)] = ArrayBuffer.empty - // Move consumed messages to persistent store. - (startIndex + 1 to endIndex).foreach { id => - val element: (String, Array[Byte], Timestamp) = messages.getOrElse(id, store.retrieve(id)) - data += element - store.store(id, element) - messages.remove(id, element) - } - log.trace(s"Get Batch invoked, ${data.mkString}") - import sqlContext.implicits._ - data.toDF("topic", "payload", "timestamp") - } - -} - -class MQTTStreamSourceProvider extends StreamSourceProvider with DataSourceRegister with Logging { - - override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType], - providerName: String, parameters: Map[String, String]): (String, StructType) = { - ("mqtt", MQTTStreamConstants.SCHEMA_DEFAULT) - } - - override def createSource(sqlContext: SQLContext, metadataPath: String, - schema: Option[StructType], providerName: String, parameters: Map[String, String]): Source = { - - def e(s: String) = new IllegalArgumentException(s) - - val brokerUrl: String = parameters.getOrElse("brokerUrl", parameters.getOrElse("path", - throw e("Please provide a `brokerUrl` by specifying path or .options(\"brokerUrl\",...)"))) - - - val persistence: MqttClientPersistence = parameters.get("persistence") match { - case Some("memory") => new MemoryPersistence() - case _ => val localStorage: Option[String] = parameters.get("localStorage") - localStorage match { - case Some(x) => new MqttDefaultFilePersistence(x) - case None => new MqttDefaultFilePersistence() - } - } - - val messageParserWithTimeStamp = (x: Array[Byte]) => - (new String(x, Charset.defaultCharset()), Timestamp.valueOf( - MQTTStreamConstants.DATE_FORMAT.format(Calendar.getInstance().getTime))) - - - val messageNOPParser = (x: Array[Byte]) => (x,Timestamp.valueOf( - MQTTStreamConstants.DATE_FORMAT.format(Calendar.getInstance().getTime))) - - val messageKVParser = (topic:String, x: 
Array[Byte]) => (topic, x,Timestamp.valueOf( - MQTTStreamConstants.DATE_FORMAT.format(Calendar.getInstance().getTime))) - - // if default is subscribe everything, it leads to getting lot unwanted system messages. - val topic: String = parameters.getOrElse("topic", - throw e("Please specify a topic, by .options(\"topic\",...)")) - - val clientId: String = parameters.getOrElse("clientId", { - log.warn("If `clientId` is not set, a random value is picked up." + - "\nRecovering from failure is not supported in such a case.") - MqttClient.generateClientId()}) - - val username: Option[String] = parameters.get("username") - val password: Option[String] = parameters.get("password") - val connectionTimeout: Int = parameters.getOrElse("connectionTimeout", - MqttConnectOptions.CONNECTION_TIMEOUT_DEFAULT.toString).toInt - val keepAlive: Int = parameters.getOrElse("keepAlive", MqttConnectOptions - .KEEP_ALIVE_INTERVAL_DEFAULT.toString).toInt - val mqttVersion: Int = parameters.getOrElse("mqttVersion", MqttConnectOptions - .MQTT_VERSION_DEFAULT.toString).toInt - val cleanSession: Boolean = parameters.getOrElse("cleanSession", "false").toBoolean - val qos: Int = parameters.getOrElse("QoS", "1").toInt - - val mqttConnectOptions: MqttConnectOptions = new MqttConnectOptions() - mqttConnectOptions.setAutomaticReconnect(true) - mqttConnectOptions.setCleanSession(cleanSession) - mqttConnectOptions.setConnectionTimeout(connectionTimeout) - mqttConnectOptions.setKeepAliveInterval(keepAlive) - mqttConnectOptions.setMqttVersion(mqttVersion) - (username, password) match { - case (Some(u: String), Some(p: String)) => - mqttConnectOptions.setUserName(u) - mqttConnectOptions.setPassword(p.toCharArray) - case _ => - } - - new MQTTTextStreamSource(brokerUrl, persistence, topic, clientId, - messageKVParser, sqlContext, mqttConnectOptions, qos) - } - - override def shortName(): String = "mqtt" -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MessageStore.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MessageStore.scala deleted file mode 100644 index 6228e2901..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/mqtt/MessageStore.scala +++ /dev/null @@ -1,109 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.hurence.logisland.util.mqtt - -import java.nio.ByteBuffer -import java.util - -import org.apache.bahir.utils.Logging -import org.apache.spark.SparkConf -import org.apache.spark.serializer.{JavaSerializer, Serializer, SerializerInstance} -import org.eclipse.paho.client.mqttv3.{MqttClientPersistence, MqttPersistable, MqttPersistenceException} - -import scala.reflect.ClassTag - - -/** A message store for MQTT stream source for SQL Streaming. */ -trait MessageStore { - - /** Store a single id and corresponding serialized message */ - def store[T: ClassTag](id: Int, message: T): Boolean - - /** Retrieve messages corresponding to certain offset range */ - def retrieve[T: ClassTag](start: Int, end: Int): Seq[T] - - /** Retrieve message corresponding to a given id. */ - def retrieve[T: ClassTag](id: Int): T - - /** Highest offset we have stored */ - def maxProcessedOffset: Int - -} - -private[mqtt] class MqttPersistableData(bytes: Array[Byte]) extends MqttPersistable { - - override def getHeaderLength: Int = bytes.length - - override def getHeaderOffset: Int = 0 - - override def getPayloadOffset: Int = 0 - - override def getPayloadBytes: Array[Byte] = null - - override def getHeaderBytes: Array[Byte] = bytes - - override def getPayloadLength: Int = 0 -} - -/** - * A message store to persist messages received. This is not intended to be thread safe. - * It uses `MqttDefaultFilePersistence` for storing messages on disk locally on the client. - */ -private[mqtt] class LocalMessageStore(val persistentStore: MqttClientPersistence, - val serializer: Serializer) extends MessageStore with Logging { - - val classLoader = Thread.currentThread.getContextClassLoader - - def this(persistentStore: MqttClientPersistence, conf: SparkConf) = - this(persistentStore, new JavaSerializer(conf)) - - val serializerInstance: SerializerInstance = serializer.newInstance() - - private def get(id: Int) = { - persistentStore.get(id.toString).getHeaderBytes - } - - import scala.collection.JavaConverters._ - - def maxProcessedOffset: Int = { - val keys: util.Enumeration[_] = persistentStore.keys() - keys.asScala.map(x => x.toString.toInt).max - } - - /** Store a single id and corresponding serialized message */ - override def store[T: ClassTag](id: Int, message: T): Boolean = { - val bytes: Array[Byte] = serializerInstance.serialize(message).array() - try { - persistentStore.put(id.toString, new MqttPersistableData(bytes)) - true - } catch { - case e: MqttPersistenceException => log.warn(s"Failed to store message Id: $id", e) - false - } - } - - /** Retrieve messages corresponding to certain offset range */ - override def retrieve[T: ClassTag](start: Int, end: Int): Seq[T] = { - (start until end).map(x => retrieve(x)) - } - - /** Retrieve message corresponding to a given id. 
*/ - override def retrieve[T: ClassTag](id: Int): T = { - serializerInstance.deserialize(ByteBuffer.wrap(get(id)), classLoader) - } - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/ControllerServiceLookupSink.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/ControllerServiceLookupSink.scala deleted file mode 100644 index 702900c60..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/ControllerServiceLookupSink.scala +++ /dev/null @@ -1,52 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.util.spark - -import java.util -import java.util.Objects._ - -import com.hurence.logisland.component.PropertyDescriptor -import com.hurence.logisland.config.ControllerServiceConfiguration -import com.hurence.logisland.controller._ - - -class ControllerServiceLookupSink(createControllerServiceLookup: () => ControllerServiceLookup) extends Serializable { - - lazy val controllerServiceLookup = createControllerServiceLookup() - - - def getControllerServiceLookup(): ControllerServiceLookup = controllerServiceLookup - - def getControllerService(serviceIdentifier: String): ControllerService = - controllerServiceLookup.getControllerService(serviceIdentifier) - - def addControllerService(serviceIdentifier: String, controllerService: ControllerService, properties: Map[PropertyDescriptor, String]) { - requireNonNull(controllerService) - } - - -} - -object ControllerServiceLookupSink { - def apply(configs: util.Collection[ControllerServiceConfiguration]): ControllerServiceLookupSink = { - val f = () => { - new StandardControllerServiceLookup(configs) - - - } - new ControllerServiceLookupSink(f) - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/SparkUtils.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/SparkUtils.scala deleted file mode 100644 index 412ec6262..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/com/hurence/logisland/util/spark/SparkUtils.scala +++ /dev/null @@ -1,273 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (bailet.thomas@gmail.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - Copyright 2016 Hurence - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - */ - -package com.hurence.logisland.util.spark - -import java.text.SimpleDateFormat -import java.util -import java.util.Date - -import com.hurence.logisland.record._ -import com.typesafe.scalalogging.LazyLogging -import com.typesafe.scalalogging.slf4j.LazyLogging -import org.apache.avro.Schema -import org.apache.avro.Schema.Type -import org.apache.log4j.{Level, Logger} -import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.apache.spark.{SparkConf, SparkContext} - -/** - * Created by tom on 11/06/15. 
- */ - -object SparkUtils extends LazyLogging { - - - def initContext(appName: String, - blockInterval: String = "", - maxRatePerPartition: String = "", - master: String = ""): SparkContext = { - - // job configuration - val conf = new SparkConf() - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - if (maxRatePerPartition.nonEmpty) { - conf.set("spark.streaming.kafka.maxRatePerPartition", maxRatePerPartition) - } - if (blockInterval.nonEmpty) { - conf.set("spark.streaming.blockInterval", blockInterval) - } - conf.set("spark.streaming.backpressure.enabled", "true") - conf.set("spark.streaming.unpersist", "false") - conf.set("spark.ui.port", "4050") - conf.setAppName(appName) - - if (master.nonEmpty) { - conf.setMaster(master) - } - - val sc = new SparkContext(conf) - - logger.info(s"spark context initialized with master:$master, appName:$appName, " + - s"blockInterval:$blockInterval, maxRatePerPartition:$maxRatePerPartition") - - sc - } - - - /** - * Get a file and a schema and convert this to a dataframe - * - * @param schema - * @param filePath - * @param tableName - */ - def registerDataFrame( - schema: String, - filePath: String, - tableName: String, - sc: SparkContext, - sqlContext: SQLContext, - separator: String = "\u0001"): DataFrame = { - // Generate the schema based on the string of schema - val parsedSchema = StructType(schema.split(" ").map(fieldName => StructField(fieldName, StringType, true))) - - // Convert records of the RDD (people) to Rows. - val schemaLength = schema.split(" ").length - val rawRDD = sc.textFile(filePath) - .map(_.split(separator)) - .filter(_.length == schemaLength) - .map(tokens => Row.fromSeq(tokens)) - - // Apply the schema to the RDD. - val dataFrame = sqlContext.createDataFrame(rawRDD, parsedSchema) - - // Register the DataFrames as a table. 
- dataFrame.createOrReplaceTempView(tableName) - dataFrame - } - - - def registerUdfs(sqlContext: SQLContext) = { - - - sqlContext.udf.register("timestamp", (date: String) => { - try { - val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S") - - sdf.parse(date).getTime - } catch { - case e: Exception => 0 - } - }) - - } - - /** - * convert a Record to a SQL Row - * - * @param record the Record to convert - * @return the Spar SQL row - */ - def convertToRow(record: Record, schema: StructType): Row = { - - Row.fromSeq(schema.map(structField => { - val fieldName = structField.name - - if (record.hasField(fieldName)) { - structField.dataType match { - case DataTypes.StringType => - if (record.getField(fieldName).getType == FieldType.ARRAY) - record.getField(fieldName).getRawValue.asInstanceOf[util.ArrayList[String]].toArray.mkString - else - record.getField(fieldName).asString() - case DataTypes.IntegerType => record.getField(fieldName).asInteger() - case DataTypes.LongType => record.getField(fieldName).asLong() - case DataTypes.FloatType => record.getField(fieldName).asFloat() - case DataTypes.DoubleType => record.getField(fieldName).asDouble() - case _ => record.getField(fieldName).asString() - } - } else { - null - } - - - })) - } - - /** - * convert a SQL Row to a Record to - * - * @param row the Row to convert - * @return the Record - */ - def convertToRecord(row: Row, inRecordType: String = "logisland_record"): Record = { - - var recordType = inRecordType - var recordTime = new Date().getTime - val fields = row.schema.map(structField => { - val fieldName = structField.name - - structField.dataType match { - case DataTypes.StringType => - if (fieldName == FieldDictionary.RECORD_TYPE) { - recordType = row.getAs[String](fieldName) - } - new Field(fieldName, FieldType.STRING, row.getAs[String](fieldName)) - case DataTypes.IntegerType => new Field(fieldName, FieldType.INT, row.getAs[Int](fieldName)) - case DataTypes.LongType => - if (fieldName == FieldDictionary.RECORD_TIME) { - recordTime = row.getAs[Long](fieldName) - } - new Field(fieldName, FieldType.LONG, row.getAs[Long](fieldName)) - case DataTypes.FloatType => new Field(fieldName, FieldType.FLOAT, row.getAs[Float](fieldName)) - case DataTypes.DoubleType => new Field(fieldName, FieldType.DOUBLE, row.getAs[Double](fieldName)) - case _ => new Field(fieldName, FieldType.STRING, row.getAs[String](fieldName)) - } - - }) - - // construct new Record with type and time from the row - val outputRecord = new StandardRecord() - .setType(recordType) - .setTime(new Date(recordTime)) - fields.foreach(field => outputRecord.setField(field)) - outputRecord - } - - /** - * create a dataframe schema from a Record - * - * @param record the Record to infer schema - * @return th schema - */ - def convertFieldsNameToSchema(record: Record): StructType = { - StructType( - record.getAllFieldsSorted.toArray(Array[Field]()).map(f => { - f.getType match { - case FieldType.INT => StructField(f.getName, DataTypes.IntegerType, nullable = true) - case FieldType.LONG => StructField(f.getName, DataTypes.LongType, nullable = true) - case FieldType.FLOAT => StructField(f.getName, DataTypes.FloatType, nullable = true) - case FieldType.DOUBLE => StructField(f.getName, DataTypes.DoubleType, nullable = true) - case FieldType.STRING => StructField(f.getName, DataTypes.StringType, nullable = true) - case _ => StructField(f.getName, DataTypes.StringType, nullable = true) - } - }) - ) - } - - /** - * create a dataframe schema from an Avro one - * - * @param avroSchema the Avro 
Schema - * @return th schema - */ - def convertAvroSchemaToDataframeSchema(avroSchema: Schema): StructType = { - val types = avroSchema.getFields.toArray(Array[Schema.Field]()) - .map(s => { - (s.name(), - s.schema() - .getTypes - .toArray(Array[Schema]()) - .filter(t => t.getType != Type.NULL) - .toList - .head) - }) - - StructType(types.map(f => { - f._2.getType match { - case Type.INT => StructField(f._1, DataTypes.IntegerType, nullable = true) - case Type.LONG => StructField(f._1, DataTypes.LongType, nullable = true) - case Type.FLOAT => StructField(f._1, DataTypes.FloatType, nullable = true) - case Type.DOUBLE => StructField(f._1, DataTypes.DoubleType, nullable = true) - case Type.STRING => StructField(f._1, DataTypes.StringType, nullable = true) - case _ => StructField(f._1, DataTypes.StringType, nullable = true) - } - }) - ) - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/org/apache/spark/metrics/sink/KafkaSink.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/org/apache/spark/metrics/sink/KafkaSink.scala deleted file mode 100644 index ea0adbd78..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/main/scala/org/apache/spark/metrics/sink/KafkaSink.scala +++ /dev/null @@ -1,91 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.metrics.sink - -import java.util.{Locale, Properties} -import java.util.concurrent.TimeUnit - -import org.slf4j.Logger -import org.slf4j.LoggerFactory -import com.codahale.metrics.MetricRegistry -import com.hurence.logisland.util.kafka.KafkaReporter -import org.apache.spark.SecurityManager - -/** - * A Kafka metric sink for Apache Spark - - -Configure your spark metrics.properties file - -Edit /path/to/spark/conf/metrics.properites to look like this: - -master.source.jvm.class=org.apache.spark.metrics.source.JvmSource -worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource -driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource -executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource - - *.sink.kafka.class=org.apache.spark.metrics.sink.KafkaSink - *.sink.kafka.broker=127.0.0.1:9092 - *.sink.kafka.topic=test - *.sink.kafka.period=10 - *.sink.kafka.unit=seconds - -# histquantiles and timerquantiles have following defaults: -#*.sink.kafka.histquantiles=0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0 -#*.sink.kafka.timerquantiles=0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0 - -# These carry configure settings to the KafkaProducer -# *.sink.kafka.prodconf_xxx, where xxx can be anything, just has to -# be unique per setting: - *.sink.kafka.prodconf_a=retries=0 - *.sink.kafka.prodconf_b=acks=all - *.sink.kafka.prodconf_c=request.timeout.ms=5 - *.sink.kafka.prodconf_d=max.block.ms=5 - - - */ -class KafkaSink(val properties: Properties, val registry: MetricRegistry, - securityMgr: SecurityManager) extends org.apache.spark.metrics.sink.Sink { - - val logger: Logger = LoggerFactory.getLogger(this.getClass) - - private def popt(prop: String): Option[String] = - Option(properties.getProperty(prop)) - - // These are non-negotiable - val broker = popt("broker").get - val topic = popt("topic").get - - lazy val reporter = new KafkaReporter(registry, broker, topic, properties) - - def start(): Unit = { - logger.info(s"Starting Kafka metric reporter at $broker, topic $topic") - val period = popt("period").getOrElse("10").toLong - val tstr = popt("unit").getOrElse("seconds").toUpperCase(Locale.ROOT) - val tunit = TimeUnit.valueOf(tstr) - reporter.start(period, tunit) - } - - def stop(): Unit = { - logger.info(s"Stopping Kafka metric reporter at $broker, topic $topic") - reporter.stop() - } - - def report(): Unit = { - logger.info(s"Reporting metrics to Kafka reporter at $broker, topic $topic") - reporter.report() - } -} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/KafkaConnectTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/KafkaConnectTest.java deleted file mode 100644 index 2ad335080..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/KafkaConnectTest.java +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.connect; - -import com.hurence.logisland.component.ComponentFactory; -import com.hurence.logisland.config.ConfigReader; -import com.hurence.logisland.config.LogislandConfiguration; -import com.hurence.logisland.engine.EngineContext; -import com.hurence.logisland.util.runner.TestRunner; -import org.junit.Assert; -import org.junit.Ignore; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Optional; -import java.util.Scanner; - - -/** - * End to end test. - */ -public class KafkaConnectTest { - private static Logger logger = LoggerFactory.getLogger(KafkaConnectTest.class); - - private static final String JOB_CONF_FILE = "/conf/kafka-connect-stream.yml"; - - @Test - @Ignore - public void remoteTest() { - - - logger.info("starting StreamProcessingRunner"); - - Optional engineInstance = Optional.empty(); - try { - - String configFile = KafkaConnectTest.class.getResource(JOB_CONF_FILE).getPath(); - - // load the YAML config - LogislandConfiguration sessionConf = ConfigReader.loadConfig(configFile); - - // instantiate engine and all the processor from the config - engineInstance = ComponentFactory.getEngineContext(sessionConf.getEngine()); - assert engineInstance.isPresent(); - assert engineInstance.get().isValid(); - - logger.info("starting Logisland session version {}", sessionConf.getVersion()); - logger.info(sessionConf.getDocumentation()); - } catch (Exception e) { - logger.error("unable to launch runner : {}", e); - } - - try { - // start the engine - EngineContext engineContext = engineInstance.get(); - engineInstance.get().getEngine().start(engineContext); - new Scanner(System.in).nextLine(); - } catch (Exception e) { - Assert.fail("something went bad while running the job : " + e); - - } - - - - - - - } - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/converter/LogIslandRecordConverterTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/converter/LogIslandRecordConverterTest.java deleted file mode 100644 index 1618799b1..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/converter/LogIslandRecordConverterTest.java +++ /dev/null @@ -1,129 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.connect.converter; - -import com.hurence.logisland.record.Field; -import com.hurence.logisland.record.FieldDictionary; -import com.hurence.logisland.record.Record; -import com.hurence.logisland.serializer.BytesArraySerializer; -import com.hurence.logisland.serializer.KryoSerializer; -import com.hurence.logisland.serializer.RecordSerializer; -import com.hurence.logisland.serializer.SerializerProvider; -import org.apache.kafka.connect.data.Schema; -import org.apache.kafka.connect.data.SchemaBuilder; -import org.apache.kafka.connect.data.Struct; -import org.junit.Test; - -import java.io.ByteArrayInputStream; -import java.util.*; - -import static org.junit.Assert.*; - -public class LogIslandRecordConverterTest { - - private LogIslandRecordConverter setupInstance(Class serializerClass, boolean isKey) { - final LogIslandRecordConverter instance = new LogIslandRecordConverter(); - instance.configure( - Collections.singletonMap(LogIslandRecordConverter.PROPERTY_RECORD_SERIALIZER, serializerClass.getCanonicalName()), - isKey); - return instance; - } - - private void assertFieldEquals(Record record, String fieldName, Object expected) { - Field field = record.getField(fieldName); - if (expected == null) { - assertNull(field); - } else { - assertNotNull(field); - assertEquals(expected, field.getRawValue()); - } - } - - private void assertFieldEquals(Record record, String fieldName, byte[] expected) { - Field field = record.getField(fieldName); - if (expected == null) { - assertNull(field); - } else { - assertNotNull(field); - assertArrayEquals(expected, (byte[]) field.getRawValue()); - } - } - - - @Test - public void testBytesSchema() { - byte[] data = new byte[16]; - new Random().nextBytes(data); - RecordSerializer serializer = new BytesArraySerializer(); - LogIslandRecordConverter instance = setupInstance(serializer.getClass(), false); - byte[] serialized = instance.fromConnectData("", Schema.BYTES_SCHEMA, data); - Record record = serializer.deserialize(new ByteArrayInputStream(serialized)); - assertNotNull(record); - assertFieldEquals(record, FieldDictionary.RECORD_VALUE, data); - } - - @Test - public void testComplexSchema() { - //our schema - - final Schema complexSchema = SchemaBuilder - .struct() - .field("f1", SchemaBuilder.bool()) - .field("f2", SchemaBuilder.string()) - .field("f3", SchemaBuilder.int8()) - .field("f4", SchemaBuilder.int16()) - .field("f5", SchemaBuilder.string().optional()) - .field("f6", SchemaBuilder.float32()) - .field("arr", SchemaBuilder.array(SchemaBuilder.int32())) - .field("map", SchemaBuilder.map(SchemaBuilder.string(), SchemaBuilder.string())) - .field("struct", SchemaBuilder.struct() - .field("child", SchemaBuilder.string()).build()) - .build(); - - //setup converters - LogIslandRecordConverter instance = setupInstance(KryoSerializer.class, false); - RecordSerializer serializer = SerializerProvider.getSerializer(KryoSerializer.class.getName(), null); - Struct complex = new Struct(complexSchema) - .put("f1", true) - .put("f2", "test") - .put("f3", (byte) 0) - .put("f4", (short) 1) - .put("f5", null) - .put("f6", 3.1415f) - .put("arr", new ArrayList<>(Arrays.asList(0, 1, 2))) - .put("map", new HashMap<>(Collections.singletonMap("key", "value"))) - .put("struct", - new Struct(complexSchema.field("struct").schema()) - .put("child", "child")); - - Record record = serializer.deserialize(new 
ByteArrayInputStream(instance.fromConnectData(null, complexSchema, complex))); - System.out.println(record); - //assertions - assertNotNull(record); - Record extracted = record.getField(FieldDictionary.RECORD_VALUE).asRecord(); - assertNotNull(extracted); - assertFieldEquals(extracted, "f1", true); - assertFieldEquals(extracted, "f2", "test"); - assertFieldEquals(extracted, "f3", (byte) 0); - assertFieldEquals(extracted, "f4", (short) 1); - assertFieldEquals(extracted, "f5", null); - assertFieldEquals(extracted, "f6", (float) 3.1415); - assertFieldEquals(extracted, "arr", new ArrayList<>(Arrays.asList(0, 1, 2))); - assertFieldEquals(extracted, "map", new HashMap<>(Collections.singletonMap("key", "value"))); - //assertFieldEquals(((Map)extracted.getField("struct").getRawValue()).get("child"), "child", "child"); - - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/FakeConnector.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/FakeConnector.java deleted file mode 100644 index 0e15152f9..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/FakeConnector.java +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.connect.fake; - -import org.apache.commons.lang3.RandomStringUtils; -import org.apache.kafka.common.config.ConfigDef; -import org.apache.kafka.connect.connector.Task; -import org.apache.kafka.connect.data.Schema; -import org.apache.kafka.connect.data.SchemaBuilder; -import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.source.SourceConnector; -import org.apache.kafka.connect.source.SourceRecord; -import org.apache.kafka.connect.source.SourceTask; - -import java.util.*; -import java.util.concurrent.SynchronousQueue; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -public class FakeConnector extends SourceConnector { - - - public static class FakeTask extends SourceTask { - - private SynchronousQueue queue = new SynchronousQueue<>(); - private final Timer timer = new Timer(); - - - @Override - public void start(Map props) { - - - } - - @Override - public List poll() throws InterruptedException { - Random random = new Random(); - - return IntStream.range(0, 1000).mapToObj(i -> { - int p = random.nextInt(10); - Schema schema = SchemaBuilder.struct() - .field("partition", SchemaBuilder.int32()) - .field("val", SchemaBuilder.string()) - .build(); - return new SourceRecord( - Collections.singletonMap("partition", p), - Collections.singletonMap("offset", System.currentTimeMillis()), - "", - null, - schema, - new Struct(schema) - .put("partition", p) - .put("val", RandomStringUtils.randomAscii(30))); - } - ).collect(Collectors.toList()); - } - - - @Override - public void stop() { - timer.cancel(); - } - - @Override - public String version() { - return "1.0"; - } - - } - - @Override - public String version() { - return "1.0"; - } - - @Override - public void start(Map props) { - } - - @Override - public Class taskClass() { - return FakeTask.class; - } - - @Override - public List> taskConfigs(int maxTasks) { - List> ret = new ArrayList<>(); - for (int i = 0; i < maxTasks; i++) { - ret.add(Collections.emptyMap()); - } - return ret; - } - - @Override - public void stop() { - - } - - @Override - public ConfigDef config() { - return new ConfigDef(); - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSink.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSink.java deleted file mode 100644 index c0d557489..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSink.java +++ /dev/null @@ -1,57 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.connect.fake; - -import org.apache.kafka.common.config.ConfigDef; -import org.apache.kafka.connect.connector.Task; -import org.apache.kafka.connect.sink.SinkConnector; - -import java.util.Collections; -import java.util.List; -import java.util.Map; - -public class TestSink extends SinkConnector { - - @Override - public String version() { - return null; - } - - @Override - public void start(Map props) { - - } - - @Override - public Class taskClass() { - return TestSinkTask.class; - } - - @Override - public List> taskConfigs(int maxTasks) { - return Collections.singletonList(Collections.emptyMap()); - } - - @Override - public void stop() { - - } - - @Override - public ConfigDef config() { - return new ConfigDef(); - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSinkTask.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSinkTask.java deleted file mode 100644 index e874dc1f8..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/connect/fake/TestSinkTask.java +++ /dev/null @@ -1,55 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.connect.fake; - -import org.apache.kafka.clients.consumer.OffsetAndMetadata; -import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.connect.sink.SinkRecord; -import org.apache.kafka.connect.sink.SinkTask; - -import java.util.Collection; -import java.util.Map; - -public class TestSinkTask extends SinkTask { - - @Override - public void start(Map props) { - System.out.println("Task started"); - } - - @Override - public void put(Collection records) { - - System.out.println("Adding " + records.size() + " records"); - records.stream().findFirst().ifPresent(System.out::println); - } - - @Override - public void flush(Map offsets) { - System.out.println("Flushed offset: " +offsets); - } - - @Override - public void stop() { - System.out.println("Task stopped"); - - } - - @Override - public String version() { - return ""; - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/AbstractStreamProcessingIntegrationTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/AbstractStreamProcessingIntegrationTest.java deleted file mode 100644 index fb01c96bf..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/AbstractStreamProcessingIntegrationTest.java +++ /dev/null @@ -1,246 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.engine; - -import com.hurence.logisland.record.Record; -import com.hurence.logisland.serializer.KryoSerializer; -import com.hurence.logisland.stream.StreamProperties; -import kafka.admin.RackAwareMode; -import kafka.server.KafkaConfig; -import kafka.server.KafkaServer; -import kafka.utils.*; -import kafka.zk.AdminZkClient; -import kafka.zk.EmbeddedZookeeper; -import kafka.zk.KafkaZkClient; -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.clients.producer.KafkaProducer; -import org.apache.kafka.clients.producer.ProducerRecord; -import org.apache.kafka.common.utils.Time; -import org.junit.After; -import org.junit.Before; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.net.InetAddress; -import java.net.ServerSocket; -import java.nio.file.Files; -import java.util.*; - -import static org.junit.Assert.assertTrue; - -/** - * Abstract class for integration testing - */ -public abstract class AbstractStreamProcessingIntegrationTest { - - - protected static final String ZKHOST = "127.0.0.1"; - protected static final String BROKERHOST = "127.0.0.1"; - protected static final int BROKERPORT = choosePorts(2)[0]; - protected static final String INPUT_TOPIC = "mock_in"; - protected static final String OUTPUT_TOPIC = "mock_out"; - - private static Logger logger = LoggerFactory.getLogger(AbstractStreamProcessingIntegrationTest.class); - - private static KafkaProducer producer; - private static KafkaConsumer consumer; - private static ProcessingEngine engine; - private static EngineContext engineContext; - protected EmbeddedZookeeper zkServer; - private KafkaServer kafkaServer; - private KafkaZkClient kafkaZkClient; - private AdminZkClient adminZkClient; - - /** - * Choose a number of random available ports - */ - public static int[] choosePorts(int count) { - try { - ServerSocket[] sockets = new ServerSocket[count]; - int[] ports = new int[count]; - for (int i = 0; i < count; i++) { - sockets[i] = new ServerSocket(0, 0, InetAddress.getByName("0.0.0.0")); - ports[i] = sockets[i].getLocalPort(); - } - for (int i = 0; i < count; i++) - sockets[i].close(); - return ports; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - @Before - public void setUp() throws InterruptedException, IOException { - - // setup Zookeeper - zkServer = new EmbeddedZookeeper(); - String zkConnect = ZKHOST + ":" + zkServer.port(); - - // setup Broker - Properties brokerProps = new Properties(); - brokerProps.setProperty("zookeeper.connect", zkConnect); - brokerProps.setProperty("broker.id", "0"); - brokerProps.setProperty("log.dirs", Files.createTempDirectory("kafka-").toAbsolutePath().toString()); - brokerProps.setProperty("listeners", "PLAINTEXT://" + BROKERHOST + ":" + BROKERPORT); - KafkaConfig config = new KafkaConfig(brokerProps); - Time mock = new MockTime(); - kafkaServer = TestUtils.createServer(config, mock); - kafkaZkClient = kafkaServer.zkClient(); - adminZkClient = new AdminZkClient(kafkaZkClient); - - // create topics - if (!kafkaZkClient.topicExists(StreamProperties.DEFAULT_ERRORS_TOPIC().getValue())) - adminZkClient.createTopic( - StreamProperties.DEFAULT_ERRORS_TOPIC().getValue(), - 1, - 1, - new Properties(), - RackAwareMode.Disabled$.MODULE$); - if 
(!kafkaZkClient.topicExists(StreamProperties.DEFAULT_RECORDS_TOPIC().getValue())) - adminZkClient.createTopic(StreamProperties.DEFAULT_RECORDS_TOPIC().getValue(), 1, 1, new Properties(), RackAwareMode.Disabled$.MODULE$); - if (!kafkaZkClient.topicExists(StreamProperties.DEFAULT_RAW_TOPIC().getValue())) - adminZkClient.createTopic(StreamProperties.DEFAULT_RAW_TOPIC().getValue(), 1, 1, new Properties(), RackAwareMode.Disabled$.MODULE$); - if (!kafkaZkClient.topicExists(StreamProperties.DEFAULT_METRICS_TOPIC().getValue())) - adminZkClient.createTopic(StreamProperties.DEFAULT_METRICS_TOPIC().getValue(), 1, 1, new Properties(), RackAwareMode.Disabled$.MODULE$); - - - // deleting zookeeper information to make sure the consumer starts from the beginning - adminZkClient.deleteTopic("/consumers/group0"); - -/* - File checkpointDir = new File("checkpoints"); - if (checkpointDir.isDirectory()) - FileUtils.forceDelete(checkpointDir); -*/ - - Optional instance = getEngineContext(); - assertTrue(instance.isPresent()); - assertTrue(instance.get().isValid()); - engine = instance.get().getEngine(); - engineContext = instance.get(); - - - - System.setProperty("hadoop.home.dir", "/"); - - Runnable testRunnable = () -> { - engine.start(engineContext); - }; - - Thread t = new Thread(testRunnable); - logger.info("starting engine thread {}", t.getId()); - t.start(); - - } - - @After - public void tearDown() throws NoSuchFieldException, IllegalAccessException, InterruptedException { - - engine.shutdown(engineContext); - Thread.sleep(2000); - - if (kafkaServer != null) { - kafkaServer.shutdown(); - // Remove any persistent data - CoreUtils.delete(kafkaServer.config().logDirs()); - } - - if (kafkaZkClient.topicExists(StreamProperties.DEFAULT_ERRORS_TOPIC().getValue())) - adminZkClient.deleteTopic(StreamProperties.DEFAULT_ERRORS_TOPIC().getValue()); - if (kafkaZkClient.topicExists(StreamProperties.DEFAULT_RECORDS_TOPIC().getValue())) - adminZkClient.deleteTopic(StreamProperties.DEFAULT_RECORDS_TOPIC().getValue()); - if (kafkaZkClient.topicExists(StreamProperties.DEFAULT_RAW_TOPIC().getValue())) - adminZkClient.deleteTopic(StreamProperties.DEFAULT_RAW_TOPIC().getValue()); - if (kafkaZkClient.topicExists(StreamProperties.DEFAULT_METRICS_TOPIC().getValue())) - adminZkClient.deleteTopic(StreamProperties.DEFAULT_METRICS_TOPIC().getValue()); - - if (zkServer != null) { - zkServer.shutdown(); - } - } - - - abstract Optional getEngineContext(); - - - protected static void sendRecord(String topic, Record record) throws IOException { - - // setup producer - Properties producerProps = new Properties(); - producerProps.setProperty("bootstrap.servers", BROKERHOST + ":" + BROKERPORT); - producerProps.setProperty("key.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); - producerProps.setProperty("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); - producer = new KafkaProducer(producerProps); - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - final KryoSerializer kryoSerializer = new KryoSerializer(true); - kryoSerializer.serialize(baos, record); - ProducerRecord data = new ProducerRecord<>(topic, null, baos.toByteArray()); - producer.send(data); - baos.close(); - - logger.info("sent record : " + record + " to topic " + topic); - producer.close(); - } - - protected static List readRecords(String topic) { - - - // setup consumer - Properties consumerProps = new Properties(); - consumerProps.setProperty("bootstrap.servers", BROKERHOST + ":" + BROKERPORT); - 
consumerProps.setProperty("group.id", "group0"); - consumerProps.setProperty("client.id", "consumer0"); - consumerProps.setProperty("key.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer"); - consumerProps.setProperty("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer"); - consumerProps.put("auto.offset.reset", "earliest"); // to make sure the consumer starts from the beginning of the topic - consumer = new KafkaConsumer<>(consumerProps); - consumer.subscribe(Arrays.asList(topic)); - - - List outputRecords = new ArrayList<>(); - - // starting consumer - ConsumerRecords records = consumer.poll(1000); - - // verify the integrity of the retrieved event - for (ConsumerRecord record : records) { - final KryoSerializer deserializer = new KryoSerializer(true); - - ByteArrayInputStream bais = new ByteArrayInputStream(record.value()); - Record deserializedRecord = deserializer.deserialize(bais); - logger.info(deserializedRecord.toString()); - outputRecords.add(deserializedRecord); - try { - bais.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - consumer.close(); - - return outputRecords; - } - - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/ProgrammaticStreamProcessingIntegrationTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/ProgrammaticStreamProcessingIntegrationTest.java deleted file mode 100644 index fa1a74fcb..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/ProgrammaticStreamProcessingIntegrationTest.java +++ /dev/null @@ -1,170 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.engine; - -import com.hurence.logisland.component.ComponentFactory; -import com.hurence.logisland.component.ComponentType; -import com.hurence.logisland.config.EngineConfiguration; -import com.hurence.logisland.config.ProcessorConfiguration; -import com.hurence.logisland.config.StreamConfiguration; -import com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine; -import com.hurence.logisland.stream.StreamProperties; -import com.hurence.logisland.util.runner.MockProcessor; -import com.hurence.logisland.record.FieldType; -import com.hurence.logisland.record.Record; -import com.hurence.logisland.record.StandardRecord; -import com.hurence.logisland.stream.spark.AbstractKafkaRecordStream; -import com.hurence.logisland.stream.spark.KafkaRecordStreamParallelProcessing; -import org.junit.Ignore; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.*; - -import static org.junit.Assert.assertTrue; - -/** - * Empty Java class for source jar generation (need to publish on OSS sonatype) - */ -public class ProgrammaticStreamProcessingIntegrationTest extends AbstractStreamProcessingIntegrationTest { - - - public static final String MAGIC_STRING = "the world is so big"; - - - private static Logger logger = LoggerFactory.getLogger(ProgrammaticStreamProcessingIntegrationTest.class); - - - Optional getEngineContext() { - Map properties = new HashMap<>(); - properties.put(KafkaStreamProcessingEngine.SPARK_APP_NAME().getName(), "testApp"); - properties.put(KafkaStreamProcessingEngine.SPARK_STREAMING_BATCH_DURATION().getName(), "500"); - properties.put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[4]"); - properties.put(KafkaStreamProcessingEngine.SPARK_STREAMING_TIMEOUT().getName(), "12000"); - - - EngineConfiguration conf = new EngineConfiguration(); - conf.setComponent(KafkaStreamProcessingEngine.class.getName()); - conf.setType(ComponentType.ENGINE.toString()); - conf.setConfiguration(properties); - conf.addPipelineConfigurations(createStreamConfig()); - - return ComponentFactory.getEngineContext(conf); - } - - - private StreamConfiguration createStreamConfig() { - Map properties = new HashMap<>(); - properties.put(StreamProperties.KAFKA_METADATA_BROKER_LIST().getName(), BROKERHOST + ":" + BROKERPORT); - properties.put(StreamProperties.KAFKA_ZOOKEEPER_QUORUM().getName(), ZKHOST + ":" + zkServer.port()); - properties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR().getName(), "1"); - properties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_PARTITIONS().getName(), "1"); - properties.put(StreamProperties.INPUT_SERIALIZER().getName(), StreamProperties.KRYO_SERIALIZER().getValue()); - properties.put(StreamProperties.OUTPUT_SERIALIZER().getName(), StreamProperties.KRYO_SERIALIZER().getValue()); - properties.put(StreamProperties.KAFKA_MANUAL_OFFSET_RESET().getName(), StreamProperties.LATEST_OFFSET().getValue()); - - properties.put(StreamProperties.INPUT_TOPICS().getName(), INPUT_TOPIC); - properties.put(StreamProperties.OUTPUT_TOPICS().getName(), OUTPUT_TOPIC); - - StreamConfiguration conf = new StreamConfiguration(); - conf.setComponent(KafkaRecordStreamParallelProcessing.class.getName()); - conf.setType(ComponentType.STREAM.toString()); - conf.setConfiguration(properties); - conf.setStream("KafkaStream"); - conf.addProcessorConfiguration(createProcessorConfiguration()); - - return conf; - } - - private ProcessorConfiguration createProcessorConfiguration() { - Map 
properties = new HashMap<>(); - properties.put(MockProcessor.FAKE_MESSAGE.getName(), MAGIC_STRING); - - ProcessorConfiguration conf = new ProcessorConfiguration(); - conf.setComponent(MockProcessor.class.getName()); - conf.setType(ComponentType.PROCESSOR.toString()); - conf.setConfiguration(properties); - conf.setProcessor("mock"); - - return conf; - } - - - @Test - @Ignore - public void validateIntegration() throws NoSuchFieldException, IllegalAccessException, InterruptedException, IOException { - - final List records = new ArrayList<>(); - - Runnable testRunnable = () -> { - - - // send message - Record record = new StandardRecord("cisco"); - record.setId("firewall_record1"); - record.setField("method", FieldType.STRING, "GET"); - record.setField("ip_source", FieldType.STRING, "123.34.45.123"); - record.setField("ip_target", FieldType.STRING, "255.255.255.255"); - record.setField("url_scheme", FieldType.STRING, "http"); - record.setField("url_host", FieldType.STRING, "origin-www.20minutes.fr"); - record.setField("url_port", FieldType.STRING, "80"); - record.setField("url_path", FieldType.STRING, "/r15lgc-100KB.js"); - record.setField("request_size", FieldType.INT, 1399); - record.setField("response_size", FieldType.INT, 452); - record.setField("is_outside_office_hours", FieldType.BOOLEAN, false); - record.setField("is_host_blacklisted", FieldType.BOOLEAN, false); - record.setField("tags", FieldType.ARRAY, new ArrayList<>(Arrays.asList("spam", "filter", "mail"))); - - - try { - Thread.sleep(8000); - } catch (InterruptedException e) { - e.printStackTrace(); - } - try { - sendRecord(INPUT_TOPIC, record); - } catch (IOException e) { - e.printStackTrace(); - } - - - try { - Thread.sleep(2000); - } catch (InterruptedException e) { - e.printStackTrace(); - } - records.addAll(readRecords(OUTPUT_TOPIC)); - }; - - Thread t = new Thread(testRunnable); - logger.info("starting validation thread {}", t.getId()); - t.start(); - - - try{ - Thread.sleep(15000); - assertTrue(records.size() == 1); - assertTrue(records.get(0).size() == 13); - assertTrue(records.get(0).getField("message").asString().equals(MAGIC_STRING)); - }catch (Exception e){ - logger.error("issue durring validation {}", e.getMessage()); - } - - - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RecordStreamProcessingDebuggerTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RecordStreamProcessingDebuggerTest.java deleted file mode 100644 index 3e565afdd..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RecordStreamProcessingDebuggerTest.java +++ /dev/null @@ -1,282 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.engine; - -import com.hurence.logisland.component.ComponentFactory; -import com.hurence.logisland.component.ComponentType; -import com.hurence.logisland.config.EngineConfiguration; -import com.hurence.logisland.config.ProcessorConfiguration; -import com.hurence.logisland.config.StreamConfiguration; -import com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine; -import com.hurence.logisland.stream.StreamProperties; -import com.hurence.logisland.stream.spark.KafkaRecordStreamDebugger; -import com.hurence.logisland.stream.spark.KafkaRecordStreamHDFSBurner; -import com.hurence.logisland.stream.spark.KafkaRecordStreamParallelProcessing; -import com.hurence.logisland.stream.spark.KafkaRecordStreamSQLAggregator; -import com.hurence.logisland.util.runner.MockProcessor; -import org.junit.Ignore; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.HashMap; -import java.util.Map; -import java.util.Optional; - - -public class RecordStreamProcessingDebuggerTest { - private static Logger logger = LoggerFactory.getLogger(RecordStreamProcessingDebuggerTest.class); - - private static final String APACHE_LOG_FIELDS = "src_ip,identd,user,record_time,http_method,http_query,http_version,http_status,bytes_out"; - private static final String APACHE_LOG_REGEX = "(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+\\[([\\w:\\/]+\\s[+\\-]\\d{4})\\]\\s+\"(\\S+)\\s+(\\S+)\\s*(\\S*)\"\\s+(\\S+)\\s+(\\S+)"; - - - @Test - @Ignore - public void remoteTest() { - - logger.info("starting StreamProcessingRunner"); - - // ProcessorConfiguration processorConf = getSplitTextProcessorConfiguration(); - StreamConfiguration chainConf = getSQLStreamConfiguration(); - EngineConfiguration engineConf = getStandardEngineConfiguration(); - engineConf.addPipelineConfigurations(chainConf); - // chainConf.addProcessorConfiguration(processorConf); - - - try { - - // instanciate engine and all the processor from the config - Optional engineInstance = ComponentFactory.getEngineContext(engineConf); - - assert engineInstance.isPresent(); - assert engineInstance.get().isValid(); - - // start the engine - EngineContext engineContext = engineInstance.get(); - engineInstance.get().getEngine().start(engineContext); - - - } catch (Exception e) { - logger.error("unable to launch runner : {}", e); - } - - - } - - private EngineConfiguration getStandardEngineConfiguration() { - Map engineProperties = new HashMap<>(); - engineProperties.put(KafkaStreamProcessingEngine.SPARK_APP_NAME().getName(), "testApp"); - engineProperties.put(KafkaStreamProcessingEngine.SPARK_STREAMING_BATCH_DURATION().getName(), "5000"); - engineProperties.put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[4]"); - engineProperties.put(KafkaStreamProcessingEngine.SPARK_EXECUTOR_CORES().getName(), "4"); - engineProperties.put(KafkaStreamProcessingEngine.SPARK_STREAMING_TIMEOUT().getName(), "-1"); - - EngineConfiguration engineConf = new EngineConfiguration(); - engineConf.setComponent(KafkaStreamProcessingEngine.class.getName()); - engineConf.setType(ComponentType.ENGINE.toString()); - engineConf.setConfiguration(engineProperties); - return engineConf; - } - - private StreamConfiguration getBurnerStreamConfiguration() { - Map streamProperties = new HashMap<>(); - /*chainProperties.put(AbstractKafkaRecordStream.KAFKA_METADATA_BROKER_LIST().getName(), - 
"sd-84190:6667,sd-84191:6667,sd-84192:6667,sd-84196:6667"); - chainProperties.put(AbstractKafkaRecordStream.KAFKA_ZOOKEEPER_QUORUM().getName(), - "sd-76387:2181,sd-84186:2181,sd-84189:2181");*/ - streamProperties.put(StreamProperties.KAFKA_METADATA_BROKER_LIST().getName(), - "sandbox:9092"); - streamProperties.put(StreamProperties.KAFKA_ZOOKEEPER_QUORUM().getName(), - "sandbox:2181"); - streamProperties.put(StreamProperties.INPUT_TOPICS().getName(), "logisland_events"); - streamProperties.put(StreamProperties.OUTPUT_TOPICS().getName(), "none"); - streamProperties.put(StreamProperties.INPUT_SERIALIZER().getName(), StreamProperties.KRYO_SERIALIZER().getValue()); - streamProperties.put(StreamProperties.OUTPUT_SERIALIZER().getName(), StreamProperties.NO_SERIALIZER().getValue()); - streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR().getName(), "1"); - streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_PARTITIONS().getName(), "2"); - - streamProperties.put(StreamProperties.OUTPUT_FOLDER_PATH().getName(), "data/logisland_events"); - streamProperties.put(StreamProperties.OUTPUT_FORMAT().getName(), "parquet"); - streamProperties.put(StreamProperties.RECORD_TYPE().getName(), "record"); - - StreamConfiguration chainConf = new StreamConfiguration(); - chainConf.setComponent(KafkaRecordStreamHDFSBurner.class.getName()); - chainConf.setType(ComponentType.STREAM.toString()); - chainConf.setConfiguration(streamProperties); - chainConf.setStream("KafkaStream"); - return chainConf; - } - - - private StreamConfiguration getParallelStreamConfiguration() { - Map streamProperties = new HashMap<>(); - streamProperties.put(StreamProperties.KAFKA_METADATA_BROKER_LIST().getName(), - "sandbox:9092"); - streamProperties.put(StreamProperties.KAFKA_ZOOKEEPER_QUORUM().getName(), - "sandbox:2181"); - streamProperties.put(StreamProperties.OUTPUT_TOPICS().getName(), "logisland_events"); - streamProperties.put(StreamProperties.INPUT_TOPICS().getName(), "logisland_raw"); - streamProperties.put(StreamProperties.ERROR_TOPICS().getName(), "logisland_errors"); - streamProperties.put(StreamProperties.INPUT_SERIALIZER().getName(), - StreamProperties.NO_SERIALIZER().getValue()); - streamProperties.put(StreamProperties.OUTPUT_SERIALIZER().getName(), - StreamProperties.JSON_SERIALIZER().getValue()); - streamProperties.put(StreamProperties.ERROR_SERIALIZER().getName(), - StreamProperties.JSON_SERIALIZER().getValue()); - - streamProperties.put(StreamProperties.AVRO_OUTPUT_SCHEMA().getName(), - "{ \"version\":1,\n" + - " \"type\": \"record\",\n" + - " \"name\": \"com.hurence.logisland.record.apache_log\",\n" + - " \"fields\": [\n" + - " { \"name\": \"record_raw_value\", \"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"record_errors\", \"type\": [ {\"type\": \"array\", \"items\": \"string\"},\"null\"] },\n" + - " { \"name\": \"record_id\", \"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"record_time\", \"type\": [\"long\",\"null\"] },\n" + - " { \"name\": \"record_type\", \"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"src_ip\", \"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"http_method\", \"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"bytes_out\", \"type\": [\"long\",\"null\"] },\n" + - " { \"name\": \"http_query\", \"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"http_version\",\"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"http_status\", \"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"identd\", \"type\": 
[\"string\",\"null\"] },\n" + - " { \"name\": \"user\", \"type\": [\"string\",\"null\"] } ]}"); - - streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR().getName(), "1"); - streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_PARTITIONS().getName(), "2"); - - - StreamConfiguration chainConf = new StreamConfiguration(); - chainConf.setComponent(KafkaRecordStreamParallelProcessing.class.getName()); - chainConf.setType(ComponentType.STREAM.toString()); - chainConf.setConfiguration(streamProperties); - chainConf.setStream("KafkaStream"); - return chainConf; - } - - - private StreamConfiguration getDebuggerStreamConfiguration() { - Map streamProperties = new HashMap<>(); - streamProperties.put(StreamProperties.KAFKA_METADATA_BROKER_LIST().getName(), "sandbox:9092"); - streamProperties.put(StreamProperties.KAFKA_ZOOKEEPER_QUORUM().getName(), "sandbox:2181"); - streamProperties.put(StreamProperties.INPUT_TOPICS().getName(), "logisland_raw"); - streamProperties.put(StreamProperties.OUTPUT_TOPICS().getName(), "logisland_events"); - streamProperties.put(StreamProperties.INPUT_SERIALIZER().getName(), StreamProperties.NO_SERIALIZER().getValue()); - streamProperties.put(StreamProperties.OUTPUT_SERIALIZER().getName(), StreamProperties.JSON_SERIALIZER().getValue()); - streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR().getName(), "1"); - streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_PARTITIONS().getName(), "4"); - - - StreamConfiguration chainConf = new StreamConfiguration(); - chainConf.setComponent(KafkaRecordStreamDebugger.class.getName()); - chainConf.setType(ComponentType.STREAM.toString()); - chainConf.setConfiguration(streamProperties); - chainConf.setStream("KafkaSQLStream"); - return chainConf; - } - - - private StreamConfiguration getSQLStreamConfiguration() { - Map streamProperties = new HashMap<>(); - streamProperties.put(StreamProperties.OUTPUT_RECORD_TYPE().getName(), "product_metric"); - streamProperties.put(StreamProperties.KAFKA_METADATA_BROKER_LIST().getName(), - "sd-84190:6667,sd-84191:6667,sd-84192:6667,sd-84186:6667"); - streamProperties.put(StreamProperties.KAFKA_ZOOKEEPER_QUORUM().getName(), - "sd-76387:2181,sd-84186:2181,sd-84189:2181"); - streamProperties.put(StreamProperties.INPUT_TOPICS().getName(), "ffact_products"); - streamProperties.put(StreamProperties.OUTPUT_TOPICS().getName(), "ffact_metrics"); - streamProperties.put(StreamProperties.INPUT_SERIALIZER().getName(), StreamProperties.JSON_SERIALIZER().getValue()); - - streamProperties.put(StreamProperties.OUTPUT_SERIALIZER().getName(), StreamProperties.JSON_SERIALIZER().getValue()); - streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_REPLICATION_FACTOR().getName(), "1"); - streamProperties.put(StreamProperties.KAFKA_TOPIC_DEFAULT_PARTITIONS().getName(), "1"); - - streamProperties.put(StreamProperties.MAX_RESULTS_COUNT().getName(), "10"); - streamProperties.put(StreamProperties.SQL_QUERY().getName(), "SELECT count(*)/first(theoretical_cadence) AS product_trs, count(*) as product_count, factory, line, first(product_type) as product_type, first(theoretical_cadence) as theoretical_cadence, max(record_time) as record_time\n" + - " FROM ffact_products\n" + - " GROUP BY factory, line\n" + - " LIMIT 20"); - - - streamProperties.put(StreamProperties.AVRO_INPUT_SCHEMA().getName(), - "{ \"version\": 1,\n" + - " \"type\": \"record\",\n" + - " \"name\": \"com.hurence.logisland.ffact.product\",\n" + - " \"fields\": [\n" + - " { \"name\": \"record_errors\", 
\"type\": [ {\"type\": \"array\", \"items\": \"string\"},\"null\"] },\n" + - " { \"name\": \"record_raw_key\", \"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"record_raw_value\", \"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"record_id\", \"type\": [\"string\"] },\n" + - " { \"name\": \"record_time\", \"type\": [\"long\"] },\n" + - " { \"name\": \"record_type\", \"type\": [\"string\"] },\n" + - " { \"name\": \"label\", \"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"product_type\", \"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"operator\", \"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"factory\", \"type\": [\"string\",\"null\"] },\n" + - " { \"name\": \"latitude\", \"type\": [\"float\",\"null\"] },\n" + - " { \"name\": \"longitude\", \"type\": [\"float\",\"null\"] },\n" + - " { \"name\": \"theoretical_cadence\",\"type\": [\"float\",\"null\"] },\n" + - " { \"name\": \"line\", \"type\": [\"string\",\"null\"] } \n" + - " ]}"); - - StreamConfiguration chainConf = new StreamConfiguration(); - chainConf.setComponent(KafkaRecordStreamSQLAggregator.class.getName()); - chainConf.setType("stream"); - chainConf.setConfiguration(streamProperties); - chainConf.setStream("KafkaSQLStream"); - return chainConf; - } - - private ProcessorConfiguration getSplitTextProcessorConfiguration() { - Map processorProperties = new HashMap<>(); - processorProperties.put("value.regex", APACHE_LOG_REGEX); - processorProperties.put("value.fields", APACHE_LOG_FIELDS); - processorProperties.put("key.regex", "(\\S*):(\\S*):(\\S*):(\\S*):(\\S*)"); - processorProperties.put("key.field", "search_index,sub_project_code,record_type,host_name,uuid"); - - ProcessorConfiguration processorConf = new ProcessorConfiguration(); - processorConf.setComponent("com.hurence.logisland.processor.SplitText"); - processorConf.setType("parser"); - processorConf.setConfiguration(processorProperties); - processorConf.setProcessor("parser"); - return processorConf; - } - - - private ProcessorConfiguration getMockProcessorConfiguration() { - - - ProcessorConfiguration processorConf = new ProcessorConfiguration(); - processorConf.setComponent(MockProcessor.class.getName()); - processorConf.setType(ComponentType.PROCESSOR.toString()); - processorConf.setProcessor("debugger"); - return processorConf; - } - - private ProcessorConfiguration getDebugProcessorConfiguration() { - Map processorProperties = new HashMap<>(); - processorProperties.put("event.serializer", "json"); - - ProcessorConfiguration processorConf = new ProcessorConfiguration(); - processorConf.setComponent("com.hurence.logisland.processor.DebugStream"); - processorConf.setType(ComponentType.PROCESSOR.toString()); - processorConf.setConfiguration(processorProperties); - processorConf.setProcessor("debugger"); - return processorConf; - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RemoteApiEngineTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RemoteApiEngineTest.java deleted file mode 100644 index 59291b45e..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/RemoteApiEngineTest.java +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Copyright (C) 2016 Hurence 
(support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.engine; - -import com.hurence.logisland.component.ComponentFactory; -import com.hurence.logisland.config.ConfigReader; -import com.hurence.logisland.config.LogislandConfiguration; -import com.hurence.logisland.util.spark.SparkUtils; -import okhttp3.mockwebserver.MockResponse; -import okhttp3.mockwebserver.MockWebServer; -import org.junit.Ignore; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Optional; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.TimeUnit; - - -public class RemoteApiEngineTest { - private static Logger logger = LoggerFactory.getLogger(RemoteApiEngineTest.class); - - private static final String JOB_CONF_FILE = "/conf/remote-engine.yml"; - - @Test - @Ignore - public void remoteTest() { - - logger.info("starting StreamProcessingRunner"); - - Optional engineInstance = Optional.empty(); - try { - - String configFile = RemoteApiEngineTest.class.getResource(JOB_CONF_FILE).getPath(); - - // load the YAML config - LogislandConfiguration sessionConf = ConfigReader.loadConfig(configFile); - - // instanciate engine and all the processor from the config - engineInstance = ComponentFactory.getEngineContext(sessionConf.getEngine()); - assert engineInstance.isPresent(); - assert engineInstance.get().isValid(); - - logger.info("starting Logisland session version {}", sessionConf.getVersion()); - logger.info(sessionConf.getDocumentation()); - } catch (Exception e) { - logger.error("unable to launch runner : {}", e); - } - - try { - // start the engine - final EngineContext engineContext = engineInstance.get(); - engineInstance.get().getEngine().start(engineContext); - - - engineContext.getEngine().awaitTermination(engineContext); - - } catch (Exception e) { - logger.error("something went bad while running the job : {}", e); - System.exit(-1); - } - - - - - - - } - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/SparkEngineConfTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/SparkEngineConfTest.java deleted file mode 100644 index e1e75a0ad..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/SparkEngineConfTest.java +++ /dev/null @@ -1,182 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.engine; - -import com.hurence.logisland.component.ComponentFactory; -import com.hurence.logisland.component.ComponentType; -import com.hurence.logisland.config.EngineConfiguration; -import com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine; -import org.junit.Assert; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.HashMap; -import java.util.Map; -import java.util.Optional; - -public class SparkEngineConfTest { - private static Logger logger = LoggerFactory.getLogger(SparkEngineConfTest.class); - - /** - * testing all value correct (see https://spark.apache.org/docs/latest/submitting-applications.html#master-urls 2.4.1 at time of this test) - * make sure it is compatible as well with first version 2.x https://spark.apache.org/docs/2.0.0/submitting-applications.html#master-urls - */ - @Test - public void sparkMasterConfigTest() { - EngineConfiguration engineConf = getStandardEngineConfiguration(); - - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[4]"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[2,1]"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[2,123]"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[*]"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[*,32]"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[33,32]"); - testConfIsValid(engineConf); - //spark://HOST:PORT - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://045.478.874.4785217"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://aze0484.44-44:089"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://aze0484.44-44"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://htrh"); - testConfIsValid(engineConf); - //spark://HOST1:PORT1,HOST2:PORT2 - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://cn1:2181,cn2:2181,cn3:2181"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://cn1:2181,cn2:2181"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://cn1:2181"); - testConfIsValid(engineConf); - 
engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://cn1"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "spark://cn1,cn2"); - testConfIsValid(engineConf); - //mesos://HOST:PORT - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "mesos://zk://cn1:2181,cn2:2181,cn3:2181/mesos"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "mesos://zk://cn1:2181,cn2:2181/mesos"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "mesos://zk://cn1:2181/mesos"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "mesos://207.184.161.138:7077"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "mesos://207.184.161.138"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "mesos://gregh:"); - testConfIsNotValid(engineConf); - //yarn - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "yarn"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "yarn-client"); - testConfIsNotValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "yarn-cluster"); - testConfIsNotValid(engineConf); - //k8s://HOST:PORT - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "k8s://hrgjtdyj:4589"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "k8s://http://1245.444.444.444:4589"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "k8s://https://WHATEVER:41587"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "k8s://WHATEVER"); - testConfIsValid(engineConf); - } - - @Test - public void somePropertiesConfigTest() { - EngineConfiguration engineConf = getStandardEngineConfiguration(); - - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_APP_NAME().getName(), "FSV-OracleDataCollectionWithSnapshot-2outof4-PROD-Next1"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.JAVA_MESOS_LIBRARY_PATH().getName(), "/opt/mesos-1.6.0/build/src/.libs/libmesos.so"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_DRIVER_MEMORY().getName(), "2G"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_DRIVER_CORES().getName(), "1"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_EXECUTOR_CORES().getName(), "5"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_EXECUTOR_INSTANCES().getName(), "1"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_EXECUTOR_MEMORY().getName(), "4G"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_MESOS_CORE_MAX().getName(), "8"); - 
testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_TASK_MAX_FAILURES().getName(), "8"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_SERIALIZER().getName(), "org.apache.spark.serializer.KryoSerializer"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_BATCH_DURATION().getName(), "20000"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_BACKPRESSURE_ENABLED().getName(), "false"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_UNPERSIST().getName(), "false"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_BLOCK_INTERVAL().getName(), "500"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_KAFKA_MAX_RATE_PER_PARTITION().getName(), "10000"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_TIMEOUT().getName(), "-1"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_KAFKA_MAXRETRIES().getName(), "3"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_UI_RETAINED_BATCHES().getName(), "200"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_STREAMING_RECEIVER_WAL_ENABLE().getName(), "false"); - testConfIsValid(engineConf); - engineConf.getConfiguration().put(KafkaStreamProcessingEngine.SPARK_UI_PORT().getName(), "4050"); - testConfIsValid(engineConf); - } - - private void testConfIsValid(EngineConfiguration engineConf) { - Optional engineInstance = ComponentFactory.getEngineContext(engineConf); - Assert.assertTrue(engineInstance.isPresent()); - Assert.assertTrue(engineInstance.get().isValid()); - engineInstance.get(); - } - - private void testConfIsNotValid(EngineConfiguration engineConf) { - Optional engineInstance = ComponentFactory.getEngineContext(engineConf); - Assert.assertTrue(engineInstance.isPresent()); - Assert.assertFalse(engineInstance.get().isValid()); - engineInstance.get(); - } - - private EngineConfiguration getStandardEngineConfiguration() { - Map engineProperties = new HashMap<>(); - engineProperties.put(KafkaStreamProcessingEngine.SPARK_APP_NAME().getName(), "testApp"); - engineProperties.put(KafkaStreamProcessingEngine.SPARK_STREAMING_BATCH_DURATION().getName(), "5000"); - engineProperties.put(KafkaStreamProcessingEngine.SPARK_MASTER().getName(), "local[4]"); - engineProperties.put(KafkaStreamProcessingEngine.SPARK_EXECUTOR_CORES().getName(), "4"); - engineProperties.put(KafkaStreamProcessingEngine.SPARK_STREAMING_TIMEOUT().getName(), "-1"); - - EngineConfiguration engineConf = new EngineConfiguration(); - engineConf.setComponent(KafkaStreamProcessingEngine.class.getName()); - engineConf.setType(ComponentType.ENGINE.toString()); - engineConf.setConfiguration(engineProperties); - return engineConf; - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/StreamDebuggerTest.java 
b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/StreamDebuggerTest.java deleted file mode 100644 index a781e63aa..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/StreamDebuggerTest.java +++ /dev/null @@ -1,79 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.engine; - -import com.hurence.logisland.component.ComponentFactory; -import com.hurence.logisland.config.ConfigReader; -import com.hurence.logisland.config.LogislandConfiguration; -import com.hurence.logisland.util.spark.SparkUtils; -import org.junit.Ignore; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Optional; - - -public class StreamDebuggerTest { - private static Logger logger = LoggerFactory.getLogger(StreamDebuggerTest.class); - - private static final String JOB_CONF_FILE = "/conf/structured-stream.yml"; - - @Test - @Ignore - public void remoteTest() { - - - logger.info("starting StreamProcessingRunner"); - - Optional engineInstance = Optional.empty(); - try { - - String configFile = StreamDebuggerTest.class.getResource(JOB_CONF_FILE).getPath(); - - // load the YAML config - LogislandConfiguration sessionConf = ConfigReader.loadConfig(configFile); - - // instanciate engine and all the processor from the config - engineInstance = ComponentFactory.getEngineContext(sessionConf.getEngine()); - assert engineInstance.isPresent(); - assert engineInstance.get().isValid(); - - logger.info("starting Logisland session version {}", sessionConf.getVersion()); - logger.info(sessionConf.getDocumentation()); - } catch (Exception e) { - logger.error("unable to launch runner : {}", e); - } - - try { - // start the engine - EngineContext engineContext = engineInstance.get(); - engineInstance.get().getEngine().start(engineContext); - - engineContext.getEngine().awaitTermination(engineContext); - } catch (Exception e) { - logger.error("something went bad while running the job : {}", e); - System.exit(-1); - } - - - - - - - } - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/RemoteApiClientTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/RemoteApiClientTest.java deleted file mode 100644 index 0f1c820aa..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/RemoteApiClientTest.java +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Copyright (C) 2016 Hurence 
(support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.engine.spark.remote; - -import okhttp3.Credentials; -import okhttp3.mockwebserver.MockResponse; -import okhttp3.mockwebserver.MockWebServer; -import okhttp3.mockwebserver.RecordedRequest; -import org.junit.Assert; -import org.junit.Test; - -import javax.ws.rs.core.HttpHeaders; -import java.time.Duration; -import java.util.concurrent.TimeUnit; - -public class RemoteApiClientTest { - - private final String dataflowName = "dummy"; - - private RemoteApiClient createInstance(MockWebServer server, String user, String password) { - return new RemoteApiClient(new RemoteApiClient.ConnectionSettings( server.url("/").toString(), - Duration.ofSeconds(2), Duration.ofSeconds(2), user, password)); - } - - @Test - public void testAllUnsecured() throws Exception { - - try (MockWebServer mockWebServer = new MockWebServer()) { - mockWebServer.enqueue(new MockResponse().setResponseCode(404)); - mockWebServer.enqueue(new MockResponse().setBodyDelay(3, TimeUnit.SECONDS)); - final String dummy = "\"name\":\"myName\", \"component\":\"myComponent\""; - mockWebServer.enqueue(new MockResponse().setBody("{" + dummy + ",\"lastModified\":\"1983-06-04T10:01:02Z\"," + - "\"streams\":[{" + dummy + "}]}")); - RemoteApiClient client = createInstance(mockWebServer, null, null); - Assert.assertFalse(client.fetchDataflow(dataflowName, new RemoteApiClient.State()).isPresent()); - Assert.assertFalse(client.fetchDataflow(dataflowName, new RemoteApiClient.State()).isPresent()); - Assert.assertTrue(client.fetchDataflow(dataflowName, new RemoteApiClient.State()).isPresent()); - - } - - - } - - @Test - public void testValidationFails() throws Exception { - try (MockWebServer mockWebServer = new MockWebServer()) { - mockWebServer.enqueue(new MockResponse().setBody("{\"name\":\"divPo\", \"lastModified\":\"1983-06-04T10:01:02Z\",\"services\":[{}],\"streams\":[{}]}")); - RemoteApiClient client = createInstance(mockWebServer, null, null); - Assert.assertFalse(client.fetchDataflow(dataflowName, new RemoteApiClient.State()).isPresent()); - } - - - } - - @Test - public void testAuthentication() throws Exception { - try (MockWebServer mockWebServer = new MockWebServer()) { - RemoteApiClient client = createInstance(mockWebServer, "test", "test"); - mockWebServer.enqueue(new MockResponse().setBody("{}")); - client.fetchDataflow(dataflowName, new RemoteApiClient.State()); - RecordedRequest request = mockWebServer.takeRequest(); - String auth = request.getHeader(HttpHeaders.AUTHORIZATION); - Assert.assertEquals(Credentials.basic("test", "test"), auth); - } - } - - @Test - public void testUri() throws Exception { - try (MockWebServer mockWebServer = new MockWebServer()) { - RemoteApiClient client = createInstance(mockWebServer, null, null); - mockWebServer.enqueue(new MockResponse().setBody("{}")); - client.fetchDataflow(dataflowName, new RemoteApiClient.State()); - RecordedRequest request = mockWebServer.takeRequest(); - 
Assert.assertEquals("/dataflows/"+dataflowName, request.getPath()); - } - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockProcessor.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockProcessor.java deleted file mode 100644 index 5d125dc7b..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockProcessor.java +++ /dev/null @@ -1,37 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.engine.spark.remote.mock; - -import com.hurence.logisland.component.PropertyDescriptor; -import com.hurence.logisland.processor.AbstractProcessor; -import com.hurence.logisland.processor.ProcessContext; -import com.hurence.logisland.record.Record; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -public class MockProcessor extends AbstractProcessor { - @Override - public List getSupportedPropertyDescriptors() { - return new ArrayList<>(); - } - - @Override - public Collection process(ProcessContext context, Collection records) { - return records; - } -} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockServiceController.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockServiceController.java deleted file mode 100644 index 17931be0e..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockServiceController.java +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.engine.spark.remote.mock; - -import com.hurence.logisland.component.PropertyDescriptor; -import com.hurence.logisland.controller.AbstractControllerService; - -import java.util.ArrayList; -import java.util.List; - -public class MockServiceController extends AbstractControllerService { - @Override - public List getSupportedPropertyDescriptors() { - return new ArrayList<>(); - } -} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockStream.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockStream.java deleted file mode 100644 index cff86c2bc..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/engine/spark/remote/mock/MockStream.java +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.engine.spark.remote.mock; - -import com.hurence.logisland.component.PropertyDescriptor; -import com.hurence.logisland.stream.AbstractRecordStream; - -import java.util.ArrayList; -import java.util.List; - -public class MockStream extends AbstractRecordStream { - @Override - public List getSupportedPropertyDescriptors() { - return new ArrayList<>(); - } -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/StructuredStreamTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/StructuredStreamTest.java deleted file mode 100644 index b7fede78c..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/StructuredStreamTest.java +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.stream.spark.structured; - -import com.hurence.logisland.component.ComponentFactory; -import com.hurence.logisland.config.ConfigReader; -import com.hurence.logisland.config.LogislandConfiguration; -import com.hurence.logisland.engine.EngineContext; -import org.junit.Assert; -import org.junit.Ignore; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Optional; -import java.util.Scanner; - - -/** - * End to end test. - */ -public class StructuredStreamTest { - private static Logger logger = LoggerFactory.getLogger(StructuredStreamTest.class); - - private static final String JOB_CONF_FILE = "/conf/timeseries-structured-stream.yml"; - - @Test - @Ignore - public void remoteTest() { - - - logger.info("starting StreamProcessingRunner"); - - Optional engineInstance = Optional.empty(); - try { - - String configFile = StructuredStreamTest.class.getResource(JOB_CONF_FILE).getPath(); - - // load the YAML config - LogislandConfiguration sessionConf = ConfigReader.loadConfig(configFile); - - // instantiate engine and all the processor from the config - engineInstance = ComponentFactory.getEngineContext(sessionConf.getEngine()); - assert engineInstance.isPresent(); - assert engineInstance.get().isValid(); - - logger.info("starting Logisland session version {}", sessionConf.getVersion()); - logger.info(sessionConf.getDocumentation()); - } catch (Exception e) { - logger.error("unable to launch runner : {}", e); - } - - try { - // start the engine - EngineContext engineContext = engineInstance.get(); - engineInstance.get().getEngine().start(engineContext); - new Scanner(System.in).nextLine(); - } catch (Exception e) { - Assert.fail("something went bad while running the job : " + e.toString()); - - } - - - - - - - } - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderServiceTest.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderServiceTest.java deleted file mode 100644 index ab19ea8d1..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/LocalFileStructuredStreamProviderServiceTest.java +++ /dev/null @@ -1,59 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.hurence.logisland.stream.spark.structured.provider; - -import com.hurence.logisland.annotation.documentation.CapabilityDescription; -import org.junit.Ignore; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - - -/** - * - * @author bailett - */ -@CapabilityDescription("Provide a way to read a local file as input in StructuredStream streams") -public class LocalFileStructuredStreamProviderServiceTest { - - private Logger logger = LoggerFactory.getLogger(LocalFileStructuredStreamProviderServiceTest.class); - - private String JOB_CONF_FILE = "/conf/timeseries-structured-stream.yml"; - - - @Test - @Ignore - public void testLocalFileStructuredStreamProviderService() { - ProviderServiceAsReaderRunner runner = new ProviderServiceAsReaderRunner(null); - runner.run(); - } - -} \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/ProviderServiceAsReaderRunner.java b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/ProviderServiceAsReaderRunner.java deleted file mode 100644 index 1d6ae83c1..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/java/com/hurence/logisland/stream/spark/structured/provider/ProviderServiceAsReaderRunner.java +++ /dev/null @@ -1,121 +0,0 @@ -/** - * Copyright (C) 2016 Hurence (support@hurence.com) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.hurence.logisland.stream.spark.structured.provider; - -import com.hurence.logisland.component.ComponentFactory; -import com.hurence.logisland.config.ControllerServiceConfiguration; -import com.hurence.logisland.config.EngineConfiguration; -import com.hurence.logisland.config.ProcessorConfiguration; -import com.hurence.logisland.config.StreamConfiguration; -import com.hurence.logisland.engine.EngineContext; -import com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine; -//import com.hurence.logisland.processor.DebugStream; -import com.hurence.logisland.stream.StreamProperties; -import com.hurence.logisland.stream.spark.structured.StructuredStream; -import com.hurence.logisland.util.runner.MockControllerServiceLookup; -import org.junit.Assert; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.*; - -public class ProviderServiceAsReaderRunner { - - private static Logger logger = LoggerFactory.getLogger(ProviderServiceAsReaderRunner.class); - - private final StructuredStreamProviderService provider; - private final MockControllerServiceLookup serviceLookup; - - - public ProviderServiceAsReaderRunner(StructuredStreamProviderService provider) { - this.provider = provider; - this.serviceLookup = new MockControllerServiceLookup(); - } - - public void run() { - EngineContext engineContext = ComponentFactory.getEngineContext(getEngineConfiguration()).get(); - Assert.assertTrue(engineContext.isValid()); - try { - engineContext.getEngine().start(engineContext); - engineContext.getEngine().awaitTermination(engineContext); - } catch (Exception ex) { - engineContext.getEngine().shutdown(engineContext); - } - } - - private EngineConfiguration getEngineConfiguration() { - EngineConfiguration engineConfiguration = new EngineConfiguration(); - engineConfiguration.setType("engine"); - engineConfiguration.setDocumentation("Plain java engine"); - engineConfiguration.setComponent(KafkaStreamProcessingEngine.class.getCanonicalName()); - Map props = new HashMap<>(); - props.put(StreamProperties.READ_TOPICS_SERIALIZER().getName(), "none"); - props.put(StreamProperties.READ_STREAM_SERVICE_PROVIDER().getName(), "local_file_service"); - props.put(StreamProperties.WRITE_TOPICS_SERIALIZER().getName(), StreamProperties.JSON_SERIALIZER().getValue()); - props.put(StreamProperties.WRITE_STREAM_SERVICE_PROVIDER().getName(), "console_service"); - StreamConfiguration streamConfiguration = testStructuredStreamStream(props); -// streamConfiguration.addProcessorConfiguration(debugProcessorConfiguration(Collections.emptyMap())); - engineConfiguration.addPipelineConfigurations(streamConfiguration); - //set up services - Map propsFileProvider = new HashMap<>(); - propsFileProvider.put("local.input.path", getClass().getResource("/input").getFile()); - List services = new ArrayList<>(); - services.add(testLocalFileProvider(propsFileProvider)); - - Map propsConsoleProvider = new HashMap<>(); - propsConsoleProvider.put("truncate", "false"); - services.add(testConsoleProvider(propsConsoleProvider)); - engineConfiguration.setControllerServiceConfigurations(services); - return engineConfiguration; - } - - private StreamConfiguration testStructuredStreamStream(Map props) { - StreamConfiguration streamConfiguration = new StreamConfiguration(); - streamConfiguration.setStream("testStructuredStream"); - streamConfiguration.setComponent(StructuredStream.class.getCanonicalName()); - streamConfiguration.setType("stream"); - streamConfiguration.setConfiguration(props); - return 
streamConfiguration; - } - - private ControllerServiceConfiguration testLocalFileProvider(Map props) { - ControllerServiceConfiguration serviceConfiguration = new ControllerServiceConfiguration(); - serviceConfiguration.setControllerService("local_file_service"); - serviceConfiguration.setComponent(LocalFileStructuredStreamProviderService.class.getCanonicalName()); - serviceConfiguration.setType("provider"); - serviceConfiguration.setConfiguration(props); - return serviceConfiguration; - } - - private ControllerServiceConfiguration testConsoleProvider(Map props) { - ControllerServiceConfiguration serviceConfiguration = new ControllerServiceConfiguration(); - serviceConfiguration.setControllerService("console_service"); - serviceConfiguration.setComponent(ConsoleStructuredStreamProviderService.class.getCanonicalName()); - serviceConfiguration.setType("provider"); - serviceConfiguration.setConfiguration(props); - return serviceConfiguration; - } - - private ProcessorConfiguration debugProcessorConfiguration(Map props) { - ProcessorConfiguration ret = new ProcessorConfiguration(); - ret.setProcessor(UUID.randomUUID().toString()); -// ret.setComponent(DebugStream.class.getCanonicalName()); - ret.setType("processor"); - return ret; - } - - -} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/kafka-connect-stream.yml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/kafka-connect-stream.yml deleted file mode 100644 index 30d043e3a..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/kafka-connect-stream.yml +++ /dev/null @@ -1,138 +0,0 @@ -version: 1.2.0 -documentation: LogIsland future factory job - -engine: - component: com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine - type: engine - documentation: Index some apache logs with logisland - configuration: - spark.app.name: ConnectTest - spark.master: local[*] - spark.driver.memory: 512M - spark.driver.cores: 1 - spark.executor.memory: 512M - spark.executor.instances: 4 - spark.executor.cores: 2 - spark.yarn.queue: default - spark.yarn.maxAppAttempts: 4 - spark.yarn.am.attemptFailuresValidityInterval: 1h - spark.yarn.max.executor.failures: 20 - spark.yarn.executor.failuresValidityInterval: 1h - spark.task.maxFailures: 8 - spark.serializer: org.apache.spark.serializer.KryoSerializer - spark.streaming.batchDuration: 2000 - spark.streaming.backpressure.enabled: false - spark.streaming.blockInterval: 500 - spark.streaming.kafka.maxRatePerPartition: 10000 - spark.streaming.timeout: -1 - spark.streaming.unpersist: false - spark.streaming.kafka.maxRetries: 3 - spark.streaming.ui.retainedBatches: 200 - spark.streaming.receiver.writeAheadLog.enable: false - spark.ui.port: 4040 - - controllerServiceConfigurations: - - - controllerService: kc_source_service - component: com.hurence.logisland.stream.spark.provider.KafkaConnectStructuredSourceProviderService - configuration: - kc.data.value.converter: com.hurence.logisland.connect.converter.LogIslandRecordConverter - kc.data.value.converter.properties: | - record.serializer=com.hurence.logisland.serializer.KryoSerializer - kc.data.key.converter.properties: | - schemas.enable=false - kc.data.key.converter: org.apache.kafka.connect.storage.StringConverter - kc.worker.tasks.max: 3 - kc.partitions.max: 12 
- kc.connector.class: com.hurence.logisland.connect.fake.FakeConnector - kc.connector.offset.backing.store: memory - kc.connector.properties: | - foo=bar - dummy=a long string - - - controllerService: kc_sink_service - component: com.hurence.logisland.stream.spark.provider.KafkaConnectStructuredSinkProviderService - configuration: - kc.data.value.converter: com.hurence.logisland.connect.converter.LogIslandRecordConverter - kc.data.value.converter.properties: | - record.serializer=com.hurence.logisland.serializer.KryoSerializer - kc.data.key.converter.properties: | - schemas.enable=false - kc.data.key.converter: org.apache.kafka.connect.storage.StringConverter - kc.worker.tasks.max: 2 - kc.partitions.max: 4 - kc.connector.class: com.hurence.logisland.connect.fake.TestSink - - - - - - controllerService: kafka_out_service - component: com.hurence.logisland.stream.spark.structured.provider.KafkaStructuredStreamProviderService - configuration: - kafka.output.topics: logisland_raw - kafka.error.topics: logisland_errors - kafka.input.topics.serializer: com.hurence.logisland.serializer.KryoSerializer - kafka.output.topics.serializer: com.hurence.logisland.serializer.KryoSerializer - kafka.error.topics.serializer: com.hurence.logisland.serializer.JsonSerializer - kafka.metadata.broker.list: sandbox:9092 - kafka.zookeeper.quorum: sandbox:2181 - kafka.topic.autoCreate: true - kafka.topic.default.partitions: 4 - kafka.topic.default.replicationFactor: 1 - - streamConfigurations: - ################ indexing stream ############### - - stream: indexing_stream - component: com.hurence.logisland.stream.spark.KafkaRecordStreamParallelProcessing - type: stream - documentation: a processor that converts raw excel file content into structured log records - configuration: - kafka.input.topics: logisland_raw - kafka.output.topics: none - kafka.error.topics: logisland_errors - kafka.input.topics.serializer: com.hurence.logisland.serializer.KryoSerializer - kafka.output.topics.serializer: com.hurence.logisland.serializer.KryoSerializer - kafka.error.topics.serializer: com.hurence.logisland.serializer.JsonSerializer - kafka.metadata.broker.list: sandbox:9092 - kafka.zookeeper.quorum: sandbox:2181 - kafka.topic.autoCreate: true - kafka.topic.default.partitions: 4 - kafka.topic.default.replicationFactor: 1 - processorConfigurations: - # do something useful here - - processor: stream_debugger - component: com.hurence.logisland.processor.DebugStream - type: processor - documentation: debug records - configuration: - event.serializer: json - - - ######### parsing stream ############## - - stream: parsing_stream_source - component: com.hurence.logisland.stream.spark.structured.StructuredStream - configuration: - read.topics: /a/in - read.topics.serializer: com.hurence.logisland.serializer.KryoSerializer - read.topics.key.serializer: com.hurence.logisland.serializer.StringSerializer - read.stream.service.provider: kc_source_service - write.topics: logisland_raw - write.topics.serializer: com.hurence.logisland.serializer.KryoSerializer - write.topics.key.serializer: com.hurence.logisland.serializer.StringSerializer - write.stream.service.provider: kc_sink_service - processorConfigurations: - - - processor: flatten - component: com.hurence.logisland.processor.FlatMap - type: processor - documentation: "extract from root record" - configuration: - keep.root.record: false - copy.root.record.fields: true - - processor: add_fields - component: com.hurence.logisland.processor.AddFields - type: processor - documentation: "rename 
fields for dynamic indexation in chronix : add *_s suffix" - configuration: - conflict.resolution.policy: overwrite_existing - record_key: ${partition} diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/opencv.yml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/opencv.yml deleted file mode 100644 index c0def1a37..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/opencv.yml +++ /dev/null @@ -1,62 +0,0 @@ -version: 1.2.0 -documentation: LogIsland computer vision sample - -engine: - component: com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine - configuration: - spark.app.name: OpenCV - spark.master: local[2] - spark.streaming.batchDuration: 200 - spark.streaming.kafka.maxRatePerPartition: 10000 - spark.streaming.timeout: -1 - - controllerServiceConfigurations: - - - controllerService: kafka_service - component: com.hurence.logisland.stream.spark.structured.provider.KafkaStructuredStreamProviderService - configuration: - kafka.input.topics: logisland_raw - kafka.output.topics: logisland_images - kafka.error.topics: logisland_errors - kafka.input.topics.serializer: com.hurence.logisland.serializer.BytesArraySerializer - kafka.output.topics.serializer: com.hurence.logisland.serializer.BytesArraySerializer - kafka.error.topics.serializer: com.hurence.logisland.serializer.JsonSerializer - kafka.metadata.broker.list: kafka:9092 - kafka.zookeeper.quorum: zookeeper:2181 - kafka.topic.autoCreate: true - kafka.topic.default.partitions: 2 - kafka.topic.default.replicationFactor: 1 - - streamConfigurations: - - - stream: parsing_stream - component: com.hurence.logisland.stream.spark.structured.StructuredStream - configuration: - read.topics.serializer: com.hurence.logisland.serializer.BytesArraySerializer - read.stream.service.provider: kafka_service - write.topics.serializer: com.hurence.logisland.serializer.BytesArraySerializer - write.stream.service.provider: kafka_service - processorConfigurations: - - - processor: contour_extraction - component: com.hurence.logisland.cv.processor.RunScript - configuration: - input.field: record_value - output.field: record_value - output.mode: overwrite - image.format: png - script.ns: com.hurence.logisland - script.function: ld_detect_edges - script.code: > - (ns com.hurence.logisland - (:refer-clojure :exclude [sort min merge reduce max compare repeat]) - (:require [opencv4.utils :refer :all]) - (:require [opencv4.core :refer :all]) - (:import [com.hurence.logisland.record Record])) - - (defn ld_detect_edges [mat] - (-> mat - (resize-by 0.5) - (cvt-color! COLOR_RGB2GRAY) - (canny! 
300.0 100.0 3 true) - (bitwise-not!))) \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/remote-engine.yml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/remote-engine.yml deleted file mode 100644 index 9da496f55..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/remote-engine.yml +++ /dev/null @@ -1,38 +0,0 @@ -version: 1.2.0 -documentation: LogIsland remote controlled. - -engine: - component: com.hurence.logisland.engine.spark.RemoteApiStreamProcessingEngine - type: engine - documentation: Do some remote pipelines. - configuration: - spark.app.name: RemoteConnect - spark.master: local[2] - spark.driver.memory: 512M - spark.driver.cores: 1 - spark.executor.memory: 512M - spark.executor.instances: 2 - spark.executor.cores: 2 - spark.yarn.queue: default - spark.yarn.maxAppAttempts: 4 - spark.yarn.am.attemptFailuresValidityInterval: 1h - spark.yarn.max.executor.failures: 20 - spark.yarn.executor.failuresValidityInterval: 1h - spark.task.maxFailures: 8 - spark.serializer: org.apache.spark.serializer.KryoSerializer - spark.streaming.batchDuration: 2000 - spark.streaming.backpressure.enabled: false - spark.streaming.blockInterval: 500 - spark.streaming.kafka.maxRatePerPartition: 10000 - spark.streaming.timeout: -1 - spark.streaming.unpersist: false - spark.streaming.kafka.maxRetries: 3 - spark.streaming.ui.retainedBatches: 200 - spark.streaming.receiver.writeAheadLog.enable: false - spark.ui.port: 4040 - remote.api.baseUrl: http://localhost:3000 - remote.api.polling.rate: 5000 - remote.api.push.rate: 10000 - - - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/structured-stream.yml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/structured-stream.yml deleted file mode 100644 index 4db7dabe6..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/structured-stream.yml +++ /dev/null @@ -1,76 +0,0 @@ -version: 1.2.0 -documentation: LogIsland future factory job - -engine: - component: com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine - type: engine - documentation: Index some apache logs with logisland - configuration: - spark.app.name: FutureFactory - spark.master: local[4] - spark.driver.memory: 1G - spark.driver.cores: 1 - spark.executor.memory: 1G - spark.executor.instances: 4 - spark.executor.cores: 2 - spark.yarn.queue: default - spark.yarn.maxAppAttempts: 4 - spark.yarn.am.attemptFailuresValidityInterval: 1h - spark.yarn.max.executor.failures: 20 - spark.yarn.executor.failuresValidityInterval: 1h - spark.task.maxFailures: 8 - spark.serializer: org.apache.spark.serializer.KryoSerializer - #spark.serializer: org.apache.spark.serializer.JavaSerializer - spark.streaming.batchDuration: 2000 - spark.streaming.backpressure.enabled: false - spark.streaming.blockInterval: 500 - spark.streaming.kafka.maxRatePerPartition: 10000 - spark.streaming.timeout: -1 - spark.streaming.unpersist: false - spark.streaming.kafka.maxRetries: 3 - spark.streaming.ui.retainedBatches: 200 
- spark.streaming.receiver.writeAheadLog.enable: false - spark.ui.port: 4040 - - controllerServiceConfigurations: - - - controllerService: mqtt_service - component: com.hurence.logisland.stream.spark.structured.provider.MQTTStructuredStreamProviderService - configuration: - # mqtt.broker.url: tcp://51.15.164.141:1883 - mqtt.broker.url: tcp://localhost:1883 - mqtt.persistence: memory - mqtt.client.id: logisland - mqtt.qos: 0 - mqtt.topic: Account123/# - mqtt.username: User123 - mqtt.password: Kapu12345678+ - mqtt.clean.session: true - mqtt.connection.timeout: 30 - mqtt.keep.alive: 60 - mqtt.version: 3 - - - controllerService: console_service - component: com.hurence.logisland.stream.spark.structured.provider.ConsoleStructuredStreamProviderService - - streamConfigurations: - - # indexing stream - - stream: indexing_stream - component: com.hurence.logisland.stream.spark.structured.StructuredStream - configuration: - read.topics.serializer: com.hurence.logisland.serializer.KuraProtobufSerializer - read.topics.client.service: mqtt_service - write.topics.serializer: none - write.topics.client.service: console_service - processorConfigurations: - - - processor: flatten - component: com.hurence.logisland.processor.FlatMap - type: processor - documentation: "extract metrics from root record" - configuration: - keep.root.record: false - copy.root.record.fields: true - leaf.record.type: record_metric - concat.fields: record_name diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/timeseries-structured-stream.yml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/timeseries-structured-stream.yml deleted file mode 100644 index b257f36e9..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/conf/timeseries-structured-stream.yml +++ /dev/null @@ -1,99 +0,0 @@ -version: 1.2.0 -documentation: LogIsland future factory job - -engine: - component: com.hurence.logisland.engine.spark.KafkaStreamProcessingEngine - configuration: - spark.app.name: TimeseriesParsing - spark.master: local[2] - spark.streaming.batchDuration: 200 - spark.streaming.kafka.maxRatePerPartition: 10000 - controllerServiceConfigurations: - - - - controllerService: kafka_service - component: com.hurence.logisland.stream.spark.structured.provider.KafkaStructuredStreamProviderService - configuration: - kafka.input.topics: logisland_raw - kafka.output.topics: logisland_measures - kafka.error.topics: logisland_errors - kafka.input.topics.serializer: com.hurence.logisland.serializer.JsonSerializer - kafka.output.topics.serializer: com.hurence.logisland.serializer.JsonSerializer - kafka.error.topics.serializer: com.hurence.logisland.serializer.JsonSerializer - kafka.metadata.broker.list: localhost:9092 - kafka.zookeeper.quorum: localhost:2181 - kafka.topic.autoCreate: true - kafka.topic.default.partitions: 4 - kafka.topic.default.replicationFactor: 1 - - - controllerService: kafka_service_out - component: com.hurence.logisland.stream.spark.structured.provider.KafkaStructuredStreamProviderService - configuration: - kafka.input.topics: logisland_measures - kafka.output.topics: logisland_metrics - kafka.error.topics: logisland_errors - kafka.input.topics.serializer: com.hurence.logisland.serializer.JsonSerializer - kafka.output.topics.serializer: 
com.hurence.logisland.serializer.JsonSerializer - kafka.error.topics.serializer: com.hurence.logisland.serializer.JsonSerializer - kafka.metadata.broker.list: localhost:9092 - kafka.zookeeper.quorum: localhost:2181 - kafka.topic.autoCreate: true - kafka.topic.default.partitions: 4 - kafka.topic.default.replicationFactor: 1 - - streamConfigurations: - - # This stream take all raw events as lines comming from local files - # these lines are split into logisland records and sent into a kafka topic - - stream: parsing_stream - component: com.hurence.logisland.stream.spark.structured.StructuredStream - configuration: - read.topics.serializer: none - read.stream.service.provider: kafka_service - write.topics.serializer: com.hurence.logisland.serializer.JsonSerializer - write.stream.service.provider: kafka_service - processorConfigurations: - - - processor: historian_parser - component: com.hurence.logisland.processor.SplitText - configuration: - record.type: timeserie - value.regex: (\S+\s+\S+);(\S+);(\S+);(\S+) - value.fields: record_time,tagname,record_value,quality - - - processor: create_aliases - component: com.hurence.logisland.processor.NormalizeFields - configuration: - conflict.resolution.policy: keep_both_fields - record_name: tagname - - - processor: fields_types_converter - component: com.hurence.logisland.processor.ConvertFieldsType - configuration: - record_value: double - quality: float - - # This stream will perform a statefull groupBy operation on tagname - - stream: compaction_stream - component: com.hurence.logisland.stream.spark.structured.StructuredStream - configuration: - read.topics.key.serializer: com.hurence.logisland.serializer.StringSerializer - read.topics.serializer: com.hurence.logisland.serializer.JsonSerializer - read.stream.service.provider: kafka_service_out - write.topics.serializer: com.hurence.logisland.serializer.JsonSerializer - write.stream.service.provider: kafka_service_out - groupby: tagname - chunk.size: 50 - state.timeout.ms: 30000 - - processorConfigurations: - -# - processor: debug_1 -# component: com.hurence.logisland.processor.DebugStream - # Make one chronix chunk from all records - - processor: timeseries_converter - component: com.hurence.logisland.processor.ConvertToTimeseries - configuration: - groupby: tagname - metric: avg;max;min;trend;sax:7,0.01,10 - diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/log4j.properties b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/log4j.properties deleted file mode 100644 index 754f1c2a8..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/log4j.properties +++ /dev/null @@ -1,65 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Set everything to be logged to the console -log4j.rootCategory=INFO, console -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - -# Set the default spark-shell log level to WARN. When running the spark-shell, the -# log level for this class is used to overwrite the root logger's log level, so that -# the user can have different defaults for the shell and regular Spark apps. -log4j.logger.org.apache.spark.repl.Main=WARN - -# Settings to quiet third party logs that are too verbose -log4j.logger.org.spark_project.jetty=WARN -log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR -log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO -log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO -log4j.logger.org.apache.parquet=ERROR -log4j.logger.parquet=ERROR - -log4j.logger.org.apache.spark=WARN -log4j.logger.org.apache.spark.scheduler=WARN -log4j.logger.org.apache.spark.history=WARN -log4j.logger.org.apache.spark.streaming=WARN -log4j.logger.org.spark-project.jetty=WARN -log4j.logger.org.eclipse.jetty.server=OFF -log4j.logger.org.apache.spark.deploy.yarn=DEBUG -log4j.logger.io.netty=WARN -log4j.logger.org.apache.hadoop.ipc.Client=WARN -log4j.logger.org.apache.hadoop=WARN -log4j.logger.org.apache.hadoop.ipc.ProtobufRpcEngine=WARN -log4j.logger.parquet.hadoop=WARN -log4j.logger.org.apache.kafka=ERROR -log4j.logger.kafka=WARN -log4j.logger.org.elasticsearch=WARN -log4j.logger.com.hurence=DEBUG -log4j.logger.org.apache.zookeeper=ERROR -log4j.logger.org.I0Itec.zkclient=ERROR -log4j.logger.org.apache.spark.sql.execution.streaming.state.StateStore=WARN -log4j.logger.org.apache.spark.ContextCleaner=WARN -log4j.additivity.kafka.server=false -log4j.additivity.kafka.consumer.ZookeeperConsumerConnector=false - - - -# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support -log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL -log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/logback.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/logback.xml deleted file mode 100644 index 1c1e3e91f..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/logisland-engine-spark_2_4plus_common/src/test/resources/logback.xml +++ /dev/null @@ -1,58 +0,0 @@ - - - - - - - - %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/pom.xml deleted file mode 100644 
index cd7474cd2..000000000 --- a/logisland-core/logisland-engines/logisland-engine-spark_2_4plus_kafka_2_4plus/pom.xml +++ /dev/null @@ -1,25 +0,0 @@ - - - 4.0.0 - - com.hurence.logisland - logisland-engines - 1.2.0 - - pom - - - 2.12 - 2.12.10 - 2.6.6 - - - logisland-engine-spark_2_4plus_kafka_2_4plus - - - logisland-engine-spark_2_4plus_common - logisland-engine-spark_2_4_kafka_2_4 - - diff --git a/logisland-core/logisland-engines/pom.xml b/logisland-core/logisland-engines/pom.xml index aafa5e9dd..10b200ad7 100644 --- a/logisland-core/logisland-engines/pom.xml +++ b/logisland-core/logisland-engines/pom.xml @@ -15,7 +15,6 @@ - logisland-engine-spark_2_X logisland-engine-spark_1_6 logisland-engine-vanilla From 94732be80fbc58530813825824f77f8d0bd84ba3 Mon Sep 17 00:00:00 2001 From: Mathieu Rossignol Date: Thu, 27 Feb 2020 17:35:41 +0100 Subject: [PATCH 26/43] Update user doc after full build --- .../user/components/common-processors.rst | 266 ++-- .../user/components/components.yaml | 17 + .../user/components/other-processors.rst | 1123 +++++++++++++++++ .../user/components/services.rst | 533 ++++++++ 4 files changed, 1780 insertions(+), 159 deletions(-) diff --git a/logisland-documentation/user/components/common-processors.rst b/logisland-documentation/user/components/common-processors.rst index 9751dd0f0..1786f153f 100644 --- a/logisland-documentation/user/components/common-processors.rst +++ b/logisland-documentation/user/components/common-processors.rst @@ -9,56 +9,7 @@ Find below the list. ---------- - -.. _com.hurence.logisland.processor.AddFields: - -AddFields ---------- -Add one or more field to records - -Module -______ -com.hurence.logisland:logisland-processor-common:1.2.0 - -Class -_____ -com.hurence.logisland.processor.AddFields - -Tags -____ -record, fields, Add - -Properties -__________ -In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values. - -.. csv-table:: allowable-values - :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL" - :widths: 20,60,30,20,10,10 - :escape: \ - - "conflict.resolution.policy", "What to do when a field with the same name already exists ?", "overwrite_existing (if field already exist), keep_only_old_field (keep only old field)", "keep_only_old_field", "false", "false" - -Dynamic Properties -__________________ -Dynamic Properties allow the user to specify both the name and value of a property. - -.. csv-table:: dynamic-properties - :header: "Name","Value","Description","Allowable Values","Default Value","EL" - :widths: 20,20,40,40,20,10 - :escape: \ - - "Name of the field to add", "Value of the field to add", "Add a field to the record with the specified value. Expression language can be used.You can not add a field that end with '.type' as this suffix is used to specify the type of fields to add", "", "null", **true** - "Name of the field to add with the suffix '.field.type'", "Type of the field to add", "Add a field to the record with the specified type. These properties are only used if a correspondant property without the suffix '.field.type' is already defined. 
If this property is not defined, default type for adding fields is String.You can only use Logisland predefined type fields.", "NULL, STRING, INT, LONG, ARRAY, FLOAT, DOUBLE, BYTES, RECORD, MAP, ENUM, BOOLEAN, UNION, DATETIME, OBJECT", "STRING", false - "Name of the field to add with the suffix '.field.name'", "Name of the field to add using expression language", "Add a field to the record with the specified name (which is evaluated using expression language). These properties are only used if a correspondant property without the suffix '.field.name' is already defined. If this property is not defined, the name of the field to add is the key of the first dynamic property (which is the main and only required dynamic property).", "", "null", **true** - -Extra informations -__________________ -.. include:: ./details/common-processors/AddFields-Detail.rst ----------- - -.. _com.hurence.logisland.processor.ApplyRegexp: - +.. _com.hurence.logisland.processor.alerting.ComputeTags: ComputeTags ----------- @@ -430,7 +381,7 @@ Dynamic Properties allow the user to specify both the name and value of a proper :escape: \ "Name of the field to add", "Value of the field to add", "Add a field to the record with the specified value. Expression language can be used.You can not add a field that end with '.type' as this suffix is used to specify the type of fields to add", "", "null", **true** - "Name of the field to add with the suffix '.field.type'", "Type of the field to add", "Add a field to the record with the specified type. These properties are only used if a correspondant property without the suffix '.field.type' is already defined. If this property is not defined, default type for adding fields is String.You can only use Logisland predefined type fields.", "NULL, STRING, INT, LONG, ARRAY, FLOAT, DOUBLE, BYTES, RECORD, MAP, ENUM, BOOLEAN, UNION, DATETIME", "STRING", false + "Name of the field to add with the suffix '.field.type'", "Type of the field to add", "Add a field to the record with the specified type. These properties are only used if a correspondant property without the suffix '.field.type' is already defined. If this property is not defined, default type for adding fields is String.You can only use Logisland predefined type fields.", "NULL, STRING, INT, LONG, ARRAY, FLOAT, DOUBLE, BYTES, RECORD, MAP, ENUM, BOOLEAN, UNION, DATETIME, OBJECT", "STRING", false "Name of the field to add with the suffix '.field.name'", "Name of the field to add using expression language", "Add a field to the record with the specified name (which is evaluated using expression language). These properties are only used if a correspondant property without the suffix '.field.name' is already defined. If this property is not defined, the name of the field to add is the key of the first dynamic property (which is the main and only required dynamic property).", "", "null", **true** Extra informations @@ -715,78 +666,7 @@ __________________ .. include:: ./details/common-processors/SelectDistinctRecords-Detail.rst ---------- -.. _com.hurence.logisland.processor.DecodeBase64: - -DecodeBase64 ------------- -Decodes fields to base64. The fields should be of type string - -Module -______ -com.hurence.logisland:logisland-processor-common:1.2.0 - -Class -_____ -com.hurence.logisland.processor.DecodeBase64 - -Tags -____ -decode, base64 - -Properties -__________ -In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. 
The table also indicates any default values. - -.. csv-table:: allowable-values - :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL" - :widths: 20,60,30,20,10,10 - :escape: \ - - "**source.fields**", "a comma separated list of fields corresponding to the fields to decode. Please note than the fields should be of type string", "", "null", "false", "false" - "**destination.fields**", "a comma separated list of fields corresponding to the decoded content according to the fields provided as input. Those fields will be of type bytes", "", "null", "false", "false" - -Extra informations -__________________ -.. include:: ./details/common-processors/DecodeBase64-Detail.rst ----------- - -.. _com.hurence.logisland.processor.EncodeBase64: - -EncodeBase64 ------------- -Encodes fields to base64. The fields should be of type array of bytes - -Module -______ -com.hurence.logisland:logisland-processor-common:1.2.0 - -Class -_____ -com.hurence.logisland.processor.EncodeBase64 - -Tags -____ -encode, base64 - -Properties -__________ -In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values. - -.. csv-table:: allowable-values - :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL" - :widths: 20,60,30,20,10,10 - :escape: \ - - "**source.fields**", "a comma separated list of fields corresponding to the fields to encode. Please note than the fields should be of type bytes", "", "null", "false", "false" - "**destination.fields**", "a comma separated list of fields corresponding to the encoded content according to the fields provided as input. Those fields will be of type string", "", "null", "false", "false" - -Extra informations -__________________ -.. include:: ./details/common-processors/EncodeBase64-Detail.rst ----------- - -.. _com.hurence.logisland.processor.EncryptField: - +.. _com.hurence.logisland.processor.EvaluateJsonPath: EvaluateJsonPath ---------------- @@ -1179,6 +1059,76 @@ __________________ .. include:: ./details/common-processors/ConvertSimpleDateFormatFields-Detail.rst ---------- +.. _com.hurence.logisland.processor.DecodeBase64: + +DecodeBase64 +------------ +Decodes fields to base64. The fields should be of type string + +Module +______ +com.hurence.logisland:logisland-processor-common:1.2.0 + +Class +_____ +com.hurence.logisland.processor.DecodeBase64 + +Tags +____ +decode, base64 + +Properties +__________ +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values. + +.. csv-table:: allowable-values + :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL" + :widths: 20,60,30,20,10,10 + :escape: \ + + "**source.fields**", "a comma separated list of fields corresponding to the fields to decode. Please note than the fields should be of type string", "", "null", "false", "false" + "**destination.fields**", "a comma separated list of fields corresponding to the decoded content according to the fields provided as input. Those fields will be of type bytes", "", "null", "false", "false" + +Extra informations +__________________ +.. include:: ./details/common-processors/DecodeBase64-Detail.rst +---------- + +.. _com.hurence.logisland.processor.EncodeBase64: + +EncodeBase64 +------------ +Encodes fields to base64. 
The fields should be of type array of bytes + +Module +______ +com.hurence.logisland:logisland-processor-common:1.2.0 + +Class +_____ +com.hurence.logisland.processor.EncodeBase64 + +Tags +____ +encode, base64 + +Properties +__________ +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values. + +.. csv-table:: allowable-values + :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL" + :widths: 20,60,30,20,10,10 + :escape: \ + + "**source.fields**", "a comma separated list of fields corresponding to the fields to encode. Please note than the fields should be of type bytes", "", "null", "false", "false" + "**destination.fields**", "a comma separated list of fields corresponding to the encoded content according to the fields provided as input. Those fields will be of type string", "", "null", "false", "false" + +Extra informations +__________________ +.. include:: ./details/common-processors/EncodeBase64-Detail.rst +---------- + .. _com.hurence.logisland.processor.EncryptField: EncryptField @@ -1479,11 +1429,11 @@ __________________ .. include:: ./details/common-processors/SetJsonAsFields-Detail.rst ---------- -.. _com.hurence.logisland.processor.alerting.CheckAlerts: +.. _com.hurence.logisland.processor.SplitRecord: -CheckAlerts +SplitRecord ----------- -Add one or more records representing alerts. Using a datastore. +This processor is used to create a new set of records from one record. Module ______ @@ -1491,11 +1441,11 @@ com.hurence.logisland:logisland-processor-common:1.2.0 Class _____ -com.hurence.logisland.processor.alerting.CheckAlerts +com.hurence.logisland.processor.SplitRecord Tags ____ -record, alerting, thresholds, opc, tag +None. Properties __________ @@ -1506,26 +1456,32 @@ In the list below, the names of required properties appear in **bold**. Any othe :widths: 20,60,30,20,10,10 :escape: \ - "max.cpu.time", "maximum CPU time in milliseconds allowed for script execution.", "", "100", "false", "false" - "max.memory", "maximum memory in Bytes which JS executor thread can allocate", "", "51200", "false", "false" - "allow.no.brace", "Force, to check if all blocks are enclosed with curly braces \"\"{}\"\". - - .. raw:: html + "keep.parent.record", "Specify if the parent record should exist", "", "false", "false", "false" + "keep.parent.record_time", "Specify whether to use the processing_time as record_time or not", "", "true", "false", "false" + "keep.parent.record_type", "Specify whether to use the dynamic property name as record_type or not", "", "false", "false", "false" -

      +Dynamic Properties +__________________ +Dynamic Properties allow the user to specify both the name and value of a property. - Explanation: all loops (for, do-while, while, and if-else, and functions +.. csv-table:: dynamic-properties + :header: "Name","Value","Description","Allowable Values","Default Value","EL" + :widths: 20,20,40,40,20,10 + :escape: \ - should use braces, because poison_pill() function will be inserted after + "new record name", "fields to have", "the new record", "", "null", **true** - each open brace \"\"{\"\", to ensure interruption checking. Otherwise simple +Extra informations +__________________ +No additional information is provided +---------- -.. _com.hurence.logisland.processor.SplitRecord: +.. _com.hurence.logisland.processor.alerting.CheckAlerts: -SplitRecord +CheckAlerts ----------- -This processor is used to create a new set of records from one record. +Add one or more records representing alerts. Using a datastore. Module ______ @@ -1533,11 +1489,11 @@ com.hurence.logisland:logisland-processor-common:1.2.0 Class _____ -com.hurence.logisland.processor.SplitRecord +com.hurence.logisland.processor.alerting.CheckAlerts Tags ____ -None. +record, alerting, thresholds, opc, tag Properties __________ @@ -1548,29 +1504,21 @@ In the list below, the names of required properties appear in **bold**. Any othe :widths: 20,60,30,20,10,10 :escape: \ - "keep.parent.record", "Specify if the parent record should exist", "", "false", "false", "false" - "keep.parent.record_time", "Specify whether to use the processing_time as record_time or not", "", "true", "false", "false" - "keep.parent.record_type", "Specify whether to use the dynamic property name as record_type or not", "", "false", "false", "false" - -Dynamic Properties -__________________ -Dynamic Properties allow the user to specify both the name and value of a property. + "max.cpu.time", "maximum CPU time in milliseconds allowed for script execution.", "", "100", "false", "false" + "max.memory", "maximum memory in Bytes which JS executor thread can allocate", "", "51200", "false", "false" + "allow.no.brace", "Force, to check if all blocks are enclosed with curly braces \"\"{}\"\". -.. csv-table:: dynamic-properties - :header: "Name","Value","Description","Allowable Values","Default Value","EL" - :widths: 20,20,40,40,20,10 - :escape: \ + .. raw:: html - "new record name", "fields to have", "the new record", "", "null", **true** +

      -Extra informations -__________________ -No additional information is provided + Explanation: all loops (for, do-while, while, and if-else, and functions ----------- + should use braces, because poison_pill() function will be inserted after -.. _com.hurence.logisland.processor.SplitText: + each open brace \"\"{\"\", to ensure interruption checking. Otherwise simple + code like:

       
      diff --git a/logisland-documentation/user/components/components.yaml b/logisland-documentation/user/components/components.yaml
      index 172f6471a..4eba7020e 100644
      --- a/logisland-documentation/user/components/components.yaml
      +++ b/logisland-documentation/user/components/components.yaml
      @@ -179,6 +179,18 @@ extensions:
           module: com.hurence.logisland:logisland-processor-common:1.2.0
           class: com.hurence.logisland.processor.ConvertSimpleDateFormatFields
           tags: [record, fields, add, date, conversion, convert]
      +  - name: DecodeBase64
      +    description: Decodes fields to base64. The fields should be of type string
      +    category: misc
      +    module: com.hurence.logisland:logisland-processor-common:1.2.0
      +    class: com.hurence.logisland.processor.DecodeBase64
      +    tags: [decode, base64]
      +  - name: EncodeBase64
      +    description: Encodes fields to base64. The fields should be of type array of bytes
      +    category: misc
      +    module: com.hurence.logisland:logisland-processor-common:1.2.0
      +    class: com.hurence.logisland.processor.EncodeBase64
      +    tags: [encode, base64]
         - name: EncryptField
           description: This is a processor that is used to encrypt or decrypt one or many fields of any type of a given Record mapping
           category: misc
      @@ -231,6 +243,11 @@ extensions:
           module: com.hurence.logisland:logisland-processor-common:1.2.0
           class: com.hurence.logisland.processor.SetJsonAsFields
           tags: [json]
      +  - name: SplitRecord
      +    description: This processor is used to create a new set of records from one record.
      +    category: misc
      +    module: com.hurence.logisland:logisland-processor-common:1.2.0
      +    class: com.hurence.logisland.processor.SplitRecord
         - name: CheckAlerts
           description: Add one or more records representing alerts. Using a datastore.
           category: alerting
      diff --git a/logisland-documentation/user/components/other-processors.rst b/logisland-documentation/user/components/other-processors.rst
      index 0db518d83..1e0671e4a 100644
      --- a/logisland-documentation/user/components/other-processors.rst
      +++ b/logisland-documentation/user/components/other-processors.rst
      @@ -2243,6 +2243,1129 @@ Dynamic Properties
       __________________
       Dynamic Properties allow the user to specify both the name and value of a property.
       
      +.. csv-table:: dynamic-properties
      +   :header: "Name","Value","Description","Allowable Values","Default Value","EL"
      +   :widths: 20,20,40,40,20,10
      +   :escape: \
      +
      +   "fields to decode", "a default value", "Decode one or more fields from the record ", "", "null", false
      +
      +Extra informations
      +__________________
      +.. include:: ./details/URLDecoder-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.useragent.ParseUserAgent: 
      +
      +ParseUserAgent
      +--------------
+The user-agent processor decomposes the User-Agent value from an HTTP header into several attributes of interest. There is no standard format for User-Agent strings, so they cannot easily be handled with regular expressions. This processor relies on the `YAUAA library `_ to do the heavy work.
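+
+As an illustration, the sketch below shows how this processor could be declared in a Logisland YAML job (a hypothetical excerpt; the ``user_agent`` field name is an assumption, the property names come from the table below):
+
+.. code-block:: yaml
+
+    # Hypothetical example: decompose the raw User-Agent string carried by the 'user_agent' field
+    - processor: parse_user_agent
+      component: com.hurence.logisland.processor.useragent.ParseUserAgent
+      configuration:
+        useragent.field: user_agent
+        useragent.keep: true
+        fields: DeviceClass, OperatingSystemName, AgentName, AgentVersion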
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-useragent:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.useragent.ParseUserAgent
      +
      +Tags
      +____
      +User-Agent, clickstream, DMP
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "debug", "Enable debug.", "", "false", "false", "false"
      +   "cache.enabled", "Enable caching. Caching to avoid to redo the same computation for many identical User-Agent strings.", "", "true", "false", "false"
      +   "cache.size", "Set the size of the cache.", "", "1000", "false", "false"
      +   "**useragent.field**", "Must contain the name of the field that contains the User-Agent value in the incoming record.", "", "null", "false", "false"
      +   "useragent.keep", "Defines if the field that contained the User-Agent must be kept or not in the resulting records.", "", "true", "false", "false"
      +   "confidence.enabled", "Enable confidence reporting. Each field will report a confidence attribute with a value comprised between 0 and 10000.", "", "false", "false", "false"
      +   "ambiguity.enabled", "Enable ambiguity reporting. Reports a count of ambiguities.", "", "false", "false", "false"
      +   "fields", "Defines the fields to be returned.", "", "DeviceClass, DeviceName, DeviceBrand, DeviceCpu, DeviceFirmwareVersion, DeviceVersion, OperatingSystemClass, OperatingSystemName, OperatingSystemVersion, OperatingSystemNameVersion, OperatingSystemVersionBuild, LayoutEngineClass, LayoutEngineName, LayoutEngineVersion, LayoutEngineVersionMajor, LayoutEngineNameVersion, LayoutEngineNameVersionMajor, LayoutEngineBuild, AgentClass, AgentName, AgentVersion, AgentVersionMajor, AgentNameVersion, AgentNameVersionMajor, AgentBuild, AgentLanguage, AgentLanguageCode, AgentInformationEmail, AgentInformationUrl, AgentSecurity, AgentUuid, FacebookCarrier, FacebookDeviceClass, FacebookDeviceName, FacebookDeviceVersion, FacebookFBOP, FacebookFBSS, FacebookOperatingSystemName, FacebookOperatingSystemVersion, Anonymized, HackerAttackVector, HackerToolkit, KoboAffiliate, KoboPlatformId, IECompatibilityVersion, IECompatibilityVersionMajor, IECompatibilityNameVersion, IECompatibilityNameVersionMajor, __SyntaxError__, Carrier, GSAInstallationID, WebviewAppName, WebviewAppNameVersionMajor, WebviewAppVersion, WebviewAppVersionMajor", "false", "false"
      +
      +Extra informations
      +__________________
      +.. include:: ./details/ParseUserAgent-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.webAnalytics.IncrementalWebSession: 
      +
      +IncrementalWebSession
      +---------------------
      +This processor creates and updates web-sessions based on incoming web-events. Note that both web-sessions and web-events are stored in elasticsearch.
      + Firstly, web-events are grouped by their session identifier and processed in chronological order.
      + Then each web-session associated to each group is retrieved from elasticsearch.
+ If none exists yet, a new web session is created based on the first web event.
+ The following fields of the newly created web session are set based on the associated web event: session identifier, first timestamp, first visited page. Secondly, once created or retrieved, the web session is updated by the remaining web-events.
+ Updates impact fields of the web session such as the event counter, last visited page, session duration, ...
      + Before updates are actually applied, checks are performed to detect rules that would trigger the creation of a new session:
      +
      +	the duration between the web session and the web event must not exceed the specified time-out,
      +	the web session and the web event must have timestamps within the same day (at midnight a new web session is created),
      +	source of traffic (campaign, ...) must be the same on the web session and the web event.
      +
+ When a breaking rule is detected, a new web session is created with a new session identifier, whereas the remaining web-events still have the original session identifier. The new session identifier is the original session identifier suffixed with the character '#' followed by an incremented counter. This new session identifier is also set on the remaining web-events.
+ Finally, when all web events have been applied, all web events (potentially modified with a new session identifier) are saved in elasticsearch, and the web sessions are passed to the next processor.
      +
+WebSession information includes:
      +- first and last visited page
      +- first and last timestamp of processed event 
      +- total number of processed events
      +- the userId
      +- a boolean denoting if the web-session is still active or not
      +- an integer denoting the duration of the web-sessions
      +- optional fields that may be retrieved from the processed events
      +
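+The snippet below sketches a possible declaration of this processor in a Logisland YAML job (hypothetical; the index names, type names and the ``elasticsearch_service`` controller service identifier are assumptions):
+
+.. code-block:: yaml
+
+    # Hypothetical example: build web sessions from web events stored in Elasticsearch
+    - processor: incremental_web_session
+      component: com.hurence.logisland.processor.webAnalytics.IncrementalWebSession
+      configuration:
+        es.session.index.field: es_index
+        es.session.type.name: sessions
+        es.event.index.prefix: web_events
+        es.event.type.name: event
+        es.mapping.event.to.session.index.name: web_event_to_session_mapping
+        elasticsearch.client.service: elasticsearch_service
+        session.timeout: 1800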
      +
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-web-analytics:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.webAnalytics.IncrementalWebSession
      +
      +Tags
      +____
      +analytics, web, session
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "debug", "Enable debug. If enabled, debug information are logged.", "", "false", "false", "false"
      +   "**es.session.index.field**", "Name of the field in the record defining the ES index containing the web session documents.", "", "null", "false", "false"
      +   "**es.session.type.name**", "Name of the ES type of web session documents.", "", "null", "false", "false"
      +   "**es.event.index.prefix**", "Prefix of the index containing the web event documents.", "", "null", "false", "false"
      +   "**es.event.type.name**", "Name of the ES type of web event documents.", "", "null", "false", "false"
      +   "**es.mapping.event.to.session.index.name**", "Name of the ES index containing the mapping of web session documents.", "", "null", "false", "false"
      +   "sessionid.field", "the name of the field containing the session id => will override default value if set", "", "sessionId", "false", "false"
      +   "timestamp.field", "the name of the field containing the timestamp => will override default value if set", "", "h2kTimestamp", "false", "false"
      +   "visitedpage.field", "the name of the field containing the visited page => will override default value if set", "", "location", "false", "false"
      +   "userid.field", "the name of the field containing the userId => will override default value if set", "", "userId", "false", "false"
      +   "fields.to.return", "the list of fields to return", "", "null", "false", "false"
      +   "firstVisitedPage.out.field", "the name of the field containing the first visited page => will override default value if set", "", "firstVisitedPage", "false", "false"
      +   "lastVisitedPage.out.field", "the name of the field containing the last visited page => will override default value if set", "", "lastVisitedPage", "false", "false"
      +   "isSessionActive.out.field", "the name of the field stating whether the session is active or not => will override default value if set", "", "is_sessionActive", "false", "false"
      +   "sessionDuration.out.field", "the name of the field containing the session duration => will override default value if set", "", "sessionDuration", "false", "false"
      +   "sessionInactivityDuration.out.field", "the name of the field containing the session inactivity duration => will override default value if set", "", "sessionInactivityDuration", "false", "false"
      +   "session.timeout", "session timeout in sec", "", "1800", "false", "false"
      +   "eventsCounter.out.field", "the name of the field containing the session duration => will override default value if set", "", "eventsCounter", "false", "false"
      +   "firstEventDateTime.out.field", "the name of the field containing the date of the first event => will override default value if set", "", "firstEventDateTime", "false", "false"
      +   "lastEventDateTime.out.field", "the name of the field containing the date of the last event => will override default value if set", "", "lastEventDateTime", "false", "false"
      +   "newSessionReason.out.field", "the name of the field containing the reason why a new session was created => will override default value if set", "", "reasonForNewSession", "false", "false"
      +   "transactionIds.out.field", "the name of the field containing all transactionIds => will override default value if set", "", "transactionIds", "false", "false"
      +   "source_of_traffic.suffix", "Prefix for the source of the traffic related fields", "", "source_of_traffic", "false", "false"
      +   "**elasticsearch.client.service**", "The instance of the Controller Service to use for accessing Elasticsearch.", "", "null", "false", "false"
      +
      +Extra informations
      +__________________
      +.. include:: ./details/IncrementalWebSession-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.webAnalytics.SetSourceOfTraffic: 
      +
      +SetSourceOfTraffic
      +------------------
      +Compute the source of traffic of a web session. Users arrive at a website or application through a variety of sources, 
      +including advertising/paying campaigns, search engines, social networks, referring sites or direct access. 
      +When analysing user experience on a webshop, it is crucial to collect, process, and report the campaign and traffic-source data. 
+To compute the source of traffic of a web session, the user has to provide the utm_* related properties if available
+(i.e. **utm_source.field**, **utm_medium.field**, **utm_campaign.field**, **utm_content.field**, **utm_term.field**),
+the referer (**referer.field** property) and the first visited page of the session (**first.visited.page.field** property).
+By default the source of traffic information is placed in a flat structure (specified by the **source_of_traffic.suffix** property
      +with a default value of source_of_traffic). To work properly the SetSourceOfTraffic processor needs to have access to an 
      +Elasticsearch index containing a list of the most popular search engines and social networks. The ES index (specified by the **es.index** property) should be structured such that the _id of an ES document MUST be the name of the domain. If the domain is a search engine, the related ES doc MUST have a boolean field (default being search_engine) specified by the property **es.search_engine.field** with a value set to true. If the domain is a social network , the related ES doc MUST have a boolean field (default being social_network) specified by the property **es.social_network.field** with a value set to true. 
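+
+A possible declaration in a Logisland YAML job could look like the following sketch (hypothetical; the service identifiers and the ``search_engines_and_socials`` index name are assumptions):
+
+.. code-block:: yaml
+
+    # Hypothetical example: tag each session with its source of traffic
+    - processor: set_source_of_traffic
+      component: com.hurence.logisland.processor.webAnalytics.SetSourceOfTraffic
+      configuration:
+        elasticsearch.client.service: elasticsearch_service
+        cache.service: lru_cache_service
+        es.index: search_engines_and_socials
+        referer.field: referer
+        first.visited.page.field: firstVisitedPage
+        source_of_traffic.suffix: source_of_traffic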
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-web-analytics:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.webAnalytics.SetSourceOfTraffic
      +
      +Tags
      +____
      +session, traffic, source, web, analytics
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "referer.field", "Name of the field containing the referer value in the session", "", "referer", "false", "false"
      +   "first.visited.page.field", "Name of the field containing the first visited page in the session", "", "firstVisitedPage", "false", "false"
      +   "utm_source.field", "Name of the field containing the utm_source value in the session", "", "utm_source", "false", "false"
      +   "utm_medium.field", "Name of the field containing the utm_medium value in the session", "", "utm_medium", "false", "false"
      +   "utm_campaign.field", "Name of the field containing the utm_campaign value in the session", "", "utm_campaign", "false", "false"
      +   "utm_content.field", "Name of the field containing the utm_content value in the session", "", "utm_content", "false", "false"
      +   "utm_term.field", "Name of the field containing the utm_term value in the session", "", "utm_term", "false", "false"
      +   "source_of_traffic.suffix", "Suffix for the source of the traffic related fields", "", "source_of_traffic", "false", "false"
      +   "source_of_traffic.hierarchical", "Should the additional source of trafic information fields be added under a hierarchical father field or not.", "", "false", "false", "false"
      +   "**elasticsearch.client.service**", "The instance of the Controller Service to use for accessing Elasticsearch.", "", "null", "false", "false"
      +   "**cache.service**", "Name of the cache service to use.", "", "null", "false", "false"
      +   "cache.validity.timeout", "Timeout validity (in seconds) of an entry in the cache.", "", "0", "false", "false"
      +   "debug", "If true, an additional debug field is added. If the source info fields prefix is X, a debug field named X_from_cache contains a boolean value to indicate the origin of the source fields. The default value for this property is false (debug is disabled).", "", "false", "false", "false"
      +   "**es.index**", "Name of the ES index containing the list of search engines and social network. ", "", "null", "false", "false"
      +   "es.type", "Name of the ES type to use.", "", "default", "false", "false"
      +   "es.search_engine.field", "Name of the ES field used to specify that the domain is a search engine.", "", "search_engine", "false", "false"
      +   "es.social_network.field", "Name of the ES field used to specify that the domain is a social network.", "", "social_network", "false", "false"
      +
      +Extra informations
      +__________________
      +.. include:: ./details/SetSourceOfTraffic-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.enrichment.IpToFqdn: 
      +
      +IpToFqdn
      +--------
+Translates an IP address into a FQDN (Fully Qualified Domain Name). An input field from the record has the IP as value. A new field is created and its value is the FQDN matching the IP address. The resolution mechanism is based on the underlying operating system. The resolution request may take some time, especially if the IP address cannot be translated into a FQDN. For these reasons this processor relies on the logisland cache service so that, whether a resolution succeeds or not, the result is put into the cache. That way, the real request for the same IP is not re-triggered during a certain period of time, until the cache entry expires. This timeout is configurable but by default a request for the same IP is not triggered before 24 hours, to give the underlying DNS system time to be updated.
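+
+As an illustration, a hypothetical Logisland YAML job could declare the processor as sketched below (the ``src_ip`` / ``src_fqdn`` field names and the cache service identifier are assumptions):
+
+.. code-block:: yaml
+
+    # Hypothetical example: resolve the FQDN of the IP carried by 'src_ip'
+    - processor: ip_to_fqdn
+      component: com.hurence.logisland.processor.enrichment.IpToFqdn
+      configuration:
+        ip.address.field: src_ip
+        fqdn.field: src_fqdn
+        cache.service: lru_cache_service
+        resolution.timeout: 1000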
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-enrichment:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.enrichment.IpToFqdn
      +
      +Tags
      +____
      +dns, ip, fqdn, domain, address, fqhn, reverse, resolution, enrich
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**ip.address.field**", "The name of the field containing the ip address to use.", "", "null", "false", "false"
      +   "**fqdn.field**", "The field that will contain the full qualified domain name corresponding to the ip address.", "", "null", "false", "false"
      +   "overwrite.fqdn.field", "If the field should be overwritten when it already exists.", "", "false", "false", "false"
      +   "**cache.service**", "The name of the cache service to use.", "", "null", "false", "false"
      +   "cache.max.time", "The amount of time, in seconds, for which a cached FQDN value is valid in the cache service. After this delay, the next new request to translate the same IP into FQDN will trigger a new reverse DNS request and the result will overwrite the entry in the cache. This allows two things: if the IP was not resolved into a FQDN, this will get a chance to obtain a FQDN if the DNS system has been updated, if the IP is resolved into a FQDN, this will allow to be more accurate if the DNS system has been updated.  A value of 0 seconds disables this expiration mechanism. The default value is 84600 seconds, which corresponds to new requests triggered every day if a record with the same IP passes every day in the processor.", "", "84600", "false", "false"
      +   "resolution.timeout", "The amount of time, in milliseconds, to wait at most for the resolution to occur. This avoids to block the stream for too much time. Default value is 1000ms. If the delay expires and no resolution could occur before, the FQDN field is not created. A special value of 0 disables the logisland timeout and the resolution request may last for many seconds if the IP cannot be translated into a FQDN by the underlying operating system. In any case, whether the timeout occurs in logisland of in the operating system, the fact that a timeout occurs is kept in the cache system so that a resolution request for the same IP will not occur before the cache entry expires.", "", "1000", "false", "false"
      +   "debug", "If true, some additional debug fields are added. If the FQDN field is named X, a debug field named X_os_resolution_time_ms contains the resolution time in ms (using the operating system, not the cache). This field is added whether the resolution occurs or time is out. A debug field named  X_os_resolution_timeout contains a boolean value to indicate if the timeout occurred. Finally, a debug field named X_from_cache contains a boolean value to indicate the origin of the FQDN field. The default value for this property is false (debug is disabled.", "", "false", "false", "false"
      +
      +Extra informations
      +__________________
      +.. include:: ./details/IpToFqdn-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.enrichment.IpToGeo: 
      +
      +IpToGeo
      +-------
+Looks up geolocation information for an IP address. The attribute that contains the IP address to lookup must be provided in the **ip.address.field** property. By default, the geo information is put in a hierarchical structure. That is, if the name of the IP field is 'X', then the geo attributes added by enrichment are added under a father field named X_geo. "_geo" is the default hierarchical suffix that may be changed with the **geo.hierarchical.suffix** property. If one wants to put the geo fields at the same level as the IP field, then the **geo.hierarchical** property should be set to false and then the geo attributes are created at the same level with the naming pattern X_geo_. "_geo_" is the default flat suffix but this may be changed with the **geo.flat.suffix** property. The IpToGeo processor requires a reference to an Ip to Geo service. This must be defined in the **iptogeo.service** property. The added geo fields are dependent on the underlying Ip to Geo service. The **geo.fields** property must contain the list of geo fields that should be created if data is available for the IP to resolve. This property defaults to "*" which means to add every available field. If one only wants a subset of the fields, one must define a comma separated list of fields as a value for the **geo.fields** property. The list of the available geo fields is in the description of the **geo.fields** property.
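+
+A hypothetical YAML declaration is sketched below (field names and service identifiers are assumptions; ``geo.fields`` is restricted to a subset of the available fields listed in the table below):
+
+.. code-block:: yaml
+
+    # Hypothetical example: enrich records with geolocation data for 'src_ip'
+    - processor: ip_to_geo
+      component: com.hurence.logisland.processor.enrichment.IpToGeo
+      configuration:
+        ip.address.field: src_ip
+        iptogeo.service: maxmind_service
+        cache.service: lru_cache_service
+        geo.fields: city,country,latitude,longitude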
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-enrichment:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.enrichment.IpToGeo
      +
      +Tags
      +____
      +geo, enrich, ip
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**ip.address.field**", "The name of the field containing the ip address to use.", "", "null", "false", "false"
      +   "**iptogeo.service**", "The reference to the IP to Geo service to use.", "", "null", "false", "false"
      +   "geo.fields", "Comma separated list of geo information fields to add to the record. Defaults to '*', which means to include all available fields. If a list of fields is specified and the data is not available, the geo field is not created. The geo fields are dependant on the underlying defined Ip to Geo service. The currently only supported type of Ip to Geo service is the Maxmind Ip to Geo service. This means that the currently supported list of geo fields is the following:**continent**: the identified continent for this IP address. **continent_code**: the identified continent code for this IP address. **city**: the identified city for this IP address. **latitude**: the identified latitude for this IP address. **longitude**: the identified longitude for this IP address. **location**: the identified location for this IP address, defined as Geo-point expressed as a string with the format: 'latitude,longitude'. **accuracy_radius**: the approximate accuracy radius, in kilometers, around the latitude and longitude for the location. **time_zone**: the identified time zone for this IP address. **subdivision_N**: the identified subdivision for this IP address. N is a one-up number at the end of the attribute name, starting with 0. **subdivision_isocode_N**: the iso code matching the identified subdivision_N. **country**: the identified country for this IP address. **country_isocode**: the iso code for the identified country for this IP address. **postalcode**: the identified postal code for this IP address. **lookup_micros**: the number of microseconds that the geo lookup took. The Ip to Geo service must have the lookup_micros property enabled in order to have this field available.", "", "*", "false", "false"
      +   "geo.hierarchical", "Should the additional geo information fields be added under a hierarchical father field or not.", "", "true", "false", "false"
      +   "geo.hierarchical.suffix", "Suffix to use for the field holding geo information. If geo.hierarchical is true, then use this suffix appended to the IP field name to define the father field name. This may be used for instance to distinguish between geo fields with various locales using many Ip to Geo service instances.", "", "_geo", "false", "false"
      +   "geo.flat.suffix", "Suffix to use for geo information fields when they are flat. If geo.hierarchical is false, then use this suffix appended to the IP field name but before the geo field name. This may be used for instance to distinguish between geo fields with various locales using many Ip to Geo service instances.", "", "_geo_", "false", "false"
      +   "**cache.service**", "The name of the cache service to use.", "", "null", "false", "false"
      +   "debug", "If true, an additional debug field is added. If the geo info fields prefix is X, a debug field named X_from_cache contains a boolean value to indicate the origin of the geo fields. The default value for this property is false (debug is disabled).", "", "false", "false", "false"
      +
      +Extra informations
      +__________________
      +.. include:: ./details/IpToGeo-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.networkpacket.ParseNetworkPacket: 
      +
      +ParseNetworkPacket
      +------------------
      +The ParseNetworkPacket processor is the LogIsland entry point to parse network packets captured either off-the-wire (stream mode) or in pcap format (batch mode).  In batch mode, the processor decodes the bytes of the incoming pcap record, where a Global header followed by a sequence of [packet header, packet data] pairs are stored. Then, each incoming pcap event is parsed into n packet records. The fields of packet headers are then extracted and made available in dedicated record fields. See the `Capturing Network packets tutorial `_ for an example of usage of this processor.
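+
+A minimal declaration sketch for stream mode (hypothetical YAML job excerpt):
+
+.. code-block:: yaml
+
+    # Hypothetical example: parse packets captured off-the-wire (no pcap headers)
+    - processor: parse_network_packet
+      component: com.hurence.logisland.processor.networkpacket.ParseNetworkPacket
+      configuration:
+        flow.mode: stream
+        debug: false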
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-cyber-security:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.networkpacket.ParseNetworkPacket
      +
      +Tags
      +____
      +PCap, security, IDS, NIDS
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "debug", "Enable debug.", "", "false", "false", "false"
      +   "**flow.mode**", "Flow Mode. Indicate whether packets are provided in batch mode (via pcap files) or in stream mode (without headers). Allowed values are batch and stream.", "batch, stream", "null", "false", "false"
      +
      +Extra informations
      +__________________
      +No additional information is provided
      +
      +----------
      +
      +.. _com.hurence.logisland.processor.elasticsearch.BulkAddElasticsearch: 
      +
      +BulkAddElasticsearch
      +--------------------
      +Indexes the content of a Record in Elasticsearch using elasticsearch's bulk processor
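+
+The sketch below shows one way this processor could be configured in a Logisland YAML job (hypothetical; the ``elasticsearch_service`` identifier and the index/type names are assumptions):
+
+.. code-block:: yaml
+
+    # Hypothetical example: bulk index records into a daily index
+    - processor: bulk_add_elasticsearch
+      component: com.hurence.logisland.processor.elasticsearch.BulkAddElasticsearch
+      configuration:
+        elasticsearch.client.service: elasticsearch_service
+        default.index: logisland
+        default.type: event
+        timebased.index: today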
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-elasticsearch:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.elasticsearch.BulkAddElasticsearch
      +
      +Tags
      +____
      +elasticsearch
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values, and whether a property supports the  `Expression Language `_ .
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**elasticsearch.client.service**", "The instance of the Controller Service to use for accessing Elasticsearch.", "", "null", "false", "false"
      +   "**default.index**", "The name of the index to insert into", "", "null", "false", "**true**"
      +   "**default.type**", "The type of this document (used by Elasticsearch for indexing and searching)", "", "null", "false", "**true**"
      +   "**timebased.index**", "do we add a date suffix", "no (no date added to default index), today (today's date added to default index), yesterday (yesterday's date added to default index)", "no", "false", "false"
      +   "es.index.field", "the name of the event field containing es index name => will override index value if set", "", "null", "false", "false"
      +   "es.type.field", "the name of the event field containing es doc type => will override type value if set", "", "null", "false", "false"
      +
      +Extra informations
      +__________________
      +.. include:: ./details/BulkAddElasticsearch-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.hbase.FetchHBaseRow: 
      +
      +FetchHBaseRow
      +-------------
+Fetches a row from an HBase table. The Destination property controls whether the cells are added as flow file attributes, or the row is written to the flow file content as JSON. This processor may be used to fetch a fixed row on an interval by specifying the table and row id directly in the processor, or it may be used to dynamically fetch rows by referencing the table and row id from incoming flow files.
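+
+A possible declaration is sketched below (hypothetical YAML job excerpt; the service identifier and the incoming field names are assumptions):
+
+.. code-block:: yaml
+
+    # Hypothetical example: fetch rows whose table and row id come from the incoming records
+    - processor: fetch_hbase_row
+      component: com.hurence.logisland.processor.hbase.FetchHBaseRow
+      configuration:
+        hbase.client.service: hbase_service
+        table.name.field: table_name
+        row.identifier.field: row_id
+        record.serializer: com.hurence.logisland.serializer.JsonSerializer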
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-hbase:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.hbase.FetchHBaseRow
      +
      +Tags
      +____
      +hbase, scan, fetch, get, enrich
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values, and whether a property supports the  `Expression Language `_ .
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**hbase.client.service**", "The instance of the Controller Service to use for accessing HBase.", "", "null", "false", "false"
      +   "**table.name.field**", "The field containing the name of the HBase Table to fetch from.", "", "null", "false", "**true**"
      +   "**row.identifier.field**", "The field containing the identifier of the row to fetch.", "", "null", "false", "**true**"
      +   "columns.field", "The field containing an optional comma-separated list of \"\":\"\" pairs to fetch. To return all columns for a given family, leave off the qualifier such as \"\",\"\".", "", "null", "false", "**true**"
      +   "record.serializer", "the serializer needed to i/o the record in the HBase row", "com.hurence.logisland.serializer.KryoSerializer (serialize events as json blocs), com.hurence.logisland.serializer.JsonSerializer (serialize events as json blocs), com.hurence.logisland.serializer.AvroSerializer (serialize events as avro blocs), none (send events as bytes)", "com.hurence.logisland.serializer.KryoSerializer", "false", "false"
      +   "record.schema", "the avro schema definition for the Avro serialization", "", "null", "false", "false"
      +   "table.name.default", "The table to use if table name field is not set", "", "null", "false", "false"
      +
      +Extra informations
      +__________________
      +.. include:: ./details/FetchHBaseRow-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.elasticsearch.MultiGetElasticsearch: 
      +
      +MultiGetElasticsearch
      +---------------------
      +Retrieves a content indexed in elasticsearch using elasticsearch multiget queries.
      +Each incoming record contains information regarding the elasticsearch multiget query that will be performed. This information is stored in record fields whose names are configured in the plugin properties (see below) :
      +
      + - index (String) : name of the elasticsearch index on which the multiget query will be performed. This field is mandatory and should not be empty, otherwise an error output record is sent for this specific incoming record.
      + - type (String) : name of the elasticsearch type on which the multiget query will be performed. This field is not mandatory.
      + - ids (String) : comma separated list of document ids to fetch. This field is mandatory and should not be empty, otherwise an error output record is sent for this specific incoming record.
      + - includes (String) : comma separated list of patterns to filter in (include) fields to retrieve. Supports wildcards. This field is not mandatory.
      + - excludes (String) : comma separated list of patterns to filter out (exclude) fields to retrieve. Supports wildcards. This field is not mandatory.
      +
+Each outgoing record holds the data of one retrieved elasticsearch document. This data is stored in these fields:
      +
      + - index (same field name as the incoming record) : name of the elasticsearch index.
      + - type (same field name as the incoming record) : name of the elasticsearch type.
      + - id (same field name as the incoming record) : retrieved document id.
      + - a list of String fields containing :
      +
      +   * field name : the retrieved field name
      +   * field value : the retrieved field value
      +
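+A declaration sketch follows (hypothetical YAML job excerpt; the service identifier and the names of the incoming record fields are assumptions):
+
+.. code-block:: yaml
+
+    # Hypothetical example: perform multiget queries described by the incoming records
+    - processor: multiget_elasticsearch
+      component: com.hurence.logisland.processor.elasticsearch.MultiGetElasticsearch
+      configuration:
+        elasticsearch.client.service: elasticsearch_service
+        es.index.field: es_index
+        es.type.field: es_type
+        es.ids.field: es_document_ids
+        es.includes.field: es_includes
+        es.excludes.field: es_excludes
+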
      +Module
      +______
      +com.hurence.logisland:logisland-processor-elasticsearch:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.elasticsearch.MultiGetElasticsearch
      +
      +Tags
      +____
      +elasticsearch
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**elasticsearch.client.service**", "The instance of the Controller Service to use for accessing Elasticsearch.", "", "null", "false", "false"
      +   "**es.index.field**", "the name of the incoming records field containing es index name to use in multiget query. ", "", "null", "false", "false"
      +   "**es.type.field**", "the name of the incoming records field containing es type name to use in multiget query", "", "null", "false", "false"
      +   "**es.ids.field**", "the name of the incoming records field containing es document Ids to use in multiget query", "", "null", "false", "false"
      +   "**es.includes.field**", "the name of the incoming records field containing es includes to use in multiget query", "", "null", "false", "false"
      +   "**es.excludes.field**", "the name of the incoming records field containing es excludes to use in multiget query", "", "null", "false", "false"
      +
      +Extra informations
      +__________________
      +.. include:: ./details/MultiGetElasticsearch-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.hbase.PutHBaseCell: 
      +
      +PutHBaseCell
      +------------
      +Adds the Contents of a Record to HBase as the value of a single cell
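+
+The snippet below is a hypothetical configuration sketch (the service identifier and the incoming field names are assumptions):
+
+.. code-block:: yaml
+
+    # Hypothetical example: write each record value into a single HBase cell
+    - processor: put_hbase_cell
+      component: com.hurence.logisland.processor.hbase.PutHBaseCell
+      configuration:
+        hbase.client.service: hbase_service
+        table.name.field: table_name
+        column.family.field: column_family
+        column.qualifier.field: column_qualifier
+        batch.size: 25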
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-hbase:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.hbase.PutHBaseCell
      +
      +Tags
      +____
      +hadoop, hbase
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values, and whether a property supports the  `Expression Language `_ .
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**hbase.client.service**", "The instance of the Controller Service to use for accessing HBase.", "", "null", "false", "false"
      +   "**table.name.field**", "The field containing the name of the HBase Table to put data into", "", "null", "false", "**true**"
      +   "row.identifier.field", "Specifies  field containing the Row ID to use when inserting data into HBase", "", "null", "false", "**true**"
      +   "row.identifier.encoding.strategy", "Specifies the data type of Row ID used when inserting data into HBase. The default behavior is to convert the row id to a UTF-8 byte array. Choosing Binary will convert a binary formatted string to the correct byte[] representation. The Binary option should be used if you are using Binary row keys in HBase", "String (Stores the value of row id as a UTF-8 String.), Binary (Stores the value of the rows id as a binary byte array. It expects that the row id is a binary formatted string.)", "String", "false", "false"
      +   "**column.family.field**", "The field containing the  Column Family to use when inserting data into HBase", "", "null", "false", "**true**"
      +   "**column.qualifier.field**", "The field containing the  Column Qualifier to use when inserting data into HBase", "", "null", "false", "**true**"
      +   "**batch.size**", "The maximum number of Records to process in a single execution. The Records will be grouped by table, and a single Put per table will be performed.", "", "25", "false", "false"
      +   "record.schema", "the avro schema definition for the Avro serialization", "", "null", "false", "false"
      +   "record.serializer", "the serializer needed to i/o the record in the HBase row", "com.hurence.logisland.serializer.KryoSerializer (serialize events as json blocs), com.hurence.logisland.serializer.JsonSerializer (serialize events as json blocs), com.hurence.logisland.serializer.AvroSerializer (serialize events as avro blocs), none (send events as bytes)", "com.hurence.logisland.serializer.KryoSerializer", "false", "false"
      +   "table.name.default", "The table table to use if table name field is not set", "", "null", "false", "false"
      +   "column.family.default", "The column family to use if column family field is not set", "", "null", "false", "false"
      +   "column.qualifier.default", "The column qualifier to use if column qualifier field is not set", "", "null", "false", "false"
      +
      +Extra informations
      +__________________
      +.. include:: ./details/PutHBaseCell-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.xml.EvaluateXPath: 
      +
      +EvaluateXPath
      +-------------
+Evaluates one or more XPaths against the content of a record. The results of those XPaths are assigned to new attributes in the records, depending on the configuration of the Processor. XPaths are entered by adding user-defined properties; the name of the property maps to the Attribute Name into which the result will be placed. The value of the property must be a valid XPath expression. If the expression matches nothing, no attribute is added.
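+
+A configuration sketch follows (hypothetical YAML job excerpt; the ``record_value`` source field and the ``/book/title`` XPath used for the dynamic property are assumptions):
+
+.. code-block:: yaml
+
+    # Hypothetical example: extract the title of a book from an XML payload
+    - processor: evaluate_xpath
+      component: com.hurence.logisland.processor.xml.EvaluateXPath
+      configuration:
+        source: record_value
+        validate_dtd: false
+        # dynamic property: attribute name -> XPath expression
+        title: /book/title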
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-xml:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.xml.EvaluateXPath
      +
      +Tags
      +____
      +XML, evaluate, XPath
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**source**", "Indicates the attribute containing the xml data to evaluate xpath against.", "", "null", "false", "false"
      +   "**validate_dtd**", "Specifies whether or not the XML content should be validated against the DTD.", "true, false", "true", "false", "false"
      +   "conflict.resolution.policy", "What to do when a field with the same name already exists ?", "overwrite_existing (if field already exist), keep_only_old_field (keep only old field)", "keep_only_old_field", "false", "false"
      +
      +Dynamic Properties
      +__________________
      +Dynamic Properties allow the user to specify both the name and value of a property.
      +
      +.. csv-table:: dynamic-properties
      +   :header: "Name","Value","Description","Allowable Values","Default Value","EL"
      +   :widths: 20,20,40,40,20,10
      +   :escape: \
      +
      +   "An attribute", "An XPath expression", " the attribute is set to the result of the XPath Expression.", "", "null", false
      +
      +Extra informations
      +__________________
      +.. include:: ./details/EvaluateXPath-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.webAnalytics.ConsolidateSession: 
      +
      +ConsolidateSession
      +------------------
+The ConsolidateSession processor is the Logisland entry point to get and process events from the Web Analytics. As an example, here is an incoming event from the Web Analytics:
+
+.. code::
+
+	"fields": [
+	  { "name": "timestamp",           "type": "long" },
+	  { "name": "remoteHost",          "type": "string" },
+	  { "name": "record_type",         "type": ["null", "string"],  "default": null },
+	  { "name": "record_id",           "type": ["null", "string"],  "default": null },
+	  { "name": "location",            "type": ["null", "string"],  "default": null },
+	  { "name": "hitType",             "type": ["null", "string"],  "default": null },
+	  { "name": "eventCategory",       "type": ["null", "string"],  "default": null },
+	  { "name": "eventAction",         "type": ["null", "string"],  "default": null },
+	  { "name": "eventLabel",          "type": ["null", "string"],  "default": null },
+	  { "name": "localPath",           "type": ["null", "string"],  "default": null },
+	  { "name": "q",                   "type": ["null", "string"],  "default": null },
+	  { "name": "n",                   "type": ["null", "int"],     "default": null },
+	  { "name": "referer",             "type": ["null", "string"],  "default": null },
+	  { "name": "viewportPixelWidth",  "type": ["null", "int"],     "default": null },
+	  { "name": "viewportPixelHeight", "type": ["null", "int"],     "default": null },
+	  { "name": "screenPixelWidth",    "type": ["null", "int"],     "default": null },
+	  { "name": "screenPixelHeight",   "type": ["null", "int"],     "default": null },
+	  { "name": "partyId",             "type": ["null", "string"],  "default": null },
+	  { "name": "sessionId",           "type": ["null", "string"],  "default": null },
+	  { "name": "pageViewId",          "type": ["null", "string"],  "default": null },
+	  { "name": "is_newSession",       "type": ["null", "boolean"], "default": null },
+	  { "name": "userAgentString",     "type": ["null", "string"],  "default": null },
+	  { "name": "pageType",            "type": ["null", "string"],  "default": null },
+	  { "name": "UserId",              "type": ["null", "string"],  "default": null },
+	  { "name": "B2Bunit",             "type": ["null", "string"],  "default": null },
+	  { "name": "pointOfService",      "type": ["null", "string"],  "default": null },
+	  { "name": "companyID",           "type": ["null", "string"],  "default": null },
+	  { "name": "GroupCode",           "type": ["null", "string"],  "default": null },
+	  { "name": "userRoles",           "type": ["null", "string"],  "default": null },
+	  { "name": "is_PunchOut",         "type": ["null", "string"],  "default": null }
+	]
+
+The ConsolidateSession processor groups the records by session and computes the duration between now and the last received event. If the time since the last event is beyond a given threshold (by default 30 minutes), the session is considered closed. The ConsolidateSession processor builds an aggregated session object for each active session. This aggregated object includes:
+
+- the actual session duration;
+- a boolean indicating whether the session is considered active or closed (note: it is possible to resurrect a session if, for instance, an event arrives after the session has been marked closed);
+- user related information: userId, B2Bunit code, groupCode, userRoles, companyId;
+- the first visited page: URL;
+- the last visited page: URL.
+
+The properties to configure the processor are:
+
+- sessionid.field:   Property name containing the session identifier (default: sessionId).
+- timestamp.field:   Property name containing the timestamp of the event (default: timestamp).
+- session.timeout:   Timeframe of inactivity (in seconds) after which a session is considered closed (default: 30 minutes).
+- visitedpage.field: Property name containing the page visited by the customer (default: location).
+- fields.to.return:  List of fields to return in the aggregated object (default: N/A).
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-web-analytics:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.webAnalytics.ConsolidateSession
      +
      +Tags
      +____
      +analytics, web, session
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "debug", "Enable debug. If enabled, the original JSON string is embedded in the record_value field of the record.", "", "null", "false", "false"
      +   "session.timeout", "session timeout in sec", "", "1800", "false", "false"
      +   "sessionid.field", "the name of the field containing the session id => will override default value if set", "", "sessionId", "false", "false"
      +   "timestamp.field", "the name of the field containing the timestamp => will override default value if set", "", "h2kTimestamp", "false", "false"
      +   "visitedpage.field", "the name of the field containing the visited page => will override default value if set", "", "location", "false", "false"
      +   "userid.field", "the name of the field containing the userId => will override default value if set", "", "userId", "false", "false"
      +   "fields.to.return", "the list of fields to return", "", "null", "false", "false"
      +   "firstVisitedPage.out.field", "the name of the field containing the first visited page => will override default value if set", "", "firstVisitedPage", "false", "false"
      +   "lastVisitedPage.out.field", "the name of the field containing the last visited page => will override default value if set", "", "lastVisitedPage", "false", "false"
      +   "isSessionActive.out.field", "the name of the field stating whether the session is active or not => will override default value if set", "", "is_sessionActive", "false", "false"
      +   "sessionDuration.out.field", "the name of the field containing the session duration => will override default value if set", "", "sessionDuration", "false", "false"
      +   "eventsCounter.out.field", "the name of the field containing the session duration => will override default value if set", "", "eventsCounter", "false", "false"
      +   "firstEventDateTime.out.field", "the name of the field containing the date of the first event => will override default value if set", "", "firstEventDateTime", "false", "false"
      +   "lastEventDateTime.out.field", "the name of the field containing the date of the last event => will override default value if set", "", "lastEventDateTime", "false", "false"
      +   "sessionInactivityDuration.out.field", "the name of the field containing the session inactivity duration => will override default value if set", "", "sessionInactivityDuration", "false", "false"
      +
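+Below is a minimal configuration sketch showing how this processor might be declared in a Logisland YAML job file (assuming the usual processor/component/configuration layout). The processor name and property values are illustrative, not recommendations.
+
+.. code:: yaml
+
+	# illustrative declaration inside a stream's processor list
+	- processor: consolidate_session
+	  component: com.hurence.logisland.processor.webAnalytics.ConsolidateSession
+	  type: processor
+	  documentation: compute session durations from web analytics events
+	  configuration:
+	    session.timeout: 1800
+	    sessionid.field: sessionId
+	    timestamp.field: h2kTimestamp
+	    visitedpage.field: location
+	    fields.to.return: userId,B2Bunit
+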
      +Extra informations
      +__________________
      +.. include:: ./details/ConsolidateSession-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.DetectOutliers: 
      +
      +DetectOutliers
      +--------------
      +Outlier Analysis: A Hybrid Approach
      +
      +In order to function at scale, a two-phase approach is taken
      +
      +For every data point
      +
      +- Detect outlier candidates using a robust estimator of variability (e.g. median absolute deviation) that uses distributional sketching (e.g. Q-trees)
      +- Gather a biased sample (biased by recency)
      +- Extremely deterministic in space and cheap in computation
      +
      +For every outlier candidate
      +
      +- Use traditional, more computationally complex approaches to outlier analysis (e.g. Robust PCA) on the biased sample
      +- Expensive computationally, but run infrequently
      +
      +This becomes a data filter which can be attached to a timeseries data stream within a distributed computational framework (i.e. Storm, Spark, Flink, NiFi) to detect outliers.
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-outlier-detection:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.DetectOutliers
      +
      +Tags
      +____
      +analytic, outlier, record, iot, timeseries
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**value.field**", "the numeric field to get the value", "", "record_value", "false", "false"
      +   "**time.field**", "the numeric field to get the value", "", "record_time", "false", "false"
      +   "output.record.type", "the output type of the record", "", "alert_match", "false", "false"
      +   "**rotation.policy.type**", "...", "by_amount, by_time, never", "by_amount", "false", "false"
      +   "**rotation.policy.amount**", "...", "", "100", "false", "false"
      +   "**rotation.policy.unit**", "...", "milliseconds, seconds, hours, days, months, years, points", "points", "false", "false"
      +   "**chunking.policy.type**", "...", "by_amount, by_time, never", "by_amount", "false", "false"
      +   "**chunking.policy.amount**", "...", "", "100", "false", "false"
      +   "**chunking.policy.unit**", "...", "milliseconds, seconds, hours, days, months, years, points", "points", "false", "false"
      +   "sketchy.outlier.algorithm", "...", "SKETCHY_MOVING_MAD", "SKETCHY_MOVING_MAD", "false", "false"
      +   "batch.outlier.algorithm", "...", "RAD", "RAD", "false", "false"
      +   "global.statistics.min", "minimum value", "", "null", "false", "false"
      +   "global.statistics.max", "maximum value", "", "null", "false", "false"
      +   "global.statistics.mean", "mean value", "", "null", "false", "false"
      +   "global.statistics.stddev", "standard deviation value", "", "null", "false", "false"
      +   "**zscore.cutoffs.normal**", "zscoreCutoffs level for normal outlier", "", "0.000000000000001", "false", "false"
      +   "**zscore.cutoffs.moderate**", "zscoreCutoffs level for moderate outlier", "", "1.5", "false", "false"
      +   "**zscore.cutoffs.severe**", "zscoreCutoffs level for severe outlier", "", "10.0", "false", "false"
      +   "zscore.cutoffs.notEnoughData", "zscoreCutoffs level for notEnoughData outlier", "", "100", "false", "false"
      +   "smooth", "do smoothing ?", "", "false", "false", "false"
      +   "decay", "the decay", "", "0.1", "false", "false"
      +   "**min.amount.to.predict**", "minAmountToPredict", "", "100", "false", "false"
      +   "min_zscore_percentile", "minZscorePercentile", "", "50.0", "false", "false"
      +   "reservoir_size", "the size of points reservoir", "", "100", "false", "false"
      +   "rpca.force.diff", "No Description Provided.", "", "null", "false", "false"
      +   "rpca.lpenalty", "No Description Provided.", "", "null", "false", "false"
      +   "rpca.min.records", "No Description Provided.", "", "null", "false", "false"
      +   "rpca.spenalty", "No Description Provided.", "", "null", "false", "false"
      +   "rpca.threshold", "No Description Provided.", "", "null", "false", "false"
      +
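+A configuration sketch for this processor in a Logisland YAML job file is shown below; the processor name and threshold values are illustrative only.
+
+.. code:: yaml
+
+	# illustrative declaration inside a stream's processor list
+	- processor: detect_outliers
+	  component: com.hurence.logisland.processor.DetectOutliers
+	  type: processor
+	  documentation: flag outlier points in a timeseries stream
+	  configuration:
+	    value.field: record_value
+	    time.field: record_time
+	    rotation.policy.type: by_amount
+	    rotation.policy.amount: 100
+	    rotation.policy.unit: points
+	    chunking.policy.type: by_amount
+	    chunking.policy.amount: 100
+	    chunking.policy.unit: points
+	    zscore.cutoffs.normal: 0.000000000000001
+	    zscore.cutoffs.moderate: 1.5
+	    zscore.cutoffs.severe: 10.0
+	    min.amount.to.predict: 100
+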
      +Extra informations
      +__________________
      +.. include:: ./details/DetectOutliers-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.elasticsearch.EnrichRecordsElasticsearch: 
      +
      +EnrichRecordsElasticsearch
      +--------------------------
      +Enrich input records with content indexed in elasticsearch using multiget queries.
+Each incoming record may be enriched with information stored in Elasticsearch.
+Each outgoing record holds at least the content of the input record, plus potentially one or more fields coming from one Elasticsearch document.
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-elasticsearch:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.elasticsearch.EnrichRecordsElasticsearch
      +
      +Tags
      +____
      +elasticsearch
      +
      +Properties
      +__________
+In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values, and whether a property supports the Expression Language.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**elasticsearch.client.service**", "The instance of the Controller Service to use for accessing Elasticsearch.", "", "null", "false", "false"
      +   "**record.key**", "The name of field in the input record containing the document id to use in ES multiget query", "", "null", "false", "**true**"
      +   "**es.index**", "The name of the ES index to use in multiget query. ", "", "null", "false", "**true**"
      +   "es.type", "The name of the ES type to use in multiget query.", "", "default", "false", "**true**"
      +   "es.includes.field", "The name of the ES fields to include in the record.", "", "*", "false", "**true**"
      +   "es.excludes.field", "The name of the ES fields to exclude.", "", "N/A", "false", "false"
      +
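+The sketch below shows a possible declaration of this processor in a Logisland YAML job file. It assumes an Elasticsearch controller service named elasticsearch_service is declared elsewhere in the job; the index, key field and processor name are illustrative.
+
+.. code:: yaml
+
+	# illustrative declaration inside a stream's processor list
+	- processor: enrich_from_es
+	  component: com.hurence.logisland.processor.elasticsearch.EnrichRecordsElasticsearch
+	  type: processor
+	  documentation: add fields from an Elasticsearch index to matching records
+	  configuration:
+	    elasticsearch.client.service: elasticsearch_service
+	    record.key: ${userId}
+	    es.index: users
+	    es.type: default
+	    es.includes.field: "*"
+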
      +Extra informations
      +__________________
      +.. include:: ./details/EnrichRecordsElasticsearch-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.excel.ExcelExtract: 
      +
      +ExcelExtract
      +------------
+Consumes a Microsoft Excel document and converts each worksheet's line into a structured record. The processor expects to receive the raw Excel file as the input record.
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-excel:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.excel.ExcelExtract
      +
      +Tags
      +____
      +excel, processor, poi
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "sheets", "Comma separated list of Excel document sheet names that should be extracted from the excel document. If this property is left blank then all of the sheets will be extracted from the Excel document. You can specify regular expressions. Any sheets not specified in this value will be ignored.", "", "", "false", "false"
      +   "skip.columns", "Comma delimited list of column numbers to skip. Use the columns number and not the letter designation. Use this to skip over columns anywhere in your worksheet that you don't want extracted as part of the record.", "", "", "false", "false"
      +   "field.names", "The comma separated list representing the names of columns of extracted cells. Order matters! You should use either field.names either field.row.header but not both together.", "", "null", "false", "false"
      +   "skip.rows", "The row number of the first row to start processing.Use this to skip over rows of data at the top of your worksheet that are not part of the dataset.Empty rows of data anywhere in the spreadsheet will always be skipped, no matter what this value is set to.", "", "0", "false", "false"
      +   "record.type", "Default type of record", "", "excel_record", "false", "false"
      +   "field.row.header", "If set, field names mapping will be extracted from the specified row number. You should use either field.names either field.row.header but not both together.", "", "null", "false", "false"
      +
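+A configuration sketch for this processor in a Logisland YAML job file; the sheet names, column names and record type are purely illustrative.
+
+.. code:: yaml
+
+	# illustrative declaration inside a stream's processor list
+	- processor: excel_extract
+	  component: com.hurence.logisland.processor.excel.ExcelExtract
+	  type: processor
+	  documentation: turn each worksheet row into a structured record
+	  configuration:
+	    sheets: Sheet1,Report.*
+	    skip.rows: 1
+	    field.names: date,amount,customer
+	    record.type: excel_record
+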
      +Extra informations
      +__________________
      +.. include:: ./details/ExcelExtract-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.MatchIP: 
      +
      +MatchIP
      +-------
+IP address query matching (using `Luwak `_)
      +
+You can use this processor to handle custom events matching IP addresses (CIDR).
+A record whose IP address matches one of the registered queries is tagged appropriately.
      +
+A query is expressed as a Lucene query against a field, for example:
      +
      +.. code::
      +
      +	message:'bad exception'
      +	error_count:[10 TO *]
      +	bytes_out:5000
      +	user_name:tom*
      +
+Please read the `Lucene syntax guide `_ for supported operations.
      +
      +.. warning::
      +
+	don't forget to set the numeric.fields property so that numeric range queries are handled correctly
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-querymatcher:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.MatchIP
      +
      +Tags
      +____
      +analytic, percolator, record, record, query, lucene
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "numeric.fields", "a comma separated string of numeric field to be matched", "", "null", "false", "false"
      +   "output.record.type", "the output type of the record", "", "alert_match", "false", "false"
      +   "record.type.updatePolicy", "Record type update policy", "", "overwrite", "false", "false"
      +   "policy.onmatch", "the policy applied to match events: 'first' (default value) match events are tagged with the name and value of the first query that matched;'all' match events are tagged with all names and values of the queries that matched.", "", "first", "false", "false"
      +   "policy.onmiss", "the policy applied to miss events: 'discard' (default value) drop events that did not match any query;'forward' include also events that did not match any query.", "", "discard", "false", "false"
      +
      +Dynamic Properties
      +__________________
      +Dynamic Properties allow the user to specify both the name and value of a property.
      +
      +.. csv-table:: dynamic-properties
      +   :header: "Name","Value","Description","Allowable Values","Default Value","EL"
      +   :widths: 20,20,40,40,20,10
      +   :escape: \
      +
      +   "query", "some Lucene query", "generate a new record when this query is matched", "", "null", **true**
      +
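+The sketch below shows how this processor might be declared in a Logisland YAML job file. The dynamic property internal_scan is an illustrative query name, and its value is only an example of the Lucene-style syntax described above; adapt the field name and CIDR expression to your own records.
+
+.. code:: yaml
+
+	# illustrative declaration inside a stream's processor list
+	- processor: match_ip
+	  component: com.hurence.logisland.processor.MatchIP
+	  type: processor
+	  documentation: tag records whose IP fields match a registered query
+	  configuration:
+	    policy.onmatch: first
+	    policy.onmiss: forward
+	    output.record.type: alert_match
+	    internal_scan: 'src_ip:"192.168.0.0/24"'
+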
      +Extra informations
      +__________________
      +.. include:: ./details/MatchIP-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.MatchQuery: 
      +
      +MatchQuery
      +----------
      +Query matching based on `Luwak `_
      +
+You can use this processor to handle custom events defined by Lucene queries.
+A new record is added to the output each time a registered query is matched.
      +
+A query is expressed as a Lucene query against a field, for example:
      +
      +.. code::
      +
      +	message:'bad exception'
      +	error_count:[10 TO *]
      +	bytes_out:5000
      +	user_name:tom*
      +
+Please read the `Lucene syntax guide `_ for supported operations.
      +
      +.. warning::
      +
+	don't forget to set the numeric.fields property so that numeric range queries are handled correctly
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-querymatcher:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.MatchQuery
      +
      +Tags
      +____
      +analytic, percolator, record, record, query, lucene
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "numeric.fields", "a comma separated string of numeric field to be matched", "", "null", "false", "false"
      +   "output.record.type", "the output type of the record", "", "alert_match", "false", "false"
      +   "record.type.updatePolicy", "Record type update policy", "", "overwrite", "false", "false"
      +   "policy.onmatch", "the policy applied to match events: 'first' (default value) match events are tagged with the name and value of the first query that matched;'all' match events are tagged with all names and values of the queries that matched.", "", "first", "false", "false"
      +   "policy.onmiss", "the policy applied to miss events: 'discard' (default value) drop events that did not match any query;'forward' include also events that did not match any query.", "", "discard", "false", "false"
      +
      +Dynamic Properties
      +__________________
      +Dynamic Properties allow the user to specify both the name and value of a property.
      +
      +.. csv-table:: dynamic-properties
      +   :header: "Name","Value","Description","Allowable Values","Default Value","EL"
      +   :widths: 20,20,40,40,20,10
      +   :escape: \
      +
      +   "query", "some Lucene query", "generate a new record when this query is matched", "", "null", **true**
      +
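+A configuration sketch for this processor in a Logisland YAML job file. The dynamic property too_many_errors is an illustrative query name; its value reuses one of the example queries shown above.
+
+.. code:: yaml
+
+	# illustrative declaration inside a stream's processor list
+	- processor: match_query
+	  component: com.hurence.logisland.processor.MatchQuery
+	  type: processor
+	  documentation: emit an alert record when a registered query matches
+	  configuration:
+	    numeric.fields: error_count,bytes_out
+	    output.record.type: alert_match
+	    policy.onmatch: all
+	    policy.onmiss: discard
+	    too_many_errors: 'error_count:[10 TO *]'
+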
      +Extra informations
      +__________________
      +.. include:: ./details/MatchQuery-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.bro.ParseBroEvent: 
      +
      +ParseBroEvent
      +-------------
      +The ParseBroEvent processor is the Logisland entry point to get and process `Bro `_ events. The `Bro-Kafka plugin `_ should be used and configured in order to have Bro events sent to Kafka. See the `Bro/Logisland tutorial `_ for an example of usage for this processor. The ParseBroEvent processor does some minor pre-processing on incoming Bro events from the Bro-Kafka plugin to adapt them to Logisland.
      +
      +Basically the events coming from the Bro-Kafka plugin are JSON documents with a first level field indicating the type of the event. The ParseBroEvent processor takes the incoming JSON document, sets the event type in a record_type field and sets the original sub-fields of the JSON event as first level fields in the record. Also any dot in a field name is transformed into an underscore. Thus, for instance, the field id.orig_h becomes id_orig_h. The next processors in the stream can then process the Bro events generated by this ParseBroEvent processor.
      +
+As an example, here is an incoming event from Bro:
+
+.. code::
+
+	{
+	  "conn": {
+	    "id.resp_p": 9092,
+	    "resp_pkts": 0,
+	    "resp_ip_bytes": 0,
+	    "local_orig": true,
+	    "orig_ip_bytes": 0,
+	    "orig_pkts": 0,
+	    "missed_bytes": 0,
+	    "history": "Cc",
+	    "tunnel_parents": [],
+	    "id.orig_p": 56762,
+	    "local_resp": true,
+	    "uid": "Ct3Ms01I3Yc6pmMZx7",
+	    "conn_state": "OTH",
+	    "id.orig_h": "172.17.0.2",
+	    "proto": "tcp",
+	    "id.resp_h": "172.17.0.3",
+	    "ts": 1487596886.953917
+	  }
+	}
      +
      +It gets processed and transformed into the following Logisland record by the ParseBroEvent processor:
      +
      +"@timestamp": "2017-02-20T13:36:32Z"
      +
      +"record_id": "6361f80a-c5c9-4a16-9045-4bb51736333d"
      +
      +"record_time": 1487597792782
      +
      +"record_type": "conn"
      +
      +"id_resp_p": 9092
      +
      +"resp_pkts": 0
      +
      +"resp_ip_bytes": 0
      +
      +"local_orig": true
      +
      +"orig_ip_bytes": 0
      +
      +"orig_pkts": 0
      +
      +"missed_bytes": 0
      +
      +"history": "Cc"
      +
      +"tunnel_parents": []
      +
      +"id_orig_p": 56762
      +
      +"local_resp": true
      +
      +"uid": "Ct3Ms01I3Yc6pmMZx7"
      +
      +"conn_state": "OTH"
      +
      +"id_orig_h": "172.17.0.2"
      +
      +"proto": "tcp"
      +
      +"id_resp_h": "172.17.0.3"
      +
      +"ts": 1487596886.953917
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-cyber-security:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.bro.ParseBroEvent
      +
      +Tags
      +____
      +bro, security, IDS, NIDS
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "debug", "Enable debug. If enabled, the original JSON string is embedded in the record_value field of the record.", "", "false", "false", "false"
      +
      +Extra informations
      +__________________
      +.. include:: ./details/ParseBroEvent-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.netflow.ParseNetflowEvent: 
      +
      +ParseNetflowEvent
      +-----------------
+The `Netflow V5 `_ processor is the Logisland entry point to process NetFlow (V5) events. NetFlow is a feature introduced on Cisco routers that provides the ability to collect IP network traffic. We can distinguish two components:
      +
      +	- Flow exporter: aggregates packets into flows and exports flow records (binary format) towards one or more flow collectors
      +
      +	- Flow collector: responsible for reception, storage and pre-processing of flow data received from a flow exporter
      +
+The collected data are then available for analysis purposes (intrusion detection, traffic analysis...).
+NetFlow events are sent to Kafka in order to be processed by Logisland.
+In the tutorial we will simulate NetFlow traffic using `nfgen `_. This traffic will be sent to port 2055. We then rely on NiFi to listen on that port for incoming NetFlow (V5) traffic and to send it to a Kafka topic. The NetFlow processor can thus process these events and generate the corresponding Logisland records. The next processors in the stream can then process the NetFlow records generated by this processor.
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-cyber-security:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.netflow.ParseNetflowEvent
      +
      +Tags
      +____
      +netflow, security
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "debug", "Enable debug. If enabled, the original JSON string is embedded in the record_value field of the record.", "", "false", "false", "false"
      +   "output.record.type", "the output type of the record", "", "netflowevent", "false", "false"
      +   "enrich.record", "Enrich data. If enabledthe netflow record is enriched with inferred data", "", "false", "false", "false"
      +
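+A minimal configuration sketch for this processor in a Logisland YAML job file; the processor name is illustrative and only the properties listed above are set.
+
+.. code:: yaml
+
+	# illustrative declaration inside a stream's processor list
+	- processor: parse_netflow
+	  component: com.hurence.logisland.processor.netflow.ParseNetflowEvent
+	  type: processor
+	  documentation: turn raw NetFlow V5 events into Logisland records
+	  configuration:
+	    debug: false
+	    output.record.type: netflowevent
+	    enrich.record: false
+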
      +Extra informations
      +__________________
      +.. include:: ./details/ParseNetflowEvent-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.scripting.python.RunPython: 
      +
      +RunPython
      +---------
      + !!!! WARNING !!!!
      +
+The RunPython processor is currently an experimental feature: it is delivered as is, with the current set of features, and is subject to changes in API or anything else in future Logisland releases without warning. There is no tutorial yet. If you want to play with this processor, use the python-processing.yml example and send the Apache logs of the 'index Apache logs' tutorial. The debug stream processor at the end of the stream should output events in the stderr file of the executors, visible from the Spark console.
      +
+This processor allows you to implement and run a processor written in Python. This can be done in two ways: either by directly defining the process method code in the **script.code.process** configuration property, or by pointing to an external Python module script file in the **script.path** configuration property. Directly defining methods is called the inline mode, whereas using a script file is called the file mode. Both ways are mutually exclusive. Whether using the inline or file mode, your Python code may depend on some Python dependencies. If the set of Python dependencies already delivered with the Logisland framework is not sufficient, you can use the **dependencies.path** configuration property to give their location. Currently only the nltk Python library is delivered with Logisland.
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-scripting:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.scripting.python.RunPython
      +
      +Tags
      +____
      +scripting, python
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "script.code.imports", "For inline mode only. This is the python code that should hold the import statements if required.", "", "null", "false", "false"
      +   "script.code.init", "The python code to be called when the processor is initialized. This is the python equivalent of the init method code for a java processor. This is not mandatory but can only be used if **script.code.process** is defined (inline mode).", "", "null", "false", "false"
      +   "script.code.process", "The python code to be called to process the records. This is the pyhton equivalent of the process method code for a java processor. For inline mode, this is the only minimum required configuration property. Using this property, you may also optionally define the **script.code.init** and **script.code.imports** properties.", "", "null", "false", "false"
      +   "script.path", "The path to the user's python processor script. Use this property for file mode. Your python code must be in a python file with the following constraints: let's say your pyhton script is named MyProcessor.py. Then MyProcessor.py is a module file that must contain a class named MyProcessor which must inherits from the Logisland delivered class named AbstractProcessor. You can then define your code in the process method and in the other traditional methods (init...) as you would do in java in a class inheriting from the AbstractProcessor java class.", "", "null", "false", "false"
      +   "dependencies.path", "The path to the additional dependencies for the user's python code, whether using inline or file mode. This is optional as your code may not have additional dependencies. If you defined **script.path** (so using file mode) and if **dependencies.path** is not defined, Logisland will scan a potential directory named **dependencies** in the same directory where the script file resides and if it exists, any python code located there will be loaded as dependency as needed.", "", "null", "false", "false"
      +   "logisland.dependencies.path", "The path to the directory containing the python dependencies shipped with logisland. You should not have to tune this parameter.", "", "null", "false", "false"
      +
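+The sketch below uses the file mode described above. The paths are illustrative, and MyProcessor.py is assumed to define a MyProcessor class inheriting from AbstractProcessor, as required by **script.path**.
+
+.. code:: yaml
+
+	# illustrative declaration inside a stream's processor list
+	- processor: run_my_python_processor
+	  component: com.hurence.logisland.processor.scripting.python.RunPython
+	  type: processor
+	  documentation: run a user-defined python processor on each record
+	  configuration:
+	    script.path: /opt/logisland/user-scripts/MyProcessor.py
+	    dependencies.path: /opt/logisland/user-scripts/dependencies
+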
      +Extra informations
      +__________________
      +.. include:: ./details/RunPython-Detail.rst
      +----------
      +
      +.. _com.hurence.logisland.processor.webAnalytics.URLDecoder: 
      +
      +URLDecoder
      +----------
+Decode one or more fields containing a URL with possibly encoded special characters
      +...
      +
      +Module
      +______
      +com.hurence.logisland:logisland-processor-web-analytics:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.processor.webAnalytics.URLDecoder
      +
      +Tags
      +____
      +record, fields, Decode
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**decode.fields**", "List of fields (URL) to decode", "", "null", "false", "false"
      +   "charset", "Charset to use to decode the URL", "", "UTF-8", "false", "false"
      +
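+A minimal configuration sketch for this processor in a Logisland YAML job file; the decoded field names are illustrative.
+
+.. code:: yaml
+
+	# illustrative declaration inside a stream's processor list
+	- processor: url_decoder
+	  component: com.hurence.logisland.processor.webAnalytics.URLDecoder
+	  type: processor
+	  documentation: decode URL-encoded fields
+	  configuration:
+	    decode.fields: location,referer
+	    charset: UTF-8
+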
      +Dynamic Properties
      +__________________
      +Dynamic Properties allow the user to specify both the name and value of a property.
      +
       .. csv-table:: dynamic-properties
          :header: "Name","Value","Description","Allowable Values","Default Value","EL"
          :widths: 20,20,40,40,20,10
      diff --git a/logisland-documentation/user/components/services.rst b/logisland-documentation/user/components/services.rst
      index 434547506..897e80be6 100644
      --- a/logisland-documentation/user/components/services.rst
      +++ b/logisland-documentation/user/components/services.rst
      @@ -1252,3 +1252,536 @@ In the list below, the names of required properties appear in **bold**. Any othe
       Extra informations
       __________________
       No additional information is provided
      +
      +----------
      +
      +.. _com.hurence.logisland.service.iptogeo.maxmind.MaxmindIpToGeoService: 
      +
      +MaxmindIpToGeoService
      +---------------------
+Implementation of the IP to Geo service using the Maxmind GeoLite database file
      +
      +Module
      +______
      +com.hurence.logisland:logisland-service-ip-to-geo-maxmind:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.service.iptogeo.maxmind.MaxmindIpToGeoService
      +
      +Tags
      +____
      +ip, service, geo, maxmind
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "maxmind.database.uri", "Path to the Maxmind Geo Enrichment Database File.", "", "null", "false", "false"
      +   "maxmind.database.path", "Local Path to the Maxmind Geo Enrichment Database File.", "", "null", "false", "false"
      +   "locale", "Locale to use for geo information. Defaults to 'en'.", "", "en", "false", "false"
      +   "lookup.time", "Should the additional lookup_micros field be returned or not.", "", "false", "false", "false"
      +
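+A configuration sketch showing how this service might be declared in the controllerServiceConfigurations section of a Logisland YAML job file; the service name and database path are illustrative.
+
+.. code:: yaml
+
+	# illustrative entry under controllerServiceConfigurations
+	- controllerService: ip_to_geo_service
+	  component: com.hurence.logisland.service.iptogeo.maxmind.MaxmindIpToGeoService
+	  type: service
+	  documentation: resolve IP addresses to geo information from a local Maxmind database
+	  configuration:
+	    maxmind.database.path: /opt/maxmind/GeoLite2-City.mmdb
+	    locale: en
+	    lookup.time: false
+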
      +Extra informations
      +__________________
      +No additional information is provided
      +
      +----------
      +
      +.. _com.hurence.logisland.service.cache.CSVKeyValueCacheService: 
      +
      +CSVKeyValueCacheService
      +-----------------------
+A cache that stores CSV lines as records loaded from a file
      +
      +Module
      +______
      +com.hurence.logisland:logisland-service-inmemory-cache:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.service.cache.CSVKeyValueCacheService
      +
      +Tags
      +____
      +csv, service, cache
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**csv.format**", "a configuration for loading csv", "default (Standard comma separated format, as for RFC4180 but allowing empty lines. Settings are: withDelimiter(',') withQuote('\"') withRecordSeparator(\"\r\n\") withIgnoreEmptyLines(true)), excel (Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale dependent, it might be necessary to customize this format to accommodate to your regional settings. withDelimiter(',')  withQuote('\"') withRecordSeparator(\"\r\n\") withIgnoreEmptyLines(false) withAllowMissingColumnNames(true)), excel_fr (Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is locale dependent, it might be necessary to customize this format to accommodate to your regional settings. withDelimiter(';')  withQuote('\"') withRecordSeparator(\"\r\n\") withIgnoreEmptyLines(false) withAllowMissingColumnNames(true)), mysql (Default MySQL format used by the SELECT INTO OUTFILE and LOAD DATA INFILE operations.This is a tab-delimited format with a LF character as the line separator. Values are not quoted and special characters are escaped with '\'. The default NULL string is \"\\N\". Settings are: withDelimiter('\t') withQuote(null) withRecordSeparator('\n') withIgnoreEmptyLines(false) withEscape('\\') withNullString(\"\\N\") withQuoteMode(QuoteMode.ALL_NON_NULL)), rfc4180 (Comma separated format as defined by RFC 4180. Settings are: withDelimiter(',') withQuote('\"') withRecordSeparator(\"\r\n\") withIgnoreEmptyLines(false)), tdf (Tab-delimited format. Settings are: withDelimiter('\t') withQuote('\"') withRecordSeparator(\"\r\n\") withIgnoreSurroundingSpaces(true))", "default", "false", "false"
      +   "csv.header", "comma separated header values", "", "null", "false", "false"
      +   "csv.file.uri", "Path to the CSV File.", "", "null", "false", "false"
      +   "csv.file.path", "Local Path to the CSV File.", "", "null", "false", "false"
      +   "**row.key**", "th primary key of this db", "", "null", "false", "false"
      +   "cache.size", "The maximum number of element in the cache.", "", "16384", "false", "false"
      +   "first.line.header", "csv headers grabbed from first line", "", "null", "false", "false"
      +   "encoding.charset", "charset", "", "UTF-8", "false", "false"
      +
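+A configuration sketch for this service in the controllerServiceConfigurations section of a Logisland YAML job file; the file path, header option and key column are illustrative.
+
+.. code:: yaml
+
+	# illustrative entry under controllerServiceConfigurations
+	- controllerService: csv_cache_service
+	  component: com.hurence.logisland.service.cache.CSVKeyValueCacheService
+	  type: service
+	  documentation: lookup table loaded from a CSV file
+	  configuration:
+	    csv.format: default
+	    csv.file.path: /opt/logisland/data/lookup.csv
+	    first.line.header: true
+	    row.key: id
+	    cache.size: 16384
+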
      +Extra informations
      +__________________
      +No additional information is provided
      +
      +----------
      +
      +.. _com.hurence.logisland.service.cassandra.CassandraControllerService: 
      +
      +CassandraControllerService
      +--------------------------
+Provides a controller service that, for the moment, only allows bulk puts of records into Cassandra.
      +
      +Module
      +______
      +com.hurence.logisland:logisland-service-cassandra-client:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.service.cassandra.CassandraControllerService
      +
      +Tags
      +____
      +cassandra, service
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**cassandra.hosts**", "Cassandra cluster hosts as a comma separated value list", "", "null", "false", "false"
      +   "**cassandra.port**", "Cassandra cluster port", "", "null", "false", "false"
      +   "cassandra.with-ssl", "If this property is true, use SSL. Default is no SSL (false).", "", "false", "false", "false"
      +   "cassandra.with-credentials", "If this property is true, use credentials. Default is no credentials (false).", "", "false", "false", "false"
      +   "cassandra.credentials.user", "The user name to use for authentication. cassandra.with-credentials must be true for that property to be used.", "", "null", "false", "false"
      +   "cassandra.credentials.password", "The user password to use for authentication. cassandra.with-credentials must be true for that property to be used.", "", "null", "false", "false"
      +   "batch.size", "The preferred number of Records to setField to the database in a single transaction", "", "1000", "false", "false"
      +   "flush.interval", "flush interval in ms", "", "500", "false", "false"
      +
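+A configuration sketch for this service in the controllerServiceConfigurations section of a Logisland YAML job file; host names and port are illustrative.
+
+.. code:: yaml
+
+	# illustrative entry under controllerServiceConfigurations
+	- controllerService: cassandra_service
+	  component: com.hurence.logisland.service.cassandra.CassandraControllerService
+	  type: service
+	  documentation: bulk put records into Cassandra
+	  configuration:
+	    cassandra.hosts: cassandra1,cassandra2
+	    cassandra.port: 9042
+	    cassandra.with-ssl: false
+	    cassandra.with-credentials: false
+	    batch.size: 1000
+	    flush.interval: 500
+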
      +Extra informations
      +__________________
      +No additional information is provided
      +
      +----------
      +
      +.. _com.hurence.logisland.service.elasticsearch.Elasticsearch_6_6_2_ClientService: 
      +
      +Elasticsearch_6_6_2_ClientService
      +---------------------------------
      +Implementation of ElasticsearchClientService for Elasticsearch 6.6.2.
      +
      +Module
      +______
      +com.hurence.logisland:logisland-service-elasticsearch_6_6_2-client:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.service.elasticsearch.Elasticsearch_6_6_2_ClientService
      +
      +Tags
      +____
      +elasticsearch, client
      +
      +Properties
      +__________
+In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values, and whether a property is considered "sensitive".
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**backoff.policy**", "strategy for retrying to execute requests in bulkRequest", "noBackoff (when a request fail there won't be any retry.), constantBackoff (wait a fixed amount of time between retries, using user put retry number and throttling delay), exponentialBackoff (time waited between retries grow exponentially, using user put retry number and throttling delay), defaultExponentialBackoff (time waited between retries grow exponentially, using es default parameters)", "defaultExponentialBackoff", "false", "false"
      +   "**throttling.delay**", "number of time we should wait between each retry (in milliseconds)", "", "500", "false", "false"
      +   "**num.retry**", "number of time we should try to inject a bulk into es", "", "3", "false", "false"
      +   "batch.size", "The preferred number of Records to setField to the database in a single transaction", "", "1000", "false", "false"
      +   "bulk.size", "bulk size in MB", "", "5", "false", "false"
      +   "flush.interval", "flush interval in sec", "", "5", "false", "false"
      +   "concurrent.requests", "setConcurrentRequests", "", "2", "false", "false"
      +   "**ping.timeout**", "The ping timeout used to determine when a node is unreachable. For example, 5s (5 seconds). If non-local recommended is 30s", "", "5s", "false", "false"
      +   "**sampler.interval**", "How often to sample / ping the nodes listed and connected. For example, 5s (5 seconds). If non-local recommended is 30s.", "", "5s", "false", "false"
      +   "username", "Username to access the Elasticsearch cluster", "", "null", "false", "false"
      +   "password", "Password to access the Elasticsearch cluster", "", "null", "**true**", "false"
      +   "shield.location", "Specifies the path to the JAR for the Elasticsearch Shield plugin. If the Elasticsearch cluster has been secured with the Shield plugin, then the Shield plugin JAR must also be available to this processor. Note: Do NOT place the Shield JAR into NiFi's lib/ directory, doing so will prevent the Shield plugin from being loaded.", "", "null", "false", "false"
      +   "**hosts**", "ElasticSearch Hosts, which should be comma separated and colon for hostname/port host1:port,host2:port,....  For example testcluster:9300.", "", "null", "false", "false"
      +   "ssl.context.service", "The SSL Context Service used to provide client certificate information for TLS/SSL connections. This service only applies if the Shield plugin is available.", "", "null", "false", "false"
      +   "**charset**", "Specifies the character set of the document data.", "", "UTF-8", "false", "false"
      +
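+A configuration sketch for this service in the controllerServiceConfigurations section of a Logisland YAML job file; host names are illustrative, and the service name matches the one referenced by the EnrichRecordsElasticsearch sketch earlier in this document.
+
+.. code:: yaml
+
+	# illustrative entry under controllerServiceConfigurations
+	- controllerService: elasticsearch_service
+	  component: com.hurence.logisland.service.elasticsearch.Elasticsearch_6_6_2_ClientService
+	  type: service
+	  documentation: Elasticsearch 6.6.2 client used by indexing and enrichment processors
+	  configuration:
+	    hosts: es-node1:9300,es-node2:9300
+	    backoff.policy: defaultExponentialBackoff
+	    throttling.delay: 500
+	    num.retry: 3
+	    batch.size: 1000
+	    bulk.size: 5
+	    flush.interval: 5
+	    ping.timeout: 5s
+	    sampler.interval: 5s
+	    charset: UTF-8
+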
      +Extra informations
      +__________________
      +No additional information is provided
      +
      +----------
      +
      +.. _com.hurence.logisland.service.hbase.HBase_1_1_2_ClientService: 
      +
      +HBase_1_1_2_ClientService
      +-------------------------
      +Implementation of HBaseClientService for HBase 1.1.2. This service can be configured by providing a comma-separated list of configuration files, or by specifying values for the other properties. If configuration files are provided, they will be loaded first, and the values of the additional properties will override the values from the configuration files. In addition, any user defined properties on the processor will also be passed to the HBase configuration.
      +
      +Module
      +______
      +com.hurence.logisland:logisland-service-hbase_1_1_2-client:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.service.hbase.HBase_1_1_2_ClientService
      +
      +Tags
      +____
      +hbase, client
      +
      +Properties
      +__________
+In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values, and whether a property supports the Expression Language.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "hadoop.configuration.files", "Comma-separated list of Hadoop Configuration files, such as hbase-site.xml and core-site.xml for kerberos, including full paths to the files.", "", "null", "false", "false"
      +   "zookeeper.quorum", "Comma-separated list of ZooKeeper hosts for HBase. Required if Hadoop Configuration Files are not provided.", "", "null", "false", "false"
      +   "zookeeper.client.port", "The port on which ZooKeeper is accepting client connections. Required if Hadoop Configuration Files are not provided.", "", "null", "false", "false"
      +   "zookeeper.znode.parent", "The ZooKeeper ZNode Parent value for HBase (example: /hbase). Required if Hadoop Configuration Files are not provided.", "", "null", "false", "false"
      +   "hbase.client.retries", "The number of times the HBase client will retry connecting. Required if Hadoop Configuration Files are not provided.", "", "3", "false", "false"
      +   "phoenix.client.jar.location", "The full path to the Phoenix client JAR. Required if Phoenix is installed on top of HBase.", "", "null", "false", "**true**"
      +
      +Dynamic Properties
      +__________________
      +Dynamic Properties allow the user to specify both the name and value of a property.
      +
      +.. csv-table:: dynamic-properties
      +   :header: "Name","Value","Description","Allowable Values","Default Value","EL"
      +   :widths: 20,20,40,40,20,10
      +   :escape: \
      +
      +   "The name of an HBase configuration property.", "The value of the given HBase configuration property.", "These properties will be set on the HBase configuration after loading any provided configuration files.", "", "null", false
      +
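+A configuration sketch for this service in the controllerServiceConfigurations section of a Logisland YAML job file; the ZooKeeper quorum and port are illustrative.
+
+.. code:: yaml
+
+	# illustrative entry under controllerServiceConfigurations
+	- controllerService: hbase_service
+	  component: com.hurence.logisland.service.hbase.HBase_1_1_2_ClientService
+	  type: service
+	  documentation: HBase 1.1.2 client service
+	  configuration:
+	    zookeeper.quorum: zk1,zk2,zk3
+	    zookeeper.client.port: 2181
+	    zookeeper.znode.parent: /hbase
+	    hbase.client.retries: 3
+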
      +Extra informations
      +__________________
      +No additional information is provided
      +
      +----------
      +
      +.. _com.hurence.logisland.service.influxdb.InfluxDBControllerService: 
      +
      +InfluxDBControllerService
      +-------------------------
+Provides a controller service that, for the moment, only allows bulk puts of records into InfluxDB.
      +
      +Module
      +______
      +com.hurence.logisland:logisland-service-influxdb-client:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.service.influxdb.InfluxDBControllerService
      +
      +Tags
      +____
      +influxdb, service, time series
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**influxdb.url**", "InfluxDB connection url", "", "null", "false", "false"
      +   "influxdb.user", "The user name to use for authentication.", "", "null", "false", "false"
      +   "**influxdb.database**", "InfluxDB database name", "", "null", "false", "false"
      +   "influxdb.password", "The user password to use for authentication.", "", "null", "false", "false"
      +   "influxdb.tags", "List of tags for each supported measurement.  Syntax: :[,]...[;:,[]]... Example: cpu:core1,core2;mem:used : in this example, the cpu measurement has 2 tags: core1 and core2 and the mem measurement has 1 tag: used. This must only be set if configuration mode is explicit_tags_and_fields or all_as_fields_but_explicit_tags.", "", "null", "false", "false"
      +   "influxdb.fields", "List of fields for each supported measurement.  Syntax: :[,]...[;:,[]]... Example: cpu:core1,core2;mem:used : in this example, the cpu measurement has 2 fields: core1 and core2 and the mem measurement has 1 field: used. This must only be set if configuration mode is explicit_tags_and_fields or all_as_tags_but_explicit_fields.", "", "null", "false", "false"
      +   "**influxdb.configuration_mode**", "Determines the way fields and tags are chosen from the logisland record. Possible values and meaning: explicit_tags_and_fields: only logisland record fields listed in influxdb.tags and influxdb.fields will be inserted into InfluxDB with the explicit type. all_as_fields: all available logisland record fields will be inserted into  InfluxDB as fields. all_as_tags_but_explicit_fields: all available logisland record fields will be inserted into  InfluxDB as tags except those listed in influxdb.fields that will be inserted into InfluxDB as fields. all_as_fields_but_explicit_tags: all available logisland record fields will be inserted into  InfluxDB as fields except those listed in influxdb.tags that will be inserted into InfluxDB as tags", "explicit_tags_and_fields, all_as_fields, all_as_fields_but_explicit_tags, all_as_tags_but_explicit_fields", "null", "false", "false"
      +   "influxdb.consistency_level", "Determines the consistency level used to write points into InfluxDB. Possible values are: ANY, ONE, QUORUMand ALL. Default value is ANY. This is only useful when  using a clustered InfluxDB infrastructure.", "ANY, ONE, QUORUM, ALL", "ANY", "false", "false"
      +   "influxdb.retention_policy", "Determines the name of the retention policy to use. Defaults to autogen. The defined retention policy must already be defined in the InfluxDB server.", "", "autogen", "false", "false"
      +   "influxdb.timefield", "Time field for each supported measurement.  Syntax: :,...[;:,]... With format being any constant defined in  java.util.concurrent.TimeUnit enum: DAYS, HOURS, MICROSECONDS, MILLISECONDS, MINUTES, NANOSECONDS or SECONDS. Example: cpu:time,NANOSECONDS;mem:timeStamp,MILLISECONDS In this example: for the cpu measurement, the time for the influx DB point matching the record will be the value of the time field that represents nanoseconds. For the mem measurement, the time for the influx DB point matching the record will be the value of the timeStamp field that represents milliseconds.  Any measurement for which the time field is not defined will use the content of the record_time technical field as the time (which is a number of milliseconds since epoch).", "", "null", "false", "false"
      +   "batch.size", "The preferred number of Records to setField to the database in a single transaction", "", "1000", "false", "false"
      +   "flush.interval", "flush interval in ms", "", "500", "false", "false"
      +
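+A configuration sketch for this service in the controllerServiceConfigurations section of a Logisland YAML job file; the URL and database name are illustrative, and the all_as_fields mode is chosen only to keep the example short.
+
+.. code:: yaml
+
+	# illustrative entry under controllerServiceConfigurations
+	- controllerService: influxdb_service
+	  component: com.hurence.logisland.service.influxdb.InfluxDBControllerService
+	  type: service
+	  documentation: bulk put records into InfluxDB
+	  configuration:
+	    influxdb.url: http://influxdb:8086
+	    influxdb.database: metrics
+	    influxdb.configuration_mode: all_as_fields
+	    influxdb.retention_policy: autogen
+	    batch.size: 1000
+	    flush.interval: 500
+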
      +Extra informations
      +__________________
      +No additional information is provided
      +
      +----------
      +
      +.. _com.hurence.logisland.service.cache.LRUKeyValueCacheService: 
      +
      +LRUKeyValueCacheService
      +-----------------------
+A controller service for caching data as key/value pairs with an LRU (least recently used) strategy, using a LinkedHashMap
      +
      +Module
      +______
      +com.hurence.logisland:logisland-service-inmemory-cache:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.service.cache.LRUKeyValueCacheService
      +
      +Tags
      +____
      +cache, service, key, value, pair, LRU
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "cache.size", "The maximum number of element in the cache.", "", "16384", "false", "false"
      +
      +Extra informations
      +__________________
      +No additional information is provided
      +
      +----------
      +
      +.. _com.hurence.logisland.service.mongodb.MongoDBControllerService: 
      +
      +MongoDBControllerService
      +------------------------
      +Provides a controller service that wraps most of the functionality of the MongoDB driver.
      +
      +Module
      +______
      +com.hurence.logisland:logisland-service-mongodb-client:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.service.mongodb.MongoDBControllerService
      +
      +Tags
      +____
      +mongo, mongodb, service
      +
      +Properties
      +__________
+In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values, and whether a property supports the Expression Language.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**mongo.uri**", "MongoURI, typically of the form: mongodb://host1[:port1][,host2[:port2],...]", "", "null", "false", "**true**"
      +   "**mongo.db.name**", "The name of the database to use", "", "null", "false", "**true**"
      +   "**mongo.collection.name**", "The name of the collection to use", "", "null", "false", "**true**"
      +   "batch.size", "The preferred number of Records to setField to the database in a single transaction", "", "1000", "false", "false"
      +   "bulk.size", "bulk size in MB", "", "5", "false", "false"
      +   "mongo.bulk.mode", "Bulk mode (insert or upsert)", "insert (Insert records whose key must be unique), upsert (Insert records if not already existing or update the record if already existing)", "insert", "false", "false"
      +   "flush.interval", "flush interval in ms", "", "500", "false", "false"
      +   "**mongo.write.concern**", "The write concern to use", "ACKNOWLEDGED, UNACKNOWLEDGED, FSYNCED, JOURNALED, REPLICA_ACKNOWLEDGED, MAJORITY", "ACKNOWLEDGED", "false", "false"
      +   "mongo.bulk.upsert.condition", "A custom condition for the bulk upsert (Filter for the bulkwrite). If not specified the standard condition is to match same id ('_id': data._id)", "", "${'{ \"_id\" :\"' + record_id + '\"}'}", "false", "**true**"
      +
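+A configuration sketch for this service in the controllerServiceConfigurations section of a Logisland YAML job file; the URI, database and collection names are illustrative.
+
+.. code:: yaml
+
+	# illustrative entry under controllerServiceConfigurations
+	- controllerService: mongo_service
+	  component: com.hurence.logisland.service.mongodb.MongoDBControllerService
+	  type: service
+	  documentation: MongoDB client service used for bulk writes
+	  configuration:
+	    mongo.uri: mongodb://mongo1:27017,mongo2:27017
+	    mongo.db.name: logisland
+	    mongo.collection.name: events
+	    mongo.bulk.mode: upsert
+	    mongo.write.concern: ACKNOWLEDGED
+	    batch.size: 1000
+	    flush.interval: 500
+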
      +Extra informations
      +__________________
      +No additional information is provided
      +
      +----------
      +
      +.. _com.hurence.logisland.redis.service.RedisKeyValueCacheService: 
      +
      +RedisKeyValueCacheService
      +-------------------------
+A controller service for caching records as key/value pairs, backed by Redis
      +
      +Module
      +______
      +com.hurence.logisland:logisland-service-redis:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.redis.service.RedisKeyValueCacheService
      +
      +Tags
      +____
      +cache, service, key, value, pair, redis
      +
      +Properties
      +__________
+In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values, and whether a property is considered "sensitive".
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**redis.mode**", "The type of Redis being communicated with - standalone, sentinel, or clustered.", "standalone (A single standalone Redis instance.), sentinel (Redis Sentinel which provides high-availability. Described further at https://redis.io/topics/sentinel), cluster (Clustered Redis which provides sharding and replication. Described further at https://redis.io/topics/cluster-spec)", "standalone", "false", "false"
      +   "**connection.string**", "The connection string for Redis. In a standalone instance this value will be of the form hostname:port. In a sentinel instance this value will be the comma-separated list of sentinels, such as host1:port1,host2:port2,host3:port3. In a clustered instance this value will be the comma-separated list of cluster masters, such as host1:port,host2:port,host3:port.", "", "null", "false", "false"
      +   "**database.index**", "The database index to be used by connections created from this connection pool. See the databases property in redis.conf, by default databases 0-15 will be available.", "", "0", "false", "false"
      +   "**communication.timeout**", "The timeout to use when attempting to communicate with Redis.", "", "10 seconds", "false", "false"
      +   "**cluster.max.redirects**", "The maximum number of redirects that can be performed when clustered.", "", "5", "false", "false"
      +   "sentinel.master", "The name of the sentinel master, require when Mode is set to Sentinel", "", "null", "false", "false"
      +   "password", "The password used to authenticate to the Redis server. See the requirepass property in redis.conf.", "", "null", "**true**", "false"
      +   "**pool.max.total**", "The maximum number of connections that can be allocated by the pool (checked out to clients, or idle awaiting checkout). A negative value indicates that there is no limit.", "", "8", "false", "false"
      +   "**pool.max.idle**", "The maximum number of idle connections that can be held in the pool, or a negative value if there is no limit.", "", "8", "false", "false"
      +   "**pool.min.idle**", "The target for the minimum number of idle connections to maintain in the pool. If the configured value of Min Idle is greater than the configured value for Max Idle, then the value of Max Idle will be used instead.", "", "0", "false", "false"
      +   "**pool.block.when.exhausted**", "Whether or not clients should block and wait when trying to obtain a connection from the pool when the pool has no available connections. Setting this to false means an error will occur immediately when a client requests a connection and none are available.", "true, false", "true", "false", "false"
      +   "**pool.max.wait.time**", "The amount of time to wait for an available connection when Block When Exhausted is set to true.", "", "10 seconds", "false", "false"
      +   "**pool.min.evictable.idle.time**", "The minimum amount of time an object may sit idle in the pool before it is eligible for eviction.", "", "60 seconds", "false", "false"
      +   "**pool.time.between.eviction.runs**", "The amount of time between attempting to evict idle connections from the pool.", "", "30 seconds", "false", "false"
      +   "**pool.num.tests.per.eviction.run**", "The number of connections to tests per eviction attempt. A negative value indicates to test all connections.", "", "-1", "false", "false"
      +   "**pool.test.on.create**", "Whether or not connections should be tested upon creation.", "true, false", "false", "false", "false"
      +   "**pool.test.on.borrow**", "Whether or not connections should be tested upon borrowing from the pool.", "true, false", "false", "false", "false"
      +   "**pool.test.on.return**", "Whether or not connections should be tested upon returning to the pool.", "true, false", "false", "false", "false"
      +   "**pool.test.while.idle**", "Whether or not connections should be tested while idle.", "true, false", "true", "false", "false"
      +   "**record.recordSerializer**", "the way to serialize/deserialize the record", "com.hurence.logisland.serializer.KryoSerializer (serialize events as json blocs), com.hurence.logisland.serializer.JsonSerializer (serialize events as json blocs), com.hurence.logisland.serializer.AvroSerializer (serialize events as avro blocs), com.hurence.logisland.serializer.BytesArraySerializer (serialize events as byte arrays), com.hurence.logisland.serializer.KuraProtobufSerializer (serialize events as Kura protocol buffer), none (send events as bytes)", "com.hurence.logisland.serializer.JsonSerializer", "false", "false"
      +
      +Extra informations
      +__________________
      +No additional information is provided
      +
      +----------
      +
      +.. _com.hurence.logisland.service.solr.Solr_6_6_2_ClientService: 
      +
      +Solr_6_6_2_ClientService
      +------------------------
      +Implementation of SolrClientService for Solr 6.6.2.
      +
      +Module
      +______
      +com.hurence.logisland:logisland-service-solr_6_6_2-client:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.service.solr.Solr_6_6_2_ClientService
      +
      +Tags
      +____
      +solr, client
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "batch.size", "The preferred number of Records to setField to the database in a single transaction", "", "1000", "false", "false"
      +   "bulk.size", "bulk size in MB", "", "5", "false", "false"
      +   "**solr.cloud**", "is slor cloud enabled", "", "false", "false", "false"
      +   "**solr.collection**", "name of the collection to use", "", "null", "false", "false"
      +   "**solr.connection.string**", "zookeeper quorum host1:2181,host2:2181 for solr cloud or http address of a solr core ", "", "localhost:8983/solr", "false", "false"
      +   "solr.concurrent.requests", "setConcurrentRequests", "", "2", "false", "false"
      +   "flush.interval", "flush interval in ms", "", "500", "false", "false"
      +   "schema.update_timeout", "Schema update timeout interval in s", "", "15", "false", "false"
      +
      +Extra informations
      +__________________
      +No additional information is provided
      +
      +----------
      +
      +.. _com.hurence.logisland.service.elasticsearch.Elasticsearch_7_x_ClientService: 
      +
      +Elasticsearch_7_x_ClientService
      +-------------------------------
      +Implementation of ElasticsearchClientService for ElasticSearch 7.x. Note that although Elasticsearch 7.x still accepts type information, this implementation will ignore any type usage and will only work at the index level to be already compliant with the ElasticSearch 8.x version that will completely remove type usage.
      +
      +Module
      +______
      +com.hurence.logisland:logisland-service-elasticsearch_7_x-client:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.service.elasticsearch.Elasticsearch_7_x_ClientService
      +
      +Tags
      +____
      +elasticsearch, client
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values, and whether a property is considered "sensitive".
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "**backoff.policy**", "strategy for retrying to execute requests in bulkRequest", "noBackoff (when a request fail there won't be any retry.), constantBackoff (wait a fixed amount of time between retries, using user put retry number and throttling delay), exponentialBackoff (time waited between retries grow exponentially, using user put retry number and throttling delay), defaultExponentialBackoff (time waited between retries grow exponentially, using es default parameters)", "defaultExponentialBackoff", "false", "false"
      +   "**throttling.delay**", "number of time we should wait between each retry (in milliseconds)", "", "500", "false", "false"
      +   "**num.retry**", "number of time we should try to inject a bulk into es", "", "3", "false", "false"
      +   "batch.size", "The preferred number of Records to setField to the database in a single transaction", "", "1000", "false", "false"
      +   "bulk.size", "bulk size in MB", "", "5", "false", "false"
      +   "flush.interval", "flush interval in sec", "", "5", "false", "false"
      +   "concurrent.requests", "setConcurrentRequests", "", "2", "false", "false"
      +   "**ping.timeout**", "The ping timeout used to determine when a node is unreachable. For example, 5s (5 seconds). If non-local recommended is 30s", "", "5s", "false", "false"
      +   "**sampler.interval**", "How often to sample / ping the nodes listed and connected. For example, 5s (5 seconds). If non-local recommended is 30s.", "", "5s", "false", "false"
      +   "username", "Username to access the Elasticsearch cluster", "", "null", "false", "false"
      +   "password", "Password to access the Elasticsearch cluster", "", "null", "**true**", "false"
      +   "shield.location", "Specifies the path to the JAR for the Elasticsearch Shield plugin. If the Elasticsearch cluster has been secured with the Shield plugin, then the Shield plugin JAR must also be available to this processor. Note: Do NOT place the Shield JAR into NiFi's lib/ directory, doing so will prevent the Shield plugin from being loaded.", "", "null", "false", "false"
      +   "**hosts**", "ElasticSearch Hosts, which should be comma separated and colon for hostname/port host1:port,host2:port,....  For example testcluster:9300.", "", "null", "false", "false"
      +   "ssl.context.service", "The SSL Context Service used to provide client certificate information for TLS/SSL connections. This service only applies if the Shield plugin is available.", "", "null", "false", "false"
      +   "**charset**", "Specifies the character set of the document data.", "", "UTF-8", "false", "false"
      +
      +Extra informations
      +__________________
      +No additional information is provided
      +
      +----------
      +
      +.. _com.hurence.logisland.service.solr.Solr8ClientService: 
      +
      +Solr8ClientService
      +------------------
      +Implementation of SolrClientService for Solr 8
      +
      +Module
      +______
      +com.hurence.logisland:logisland-service-solr_8-client:1.2.0
      +
      +Class
      +_____
      +com.hurence.logisland.service.solr.Solr8ClientService
      +
      +Tags
      +____
      +solr, client
      +
      +Properties
      +__________
      +In the list below, the names of required properties appear in **bold**. Any other properties (not in bold) are considered optional. The table also indicates any default values.
      +
      +.. csv-table:: allowable-values
      +   :header: "Name","Description","Allowable Values","Default Value","Sensitive","EL"
      +   :widths: 20,60,30,20,10,10
      +   :escape: \
      +
      +   "batch.size", "The preferred number of Records to setField to the database in a single transaction", "", "1000", "false", "false"
      +   "bulk.size", "bulk size in MB", "", "5", "false", "false"
      +   "**solr.cloud**", "is slor cloud enabled", "", "false", "false", "false"
      +   "**solr.collection**", "name of the collection to use", "", "null", "false", "false"
      +   "**solr.connection.string**", "zookeeper quorum host1:2181,host2:2181 for solr cloud or http address of a solr core ", "", "localhost:8983/solr", "false", "false"
      +   "solr.concurrent.requests", "setConcurrentRequests", "", "2", "false", "false"
      +   "flush.interval", "flush interval in ms", "", "500", "false", "false"
      +   "schema.update_timeout", "Schema update timeout interval in s", "", "15", "false", "false"
      +
      +Extra informations
      +__________________
      +No additional information is provided
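      A minimal sketch, for context only, of how one of the controller services documented above
      (the Elasticsearch_7_x_ClientService) might be configured and enabled through the TestRunner
      harness used later in this patch series. The host and admin/admin credentials are placeholders,
      and TestProcessor stands for the dummy test processor the integration tests register the
      service against.

          import com.hurence.logisland.service.elasticsearch.Elasticsearch_7_x_ClientService;
          import com.hurence.logisland.util.runner.TestRunner;
          import com.hurence.logisland.util.runner.TestRunners;

          import static com.hurence.logisland.service.elasticsearch.ElasticsearchClientService.*;

          public class Elasticsearch7ClientServiceConfigSketch {

              public void configure() throws Exception {
                  // Build a runner around the dummy TestProcessor used by the integration tests.
                  final TestRunner runner = TestRunners.newTestRunner(new TestProcessor());

                  // Register the controller service and set the properties listed in the table above.
                  final Elasticsearch_7_x_ClientService service = new Elasticsearch_7_x_ClientService();
                  runner.addControllerService("elasticsearchClient", service);
                  runner.setProperty(service, HOSTS, "localhost:9200");   // placeholder host:port
                  runner.setProperty(service, USERNAME, "admin");         // placeholder credentials
                  runner.setProperty(service, PASSWORD, "admin");
                  runner.enableControllerService(service);
              }
          }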
      
      From b4d1f77a0cbfb07554add1273da44a18b1ba1ac3 Mon Sep 17 00:00:00 2001
      From: Mathieu Rossignol 
      Date: Fri, 28 Feb 2020 10:41:23 +0100
      Subject: [PATCH 27/43] Added experimental logisland.sh feature that allows a
       minimal launch on a spark standalone cluster in client mode; cluster mode is
       not tested. Also to be improved
      
      ---
       .../src/main/resources/bin/logisland.sh       | 55 +++++++++++++++----
       1 file changed, 43 insertions(+), 12 deletions(-)
      
      diff --git a/logisland-resources/src/main/resources/bin/logisland.sh b/logisland-resources/src/main/resources/bin/logisland.sh
      index 1abbb1d12..4cf7cfaa3 100755
      --- a/logisland-resources/src/main/resources/bin/logisland.sh
      +++ b/logisland-resources/src/main/resources/bin/logisland.sh
      @@ -2,7 +2,6 @@
       
       #. $(dirname $0)/launcher.sh
       
      -
       case "$(uname -s)" in
          Darwin)
            echo "I've detected that you're running Mac OS X, using greadlink instead of readlink"
      @@ -58,6 +57,22 @@ initSparkJarsOptRecursively() {
           return 0;
       }
       
      +# Create app classpath for spark standalone mode
      +initSparkStandaloneClassPath() {
      +    for entry in `ls ${1}/*.jar`
      +    do
      +      #echo "add spark standalone jar ${entry}"
      +      if [[ -z "$spark_standalone_classpath" ]]
      +        then
      +          spark_standalone_classpath="$entry"
      +        else
      +          spark_standalone_classpath="$entry,$spark_standalone_classpath"
      +        fi
      +    done
      +
      +    echo $spark_standalone_classpath
      +    return 0
      +}
       
       # update $java_cp so that it contains all logisland jars except for engines.
       # we look for jars into specified dir recursively.
      @@ -100,6 +115,7 @@ usage() {
         echo "  --conf  : provides the configuration file"
         echo "  --standalone start logisland in standalone mode (no spark required)"
         echo "  --spark-home : sets the SPARK_HOME (defaults to \$SPARK_HOME environment variable)"
      +  echo "  --spark-standalone-dir : sets the base shared directory for logisland jars for spark standlone (experimental)"
         echo "  --help : displays help"
       }
       
      @@ -167,6 +183,10 @@ parse_input() {
                 SPARK_HOME="$2"
                 shift
                 ;;
      +        --spark-standalone-dir)
      +          SPARK_STANDALONE_DIR="$2"
      +          shift
      +          ;;
               --help)
                 usage
                 exit 0
      @@ -252,7 +272,7 @@ main() {
       
           # ----------------------------------------------------------------
           # find the spark-submit mode
      -    # can be either local, standalone, mesos or yarn
      +    # can be either local, standalone, spark (standalone), mesos or yarn
           # ----------------------------------------------------------------
           if [[ "$STANDALONE" = true ]] ;
           then
      @@ -322,6 +342,12 @@ main() {
               #
               if [[ "${MODE}" =~ ^spark://.* ]] # Starts with spark:// (spark standalone url)
               then
      +            if [[ -z "${SPARK_STANDALONE_DIR}" ]]
      +            then
      +             echo "Spark standalone mode requires --spark-standalone-dir option to be set"
      +             exit 1
      +            fi
      +
                   SPARK_MASTER=${MODE}
                   EXTRA_MODE=`awk '{ if( $1 == "spark.deploy-mode:" ){ print $2 } }' ${CONF_FILE}`
                   if [[ -z "${EXTRA_MODE}" ]]
      @@ -620,12 +646,14 @@ main() {
       
                   CONF_FILE="logisland-configuration.yml"
       
      +            engine_jar=$(basename $engine_jar)
      +
                   ${SPARK_HOME}/bin/spark-submit ${VERBOSE_OPTIONS} ${SPARK_CLUSTER_OPTIONS} \
                   --conf "${EXTRA_DRIVER_JAVA_OPTIONS}" \
                   --conf "${EXTRA_PROCESSOR_JAVA_OPTIONS}" \
                   --class ${app_mainclass} \
      -            --jars ${app_classpath} ${engine_jar} \
      -             -conf ${CONF_FILE}
      +            --jars ${SPARK_STANDALONE_DIR}/* ${SPARK_STANDALONE_DIR}/${engine_jar} \
      +            -conf ${CONF_FILE}
                   ;;
       
                 spark-client)
      @@ -660,22 +688,25 @@ main() {
                        SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --properties-file ${PROPERTIES_FILE_PATH}"
                   fi
       
      -            SPARK_MONITORING_DRIVER_PORT=`awk '{ if( $1 == "spark.monitoring.driver.port:" ){ print $2 } }' ${CONF_FILE}`
      -            if [[ -z "${SPARK_MONITORING_DRIVER_PORT}" ]]
      +            EXECUTORS_INSTANCES=`awk '{ if( $1 == "spark.executor.instances:" ){ print $2 } }' ${CONF_FILE}`
      +            if [[ ! -z "${EXECUTORS_INSTANCES}" ]]
                   then
      -                 EXTRA_DRIVER_JAVA_OPTIONS='spark.driver.extraJavaOptions=-Dlog4j.configuration=log4j.properties'
      -                 EXTRA_PROCESSOR_JAVA_OPTIONS='spark.executor.extraJavaOptions=-Dlog4j.configuration=log4j.properties'
      -            else
      -                 EXTRA_DRIVER_JAVA_OPTIONS='spark.driver.extraJavaOptions=-Dlog4j.configuration=log4j.properties -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=0 -Dcom.sun.management.jmxremote.rmi.port=0 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -javaagent:'${CONF_DIR}'/../monitoring/jmx_prometheus_javaagent-0.10.jar='${SPARK_MONITORING_DRIVER_PORT}':'${CONF_DIR}'/../monitoring/spark-prometheus.yml'
      -                 EXTRA_PROCESSOR_JAVA_OPTIONS='spark.executor.extraJavaOptions=-Dlog4j.configuration=log4j.properties -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=0 -Dcom.sun.management.jmxremote.rmi.port=0 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -javaagent:./jmx_prometheus_javaagent-0.10.jar='${SPARK_MONITORING_DRIVER_PORT}':./spark-prometheus.yml'
      +                 SPARK_CLUSTER_OPTIONS="${SPARK_CLUSTER_OPTIONS} --num-executors ${EXECUTORS_INSTANCES}"
                   fi
       
      +            EXTRA_DRIVER_JAVA_OPTIONS='spark.driver.extraJavaOptions=-Dlog4j.configuration=log4j.properties'
      +            EXTRA_PROCESSOR_JAVA_OPTIONS='spark.executor.extraJavaOptions=-Dlog4j.configuration=log4j.properties'
      +
      +            engine_jar=`basename $engine_jar`
      +
      +            spark_standalone_classpath=`initSparkStandaloneClassPath ${SPARK_STANDALONE_DIR}`
      +
                   ${SPARK_HOME}/bin/spark-submit ${VERBOSE_OPTIONS} ${SPARK_CLUSTER_OPTIONS} \
                   --conf spark.metrics.conf="${CONF_DIR}/../monitoring/metrics.properties"  \
                   --conf "${EXTRA_DRIVER_JAVA_OPTIONS}" \
                   --conf "${EXTRA_PROCESSOR_JAVA_OPTIONS}" \
                   --class ${app_mainclass} \
      -            --jars ${app_classpath} ${engine_jar} \
      +            --jars ${spark_standalone_classpath} ${SPARK_STANDALONE_DIR}/${engine_jar} \
                   -conf ${CONF_FILE}
                   ;;
       
      
      From af2e8b79e2151c1804da5a6593adf36850095baa Mon Sep 17 00:00:00 2001
      From: Mathieu Rossignol 
      Date: Fri, 28 Feb 2020 15:30:02 +0100
      Subject: [PATCH 28/43] Added OpenDistro Elasticsearch integration test. For
       the moment, only user/password authentication without https is supported and validated
      
      ---
       .../elasticsearch/ESOpenDistroRule.java       | 102 ++++
       .../ElasticsearchOpenDistroContainer.java     |  99 ++++
       ...csearchOpenDistro_7_x_ClientServiceIT.java | 512 ++++++++++++++++++
       3 files changed, 713 insertions(+)
       create mode 100644 logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESOpenDistroRule.java
       create mode 100644 logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistroContainer.java
       create mode 100644 logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistro_7_x_ClientServiceIT.java
      
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESOpenDistroRule.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESOpenDistroRule.java
      new file mode 100644
      index 000000000..ba06dc4b2
      --- /dev/null
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESOpenDistroRule.java
      @@ -0,0 +1,102 @@
      +/**
      + * Copyright (C) 2020 Hurence (support@hurence.com)
      + *
      + * Licensed under the Apache License, Version 2.0 (the "License");
      + * you may not use this file except in compliance with the License.
      + * You may obtain a copy of the License at
      + *
      + *         http://www.apache.org/licenses/LICENSE-2.0
      + *
      + * Unless required by applicable law or agreed to in writing, software
      + * distributed under the License is distributed on an "AS IS" BASIS,
      + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      + * See the License for the specific language governing permissions and
      + * limitations under the License.
      + */
      +package com.hurence.logisland.service.elasticsearch;
      +
      +import org.apache.http.HttpHost;
      +import org.apache.http.auth.AuthScope;
      +import org.apache.http.auth.UsernamePasswordCredentials;
      +import org.apache.http.client.CredentialsProvider;
      +import org.apache.http.impl.client.BasicCredentialsProvider;
      +import org.apache.http.impl.nio.client.HttpAsyncClientBuilder;
      +import org.elasticsearch.client.RestClient;
      +import org.elasticsearch.client.RestClientBuilder;
      +import org.elasticsearch.client.RestHighLevelClient;
      +import org.junit.rules.TestRule;
      +import org.junit.runner.Description;
      +import org.junit.runners.model.Statement;
      +
      +/**
      +* A JUnit rule which starts an embedded OpenDistro Elasticsearch docker container to test security features
      +*/
      +public class ESOpenDistroRule implements TestRule {
      +
      +   /**
      +    * The REST high-level client that talks to the containerized node.
      +    */
      +   private RestHighLevelClient client;
      +   private ElasticsearchOpenDistroContainer container;
      +
      +   /**
      +    * Return a closure which starts an embedded ES OpenDistro docker container, executes the unit-test, then shuts down the
      +    * ES instance.
      +    */
      +   @Override
      +   public Statement apply(Statement base, Description description) {
      +       return new Statement() {
      +           @Override
      +           public void evaluate() throws Throwable {
      +               container = new ElasticsearchOpenDistroContainer("amazon/opendistro-for-elasticsearch:1.4.0", "admin", "admin");
      +               container.start();
      +
      +               /**
      +                * Inspired from https://github.com/opendistro-for-elasticsearch/community/issues/64
      +                */
      +
      +               final CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
      +               credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials("admin", "admin"));
      +
      +//               client = new RestHighLevelClient(RestClient.builder(HttpHost.create(container.getHostPortString())));
      +
      +               RestClientBuilder builder = RestClient.builder(
      +                       new HttpHost(container.getHostAddress(), container.getPort(), "http"))
      +                       .setHttpClientConfigCallback(new RestClientBuilder.HttpClientConfigCallback() {
      +                           @Override
      +                           public HttpAsyncClientBuilder customizeHttpClient(HttpAsyncClientBuilder httpClientBuilder) {
      +                               return httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider);
      +                           }
      +                       });
      +               client = new RestHighLevelClient(builder);
      +
      +               try {
      +                   base.evaluate(); // execute the unit test
      +               } finally {
      +                   client.close();
      +                   container.stop();
      +               }
      +           }
      +       };
      +   }
      +
      +    public String getHostPortString() {
      +        return container.getHostPortString();
      +    }
      +
      +    public String getHostAddress() {
      +        return container.getHostAddress();
      +    }
      +
      +    public int getPort() {
      +        return container.getPort();
      +    }
      +
      +   /**
      +    * Return the object through which operations can be performed on the ES cluster.
      +    */
      +   public RestHighLevelClient getClient() {
      +       return client;
      +   }
      +
      +}
      \ No newline at end of file
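      A short usage sketch, assuming the same JUnit 4 and Elasticsearch REST high-level client
      dependencies as the integration test below are on the classpath: the rule is registered as a
      @ClassRule so the OpenDistro container is started once per test class, and its authenticated
      client is then available from getClient(). The index name is a placeholder.

          import org.elasticsearch.client.RequestOptions;
          import org.elasticsearch.client.indices.GetIndexRequest;
          import org.junit.Assert;
          import org.junit.ClassRule;
          import org.junit.Test;

          public class ESOpenDistroRuleUsageSketch {

              // Starts the secured OpenDistro container before the tests and stops it afterwards.
              @ClassRule
              public static final ESOpenDistroRule esOpenDistroRule = new ESOpenDistroRule();

              @Test
              public void clusterIsReachable() throws Exception {
                  // The rule exposes a REST high-level client already authenticated as admin/admin.
                  boolean exists = esOpenDistroRule.getClient()
                          .indices()
                          .exists(new GetIndexRequest("some-placeholder-index"), RequestOptions.DEFAULT);
                  Assert.assertFalse(exists);
              }
          }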
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistroContainer.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistroContainer.java
      new file mode 100644
      index 000000000..0db51bdef
      --- /dev/null
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistroContainer.java
      @@ -0,0 +1,99 @@
      +/**
      + * Copyright (C) 2020 Hurence (support@hurence.com)
      + *
      + * Licensed under the Apache License, Version 2.0 (the "License");
      + * you may not use this file except in compliance with the License.
      + * You may obtain a copy of the License at
      + *
      + *         http://www.apache.org/licenses/LICENSE-2.0
      + *
      + * Unless required by applicable law or agreed to in writing, software
      + * distributed under the License is distributed on an "AS IS" BASIS,
      + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      + * See the License for the specific language governing permissions and
      + * limitations under the License.
      + */
      +package com.hurence.logisland.service.elasticsearch;
      +
      +import org.testcontainers.containers.GenericContainer;
      +import org.testcontainers.containers.wait.strategy.HttpWaitStrategy;
      +import org.testcontainers.utility.Base58;
      +
      +import java.net.InetSocketAddress;
      +import java.time.Duration;
      +
      +import static java.net.HttpURLConnection.HTTP_OK;
      +
      +/**
      + * Represents an OpenDistro Elasticsearch docker instance which exposes by default ports 9200 and 9300 (transport.tcp.port).
      + * The docker image is by default fetched from amazon/opendistro-for-elasticsearch
      + */
      +public class ElasticsearchOpenDistroContainer extends GenericContainer<ElasticsearchOpenDistroContainer> {
      +
      +    /**
      +     * Elasticsearch Default HTTP port
      +     */
      +    private static final int ELASTICSEARCH_OPENDISTRO_DEFAULT_PORT = 9200;
      +
      +    /**
      +     * Elasticsearch Default Transport port
      +     */
      +    private static final int ELASTICSEARCH_OPENDISTRO_DEFAULT_TCP_PORT = 9300;
      +
      +    /**
      +     * OpenDistro Elasticsearch Docker image name
      +     */
      +    private static final String ELASTICSEARCH_OPENDISTRO_DEFAULT_IMAGE = "amazon/opendistro-for-elasticsearch";
      +
      +    /**
      +     * Elasticsearch Default version
      +     */
      +    protected static final String ELASTICSEARCH_OPENDISTRO_DEFAULT_VERSION = "1.4.0";
      +
      +    public ElasticsearchOpenDistroContainer() {
      +        this(ELASTICSEARCH_OPENDISTRO_DEFAULT_IMAGE + ":" + ELASTICSEARCH_OPENDISTRO_DEFAULT_VERSION, null, null);
      +    }
      +
      +    /**
      +     * Create an OpenDistro Elasticsearch Container by passing the full docker image name
      +     * @param dockerImageName Full docker image name, like: amazon/opendistro-for-elasticsearch:1.4.0
      +     */
      +    public ElasticsearchOpenDistroContainer(String dockerImageName, String user, String password) {
      +        super(dockerImageName);
      +
      +        logger().info("Starting an opendistro elasticsearch container using [{}]", dockerImageName);
      +        withNetworkAliases("elasticsearch-opendistro-" + Base58.randomString(6));
      +        withEnv("discovery.type", "single-node");
      +        withEnv("opendistro_security.ssl.http.enabled", "false"); // Disable https
      +//        withEnv("opendistro_security.disabled", "true"); // Completely disable security (https; authentication...)
      +        addExposedPorts(ELASTICSEARCH_OPENDISTRO_DEFAULT_PORT, ELASTICSEARCH_OPENDISTRO_DEFAULT_TCP_PORT);
      +        HttpWaitStrategy httpWaitStrategy = new HttpWaitStrategy()
      +                .forPort(ELASTICSEARCH_OPENDISTRO_DEFAULT_PORT)
      +                .forStatusCodeMatching(response -> response == HTTP_OK);
      +//                .usingTls()
      +
      +        // Ideally we would like to be able to set up the user with the passed one. For the moment we only support the
      +        // out of the box admin/admin user
      +        if ( (user != null) && (password != null) ) {
      +            httpWaitStrategy.withBasicCredentials(user, password);
      +        }
      +//        setWaitStrategy(httpWaitStrategy.withStartupTimeout(Duration.ofMinutes(2)));
      +        setWaitStrategy(httpWaitStrategy.withStartupTimeout(Duration.ofSeconds(30)));
      +    }
      +
      +    public String getHostPortString() {
      +        return getContainerIpAddress() + ":" + getMappedPort(ELASTICSEARCH_OPENDISTRO_DEFAULT_PORT);
      +    }
      +
      +    public String getHostAddress() {
      +        return getContainerIpAddress();
      +    }
      +
      +    public int getPort() {
      +        return getMappedPort(ELASTICSEARCH_OPENDISTRO_DEFAULT_PORT);
      +    }
      +
      +    public InetSocketAddress getTcpHost() {
      +        return new InetSocketAddress(getContainerIpAddress(), getMappedPort(ELASTICSEARCH_OPENDISTRO_DEFAULT_TCP_PORT));
      +    }
      +}
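      For reference, a small sketch of driving this container directly with Testcontainers, outside
      of the JUnit rule. The image name and the out-of-the-box admin/admin credentials follow the
      defaults used above; only the constructor and accessors defined on the container class are
      used, plus Testcontainers' start()/stop().

          public class OpenDistroContainerSketch {

              public static void main(String[] args) {
                  // Same image and credentials as the defaults declared in the container class above.
                  ElasticsearchOpenDistroContainer container =
                          new ElasticsearchOpenDistroContainer("amazon/opendistro-for-elasticsearch:1.4.0", "admin", "admin");
                  container.start();
                  try {
                      // host:mappedPort string, suitable for the hosts property of the ES client service.
                      System.out.println("OpenDistro reachable at " + container.getHostPortString());
                  } finally {
                      container.stop();
                  }
              }
          }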
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistro_7_x_ClientServiceIT.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistro_7_x_ClientServiceIT.java
      new file mode 100644
      index 000000000..c177e55a8
      --- /dev/null
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistro_7_x_ClientServiceIT.java
      @@ -0,0 +1,512 @@
      +/**
      + * Copyright (C) 2020 Hurence (support@hurence.com)
      + *
      + * Licensed under the Apache License, Version 2.0 (the "License");
      + * you may not use this file except in compliance with the License.
      + * You may obtain a copy of the License at
      + *
      + *         http://www.apache.org/licenses/LICENSE-2.0
      + *
      + * Unless required by applicable law or agreed to in writing, software
      + * distributed under the License is distributed on an "AS IS" BASIS,
      + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      + * See the License for the specific language governing permissions and
      + * limitations under the License.
      + */
      +package com.hurence.logisland.service.elasticsearch;
      +
      +import com.hurence.logisland.classloading.PluginProxy;
      +import com.hurence.logisland.component.InitializationException;
      +import com.hurence.logisland.record.FieldType;
      +import com.hurence.logisland.record.Record;
      +import com.hurence.logisland.record.StandardRecord;
      +import com.hurence.logisland.service.datastore.InvalidMultiGetQueryRecordException;
      +import com.hurence.logisland.service.datastore.MultiGetQueryRecord;
      +import com.hurence.logisland.service.datastore.MultiGetResponseRecord;
      +import com.hurence.logisland.util.runner.TestRunner;
      +import com.hurence.logisland.util.runner.TestRunners;
      +import org.elasticsearch.ElasticsearchStatusException;
      +import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
      +import org.elasticsearch.client.RequestOptions;
      +import org.elasticsearch.client.indices.GetIndexRequest;
      +import org.elasticsearch.client.indices.GetIndexResponse;
      +import org.elasticsearch.common.unit.TimeValue;
      +import org.junit.After;
      +import org.junit.Assert;
      +import org.junit.ClassRule;
      +import org.junit.Test;
      +import org.slf4j.Logger;
      +import org.slf4j.LoggerFactory;
      +
      +import java.io.IOException;
      +import java.util.*;
      +
      +import static com.hurence.logisland.service.elasticsearch.ElasticsearchClientService.*;
      +
      +public class ElasticsearchOpenDistro_7_x_ClientServiceIT {
      +
      +    private static final String MAPPING1 = "{'properties':{'name':{'type': 'text'},'val':{'type':'integer'}}}";
      +    private static final String MAPPING2 = "{'properties':{'name':{'type': 'text'},'val':{'type': 'text'}}}";
      +    private static final String MAPPING3 =
      +            "{'dynamic':'strict','properties':{'name':{'type': 'text'},'xyz':{'type': 'text'}}}";
      +
      +    private static Logger logger = LoggerFactory.getLogger(ElasticsearchOpenDistro_7_x_ClientServiceIT.class);
      +
      +    @ClassRule
      +    public static final ESOpenDistroRule esOpenDistroRule = new ESOpenDistroRule();
      +
      +    @After
      +    public void clean() throws IOException {
      +//        ClusterHealthRequest is returning nothing... So we are using GetIndexRequest here
      +        GetIndexRequest request = new GetIndexRequest("*");
      +        GetIndexResponse response;
      +        try {
      +            response = esOpenDistroRule.getClient().indices().get(request, RequestOptions.DEFAULT);
      +        } catch (ElasticsearchStatusException ex) {
      +            return;//should be index not found
      +        }
      +        String[] indices = response.getIndices();
      +        List<String> indicesToClean = new ArrayList<>();
      +        // Do not remove .opendistro_security mandatory index
      +        Arrays.stream(indices).forEach(index -> {
      +            if (!index.equals(".opendistro_security")) {
      +                indicesToClean.add(index);
      +            }
      +        });
      +        if (indicesToClean.size() > 0) {
      +            logger.info("Cleaning indices:" + indicesToClean);
      +            DeleteIndexRequest deleteRequest = new DeleteIndexRequest(indicesToClean.toArray(new String[0]));
      +            Assert.assertTrue(esOpenDistroRule.getClient().indices().delete(deleteRequest, RequestOptions.DEFAULT).isAcknowledged());
      +        } else {
      +            logger.info("No index to clean");
      +        }
      +    }
      +
      +    private ElasticsearchClientService configureElasticsearchOpenDistroClientService(final TestRunner runner) throws InitializationException {
      +        final Elasticsearch_7_x_ClientService elasticsearchClientService = new Elasticsearch_7_x_ClientService();
      +
      +        runner.addControllerService("elasticsearchClient", elasticsearchClientService);
      +
      +        runner.setProperty(TestProcessor.ELASTICSEARCH_CLIENT_SERVICE, "elasticsearchClient");
      +        runner.setProperty(elasticsearchClientService, HOSTS, esOpenDistroRule.getHostPortString());
      +        runner.setProperty(elasticsearchClientService, USERNAME, "admin");
      +        runner.setProperty(elasticsearchClientService, PASSWORD, "admin");
      +        runner.enableControllerService(elasticsearchClientService);
      +
      +        // TODO : is this necessary ?
      +        final ElasticsearchClientService service = PluginProxy.unwrap(runner.getProcessContext().getPropertyValue(TestProcessor.ELASTICSEARCH_CLIENT_SERVICE).asControllerService());
      +        return service;
      +    }
      +
      +    @Test
      +    public void testBasics() throws Exception {
      +
      +        Map<String, Object> document1 = new HashMap<>();
      +        document1.put("name", "fred");
      +        document1.put("val", 33);
      +
      +        boolean result;
      +
      +        final TestRunner runner = TestRunners.newTestRunner(new TestProcessor());
      +
      +        final ElasticsearchClientService elasticsearchClientService = configureElasticsearchOpenDistroClientService(runner);
      +
      +
      +        // Verify the index does not exist
      +        Assert.assertEquals(false, elasticsearchClientService.existsCollection("foo"));
      +
      +        // Define the index
      +        elasticsearchClientService.createCollection("foo", 2, 1);
      +        Assert.assertEquals(true, elasticsearchClientService.existsCollection("foo"));
      +
      +        // Define another index
      +        elasticsearchClientService.createCollection("bar", 2, 1);
      +        Assert.assertEquals(true, elasticsearchClientService.existsCollection("foo"));
      +
      +        // Add a mapping to foo
      +        result = elasticsearchClientService.putMapping("foo", null, MAPPING1.replace('\'', '"'));
      +        Assert.assertEquals(true, result);
      +
      +        // Add the same mapping again
      +        result = elasticsearchClientService.putMapping("foo", null, MAPPING1.replace('\'', '"'));
      +        Assert.assertEquals(true, result);
      +
      +        // create alias
      +        elasticsearchClientService.createAlias("foo", "aliasFoo");
      +        Assert.assertEquals(true, elasticsearchClientService.existsCollection("aliasFoo"));
      +
      +        // Insert a record into foo and count foo
      +        Assert.assertEquals(0, elasticsearchClientService.countCollection("foo"));
      +        elasticsearchClientService.saveSync("foo", null, document1);
      +        Assert.assertEquals(1, elasticsearchClientService.countCollection("foo"));
      +
      +        // copy index foo to bar - should work
      +        Assert.assertEquals(0, elasticsearchClientService.countCollection("bar"));
      +        elasticsearchClientService.copyCollection(TimeValue.timeValueMinutes(2).toString(), "foo", "bar");
      +        elasticsearchClientService.bulkFlush();
      +        Thread.sleep(2000);
      +        elasticsearchClientService.refreshCollection("bar");
      +        Assert.assertEquals(1, elasticsearchClientService.countCollection("bar"));
      +
      +        // Define incompatible mappings in two different indexes, then try to copy - should fail
      +        // as a document registered in index foo cannot be written in index baz.
      +        //
      +        // Note: MAPPING2 cannot be added to index foo or bar at all, even under a different doctype, as ES (lucene)
      +        // does not allow two types for the same field-name in different mappings of the same index. However if
      +        // MAPPING2 is added to index baz, then the copyCollection succeeds - because by default ES automatically converts
      +        // integers into strings when necessary. Interestingly, this means MAPPING1 and MAPPING2 are not compatible
      +        // at the "put mapping" level, but are compatible at the "reindex" level..
      +        //
      +        // The document (doc1) of type "type1" already in index "foo" cannot be inserted into index "baz" as type1
      +        // because that means applying its source to MAPPING3 - but MAPPING3 is strict and does not define property
      +        // "val", so the insert fails.
      +        elasticsearchClientService.createCollection("baz",2, 1);
      +        elasticsearchClientService.putMapping("baz", null, MAPPING2.replace('\'', '"'));
      +
      +//       try {
      +//            elasticsearchClientService.copyCollection(TimeValue.timeValueMinutes(2).toString(), "foo", "baz");
      +//            Assert.fail("Exception not thrown when expected");
      +//        } catch(DatastoreClientServiceException e) {
      +//            Assert.assertTrue(e.getMessage().contains("Reindex failed"));
      +//        }
      +        elasticsearchClientService.refreshCollection("baz");
      +        Assert.assertEquals(0, elasticsearchClientService.countCollection("baz"));
      +
      +        // Drop index foo
      +        elasticsearchClientService.dropCollection("foo");
      +        Assert.assertEquals(false, elasticsearchClientService.existsCollection("foo"));
      +        Assert.assertEquals(false, elasticsearchClientService.existsCollection("aliasFoo")); // alias for foo disappears too
      +        Assert.assertEquals(true, elasticsearchClientService.existsCollection("bar"));
      +    }
      +
      +    @Test
      +    public void testBulkPut() throws InitializationException, IOException, InterruptedException {
      +        final String index = "foo";
      +        final String docId = "id1";
      +        final String nameKey = "name";
      +        final String nameValue = "fred";
      +        final String ageKey = "age";
      +        final int ageValue = 33;
      +
      +        Map<String, Object> document1 = new HashMap<>();
      +        document1.put(nameKey, nameValue);
      +        document1.put(ageKey, ageValue);
      +
      +        final TestRunner runner = TestRunners.newTestRunner(new TestProcessor());
      +
      +        // create the controller service and link it to the test processor :
      +        final ElasticsearchClientService elasticsearchClientService = configureElasticsearchOpenDistroClientService(runner);
      +
      +        // Verify the index does not exist
      +        Assert.assertEquals(false, elasticsearchClientService.existsCollection(index));
      +
      +        // Create the index
      +        elasticsearchClientService.createCollection(index,2, 1);
      +        Assert.assertEquals(true, elasticsearchClientService.existsCollection(index));
      +
      +        // Put a document in the bulk processor :
      +        elasticsearchClientService.bulkPut(index, null, document1, Optional.of(docId));
      +        // Flush the bulk processor :
      +        elasticsearchClientService.bulkFlush();
      +        Thread.sleep(2000);
      +        try {
      +            // Refresh the index :
      +            elasticsearchClientService.refreshCollection(index);
      +        } catch (Exception e) {
      +            logger.error("Error while refreshing the index : " + e.toString());
      +        }
      +
      +        long documentsNumber = 0;
      +
      +        try {
      +            documentsNumber = elasticsearchClientService.countCollection(index);
      +        } catch (Exception e) {
      +            logger.error("Error while counting the number of documents in the index : " + e.toString());
      +        }
      +
      +        Assert.assertEquals(1, documentsNumber);
      +
      +        try {
      +            elasticsearchClientService.saveSync(index, null, document1);
      +        } catch (Exception e) {
      +            logger.error("Error while saving the document in the index : " + e.toString());
      +        }
      +
      +        try {
      +            documentsNumber = elasticsearchClientService.countCollection(index);
      +        } catch (Exception e) {
      +            logger.error("Error while counting the number of documents in the index : " + e.toString());
      +        }
      +
      +        Assert.assertEquals(2, documentsNumber);
      +
      +        long numberOfHits = elasticsearchClientService.searchNumberOfHits(index, null, nameKey, nameValue);
      +
      +        Assert.assertEquals(2, numberOfHits);
      +
      +    }
      +
      +
      +    @Test
      +    public void testBulkPutGeopoint() throws InitializationException, InterruptedException {
      +        final String index = "future_factory";
      +        final String docId = "modane_factory";
      +        Record record = new StandardRecord("factory")
      +                .setId(docId)
      +                .setStringField("address", "rue du Frejus")
      +                .setField("latitude", FieldType.FLOAT, 45.4f)
      +                .setField("longitude", FieldType.FLOAT, 45.4f);
      +
      +        final TestRunner runner = TestRunners.newTestRunner(new TestProcessor());
      +
      +        // create the controller service and link it to the test processor :
      +        final ElasticsearchClientService elasticsearchClientService = configureElasticsearchOpenDistroClientService(runner);
      +
      +        // Verify the index does not exist
      +        Assert.assertEquals(false, elasticsearchClientService.existsCollection(index));
      +
      +        // Create the index
      +        elasticsearchClientService.createCollection(index, 2, 1);
      +        Assert.assertEquals(true, elasticsearchClientService.existsCollection(index));
      +
      +        // Put a document in the bulk processor :
      +        String document1 = ElasticsearchRecordConverter.convertToString(record);
      +        elasticsearchClientService.bulkPut(index, null, document1, Optional.of(docId));
      +        // Flush the bulk processor :
      +        elasticsearchClientService.bulkFlush();
      +        Thread.sleep(2000);
      +        try {
      +            // Refresh the index :
      +            elasticsearchClientService.refreshCollection(index);
      +        } catch (Exception e) {
      +            logger.error("Error while refreshing the index : " + e.toString());
      +        }
      +
      +        long documentsNumber = 0;
      +
      +        try {
      +            documentsNumber = elasticsearchClientService.countCollection(index);
      +        } catch (Exception e) {
      +            logger.error("Error while counting the number of documents in the index : " + e.toString());
      +        }
      +
      +        Assert.assertEquals(1, documentsNumber);
      +
      +        List<MultiGetQueryRecord> multiGetQueryRecords = new ArrayList<>();
      +        ArrayList<String> documentIds = new ArrayList<>();
      +        List<MultiGetResponseRecord> multiGetResponseRecords = new ArrayList<>();
      +
      +
      +        // Retrieve the indexed document by its id and make sure it is found :
      +        documentIds.add(docId);
      +        try {
      +            multiGetQueryRecords.add(new MultiGetQueryRecord(index, null, new String[]{"location", "id"}, new String[]{}, documentIds));
      +        } catch (InvalidMultiGetQueryRecordException e) {
      +            e.printStackTrace();
      +        }
      +        multiGetResponseRecords = elasticsearchClientService.multiGet(multiGetQueryRecords);
      +        Assert.assertEquals(1, multiGetResponseRecords.size()); // number of documents retrieved
      +
      +    }
      +
      +
      +    @Test
      +    public void testMultiGet() throws InitializationException, InterruptedException, InvalidMultiGetQueryRecordException {
      +        final String index1 = "index1";
      +        final String index2 = "index2";
      +
      +        Map<String, Object> document1 = new HashMap<>();
      +        final String docId1 = "id1";
      +        document1.put("field_beg_1", "field_beg_1_document1_value");
      +        document1.put("field_beg_2", "field_beg_2_document1_value");
      +        document1.put("field_beg_3", "field_beg_3_document1_value");
      +        document1.put("field_fin_1", "field_fin_1_document1_value");
      +        document1.put("field_fin_2", "field_fin_2_document1_value");
      +
      +        Map<String, Object> document2 = new HashMap<>();
      +        final String docId2 = "id2";
      +        document2.put("field_beg_1", "field_beg_1_document2_value");
      +        document2.put("field_beg_2", "field_beg_2_document2_value");
      +        document2.put("field_beg_3", "field_beg_3_document2_value");
      +        document2.put("field_fin_1", "field_fin_1_document2_value");
      +        document2.put("field_fin_2", "field_fin_2_document2_value");
      +
      +        Map<String, Object> document3 = new HashMap<>();
      +        final String docId3 = "id3";
      +        document3.put("field_beg_1", "field_beg_1_document3_value");
      +        document3.put("field_beg_2", "field_beg_2_document3_value");
      +        // this 3rd field is intentionally removed :
      +        // document3.put("field_beg_3", "field_beg_3_document3_value");
      +        document3.put("field_fin_1", "field_fin_1_document3_value");
      +        document3.put("field_fin_2", "field_fin_2_document3_value");
      +
      +        final TestRunner runner = TestRunners.newTestRunner(new TestProcessor());
      +
      +        // create the controller service and link it to the test processor :
      +        final ElasticsearchClientService elasticsearchClientService = configureElasticsearchOpenDistroClientService(runner);
      +
      +        // Verify the indexes do not exist
      +        Assert.assertEquals(false, elasticsearchClientService.existsCollection(index1));
      +        Assert.assertEquals(false, elasticsearchClientService.existsCollection(index2));
      +
      +        // Create the indexes
      +        elasticsearchClientService.createCollection(index1, 2, 1);
      +        elasticsearchClientService.createCollection(index2, 2, 1);
      +        Assert.assertEquals(true, elasticsearchClientService.existsCollection(index1));
      +        Assert.assertEquals(true, elasticsearchClientService.existsCollection(index2));
      +
      +        // Put documents in the bulk processor :
      +        elasticsearchClientService.bulkPut(index1, null, document1, Optional.of(docId1));
      +        elasticsearchClientService.bulkPut(index1, null, document2, Optional.of(docId2));
      +        elasticsearchClientService.bulkPut(index1, null, document3, Optional.of(docId3));
      +        elasticsearchClientService.bulkPut(index2, null, document1, Optional.of(docId1));
      +        elasticsearchClientService.bulkPut(index2, null, document2, Optional.of(docId2));
      +        elasticsearchClientService.bulkPut(index2, null, document3, Optional.of(docId3));
      +        // Flush the bulk processor :
      +        elasticsearchClientService.bulkFlush();
      +        Thread.sleep(2000);
      +        try {
      +            // Refresh the indexes :
      +            elasticsearchClientService.refreshCollection(index1);
      +            elasticsearchClientService.refreshCollection(index2);
      +        } catch (Exception e) {
      +            logger.error("Error while refreshing the indexes : " + e.toString());
      +        }
      +
      +        long countIndex1 = 0;
      +        long countIndex2 = 0;
      +        try {
      +            countIndex1 = elasticsearchClientService.countCollection(index1);
      +            countIndex2 = elasticsearchClientService.countCollection(index2);
      +        } catch (Exception e) {
      +            logger.error("Error while counting the number of documents in the index : " + e.toString());
      +        }
      +        Assert.assertEquals(3, countIndex1);
      +        Assert.assertEquals(3, countIndex2);
      +
      +        List<MultiGetQueryRecord> multiGetQueryRecords = new ArrayList<>();
      +        ArrayList<String> documentIds = new ArrayList<>();
      +        ArrayList<String> documentIds_2 = new ArrayList<>();
      +        List<MultiGetResponseRecord> multiGetResponseRecords;
      +        String[] fieldsToInclude = {"field_b*", "field*1"};
      +        String[] fieldsToExclude = {"field_*2"};
      +
      +        // Make sure a dummy query returns no result :
      +        documentIds.add(docId1);
      +        multiGetQueryRecords.add(new MultiGetQueryRecord("dummy", "", new String[]{"dummy"}, new String[]{}, documentIds));
      +        multiGetResponseRecords = elasticsearchClientService.multiGet(multiGetQueryRecords);
      +        Assert.assertEquals(0, multiGetResponseRecords.size()); // number of documents retrieved
      +
      +        multiGetQueryRecords.clear();
      +        documentIds.clear();
      +        multiGetResponseRecords.clear();
      +
      +        // Test : 1 MultiGetQueryRecord record, with 1 index, 1 type, 1 id, WITHOUT includes, WITHOUT excludes :
      +        documentIds.add(docId1);
      +        multiGetQueryRecords.add(new MultiGetQueryRecord(index1, null, documentIds));
      +        multiGetResponseRecords = elasticsearchClientService.multiGet(multiGetQueryRecords);
      +
      +        Assert.assertEquals(1, multiGetResponseRecords.size()); // number of documents retrieved
      +        Assert.assertEquals(index1, multiGetResponseRecords.get(0).getCollectionName());
      +        Assert.assertEquals("_doc", multiGetResponseRecords.get(0).getTypeName());
      +        Assert.assertEquals(docId1, multiGetResponseRecords.get(0).getDocumentId());
      +        Assert.assertEquals(5, multiGetResponseRecords.get(0).getRetrievedFields().size()); // number of fields retrieved for the document
      +        multiGetResponseRecords.get(0).getRetrievedFields().forEach((k, v) -> Assert.assertEquals(document1.get(k), v.toString()));
      +
      +        multiGetQueryRecords.clear();
      +        documentIds.clear();
      +        multiGetResponseRecords.clear();
      +
      +        // Test : 1 MultiGetQueryRecord record, with 1 index, 0 type, 3 ids, WITH include, WITH exclude :
      +        documentIds.add(docId1);
      +        documentIds.add(docId2);
      +        documentIds.add(docId3);
      +        multiGetQueryRecords.add(new MultiGetQueryRecord(index1, null, fieldsToInclude, fieldsToExclude, documentIds));
      +        multiGetResponseRecords = elasticsearchClientService.multiGet(multiGetQueryRecords);
      +
      +        Assert.assertEquals(3, multiGetResponseRecords.size()); // verify that 3 documents have been retrieved
      +        multiGetResponseRecords.forEach(responseRecord -> Assert.assertEquals(index1, responseRecord.getCollectionName())); // verify that all retrieved documents are in index1
      +        multiGetResponseRecords.forEach(responseRecord -> Assert.assertEquals("_doc", responseRecord.getTypeName())); // verify that the type of all retrieved documents is _doc
      +        multiGetResponseRecords.forEach(responseRecord -> {
      +            if (responseRecord.getDocumentId().equals(docId1)) {
      +                Assert.assertEquals(3, responseRecord.getRetrievedFields().size()); // for document1, verify that 3 fields have been retrieved
      +                // verify that the 3 retrieved fields are the correct ones :
      +                Assert.assertEquals(true, responseRecord.getRetrievedFields().containsKey("field_beg_1"));
      +                Assert.assertEquals(true, responseRecord.getRetrievedFields().containsKey("field_beg_3"));
      +                Assert.assertEquals(true, responseRecord.getRetrievedFields().containsKey("field_fin_1"));
      +                // verify that the values of the 3 retrieved fields are the correct ones :
      +                Assert.assertEquals("field_beg_1_document1_value", responseRecord.getRetrievedFields().get("field_beg_1").toString());
      +                Assert.assertEquals("field_beg_3_document1_value", responseRecord.getRetrievedFields().get("field_beg_3").toString());
      +                Assert.assertEquals("field_fin_1_document1_value", responseRecord.getRetrievedFields().get("field_fin_1").toString());
      +            }
      +            if (responseRecord.getDocumentId().equals(docId2))
      +                Assert.assertEquals(3, responseRecord.getRetrievedFields().size()); // for document2, verify that 3 fields have been retrieved
      +            if (responseRecord.getDocumentId().equals(docId3))
      +                Assert.assertEquals(2, responseRecord.getRetrievedFields().size()); // for document3, verify that 2 fields have been retrieved
      +        });
      +
      +        multiGetQueryRecords.clear();
      +        documentIds.clear();
      +        multiGetResponseRecords.clear();
      +
      +        // Test : 2 MultiGetQueryRecord records :
      +        //    - 1st : 1 index (index1), 1 type, 2 ids, WITH include, WITH exclude    --> expecting : 2 docs retrieved (from index1), 3 fields each (except doc3 : 2 fields)
      +        //    - 2nd : 1 index (index2), 0 type, 3 ids, WITH include, WITHOUT exclude --> expecting : 3 docs retrieved (from index2), 4 fields each (except doc3 : 3 fields)
      +        documentIds.add(docId1);
      +        documentIds.add(docId2);
      +        multiGetQueryRecords.add(new MultiGetQueryRecord(index1, null, fieldsToInclude, fieldsToExclude, documentIds));
      +        documentIds_2.add(docId1);
+        documentIds_2.add(docId2);
+        documentIds_2.add(docId3);
      +        multiGetQueryRecords.add(new MultiGetQueryRecord(index2, null, fieldsToInclude, null, documentIds_2));
      +        multiGetResponseRecords = elasticsearchClientService.multiGet(multiGetQueryRecords);
      +
+        Assert.assertEquals(5, multiGetResponseRecords.size()); // verify that 5 documents have been retrieved
      +        multiGetResponseRecords.forEach(responseRecord -> {
+            if (responseRecord.getCollectionName().equals(index1) && !responseRecord.getDocumentId().equals(docId3))
+                Assert.assertEquals(3, responseRecord.getRetrievedFields().size()); // for documents from index1 (except doc3), verify that 3 fields have been retrieved
+            if (responseRecord.getCollectionName().equals(index1) && responseRecord.getDocumentId().equals(docId3))
+                Assert.assertEquals(2, responseRecord.getRetrievedFields().size()); // for document3 from index1, verify that 2 fields have been retrieved
+            if (responseRecord.getCollectionName().equals(index2) && !responseRecord.getDocumentId().equals(docId3))
+                Assert.assertEquals(4, responseRecord.getRetrievedFields().size()); // for documents from index2 (except doc3), verify that 4 fields have been retrieved
+            if (responseRecord.getCollectionName().equals(index2) && responseRecord.getDocumentId().equals(docId3))
+                Assert.assertEquals(3, responseRecord.getRetrievedFields().size()); // for document3 from index2, verify that 3 fields have been retrieved
      +        });
      +
      +    }
      +
      +    @Test
      +    public void testMultiGetInvalidRecords() {
      +
+        List<MultiGetQueryRecord> multiGetQueryRecords = new ArrayList<>();
      +
      +        String errorMessage = "";
      +
      +        // Validate null index behaviour :
      +        try {
      +            multiGetQueryRecords.add(new MultiGetQueryRecord(null, null, null, null, null));
      +        } catch (InvalidMultiGetQueryRecordException e) {
      +            errorMessage = e.getMessage();
      +        }
+        Assert.assertEquals("The index name cannot be null", errorMessage);
      +
      +        // Validate empty index behaviour :
      +        try {
      +            multiGetQueryRecords.add(new MultiGetQueryRecord("", null, null, null, null));
      +        } catch (InvalidMultiGetQueryRecordException e) {
      +            errorMessage = e.getMessage();
      +        }
+        Assert.assertEquals("The index name cannot be empty", errorMessage);
      +
      +        // Validate null documentIds behaviour :
      +        try {
      +            multiGetQueryRecords.add(new MultiGetQueryRecord("dummy", null, null, null, null));
      +        } catch (InvalidMultiGetQueryRecordException e) {
      +            errorMessage = e.getMessage();
      +        }
+        Assert.assertEquals("The list of document ids cannot be null", errorMessage);
      +
      +        // Make sure no invalid MultiGetQueryRecord has been added to multiGetQueryRecords list :
      +        Assert.assertEquals(0, multiGetQueryRecords.size());
      +    }
      +}
      
      From bd0636ad6300172d40f57b66215ce3df0cfa6d25 Mon Sep 17 00:00:00 2001
      From: Mathieu Rossignol 
      Date: Fri, 28 Feb 2020 15:38:09 +0100
      Subject: [PATCH 29/43] Centralized opendistro user name and password in
       integration tests
      
      ---
       .../service/elasticsearch/ESOpenDistroRule.java | 17 ++++++++++++-----
       ...ticsearchOpenDistro_7_x_ClientServiceIT.java | 12 +++++++++---
       2 files changed, 21 insertions(+), 8 deletions(-)
      
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESOpenDistroRule.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESOpenDistroRule.java
      index ba06dc4b2..d37aa1505 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESOpenDistroRule.java
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESOpenDistroRule.java
      @@ -38,8 +38,15 @@ public class ESOpenDistroRule implements TestRule {
           */
          private RestHighLevelClient client;
          private ElasticsearchOpenDistroContainer container;
      +   private String opendistroUsername;
      +    private String opendistroPassword;
       
      -   /**
      +    public ESOpenDistroRule(String opendistroUsername, String opendistroPassword) {
      +        this.opendistroUsername = opendistroUsername;
      +        this.opendistroPassword = opendistroPassword;
      +    }
      +
      +    /**
           * Return a closure which starts an embedded ES OpenDistro docker container, executes the unit-test, then shuts down the
           * ES instance.
           */
      @@ -48,7 +55,8 @@ public Statement apply(Statement base, Description description) {
              return new Statement() {
                  @Override
                  public void evaluate() throws Throwable {
      -               container = new ElasticsearchOpenDistroContainer("amazon/opendistro-for-elasticsearch:1.4.0", "admin", "admin");
      +               container = new ElasticsearchOpenDistroContainer("amazon/opendistro-for-elasticsearch:1.4.0",
      +                       opendistroUsername, opendistroPassword);
                      container.start();
       
                      /**
      @@ -56,9 +64,8 @@ public void evaluate() throws Throwable {
                       */
       
                      final CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
      -               credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials("admin", "admin"));
      -
      -//               client = new RestHighLevelClient(RestClient.builder(HttpHost.create(container.getHostPortString())));
      +               credentialsProvider.setCredentials(AuthScope.ANY,
      +                       new UsernamePasswordCredentials(opendistroUsername, opendistroPassword));
       
                      RestClientBuilder builder = RestClient.builder(
                              new HttpHost(container.getHostAddress(), container.getPort(), "http"))
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistro_7_x_ClientServiceIT.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistro_7_x_ClientServiceIT.java
      index c177e55a8..1ce24573f 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistro_7_x_ClientServiceIT.java
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistro_7_x_ClientServiceIT.java
      @@ -52,8 +52,14 @@ public class ElasticsearchOpenDistro_7_x_ClientServiceIT {
       
           private static Logger logger = LoggerFactory.getLogger(ElasticsearchOpenDistro_7_x_ClientServiceIT.class);
       
      +    // For the moment, the ES opendistro container does not support configuring and using another user/password than
+    // admin/admin. To be allowed to change that, the ElasticsearchOpenDistroContainer constructor must find a way
+    // to configure a new user/password when starting the opendistro container.
      +    public static final String OPENDISTRO_USERNAME = "admin";
      +    public static final String OPENDISTRO_PASSWORD = "admin";
      +
           @ClassRule
      -    public static final ESOpenDistroRule esOpenDistroRule = new ESOpenDistroRule();
      +    public static final ESOpenDistroRule esOpenDistroRule = new ESOpenDistroRule(OPENDISTRO_USERNAME, OPENDISTRO_PASSWORD);
       
           @After
           public void clean() throws IOException {
      @@ -89,8 +95,8 @@ private ElasticsearchClientService configureElasticsearchOpenDistroClientService
       
               runner.setProperty(TestProcessor.ELASTICSEARCH_CLIENT_SERVICE, "elasticsearchClient");
               runner.setProperty(elasticsearchClientService, HOSTS, esOpenDistroRule.getHostPortString());
      -        runner.setProperty(elasticsearchClientService, USERNAME, "admin");
      -        runner.setProperty(elasticsearchClientService, PASSWORD, "admin");
      +        runner.setProperty(elasticsearchClientService, USERNAME, OPENDISTRO_USERNAME);
      +        runner.setProperty(elasticsearchClientService, PASSWORD, OPENDISTRO_PASSWORD);
               runner.enableControllerService(elasticsearchClientService);
       
               // TODO : is this necessary ?
      
      From 85b1c6e967ed7d3a092e345f714755ac861b4ea8 Mon Sep 17 00:00:00 2001
      From: Mathieu Rossignol 
      Date: Fri, 28 Feb 2020 16:56:50 +0100
Subject: [PATCH 30/43] Update testcontainers version in ES7 integration test
      
      ---
       .../logisland-service-elasticsearch_7_x-client/pom.xml       | 5 +++--
       1 file changed, 3 insertions(+), 2 deletions(-)
      
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/pom.xml b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/pom.xml
      index baada52f7..4598a76d3 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/pom.xml
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/pom.xml
      @@ -17,6 +17,7 @@
           
               
               7.1.1
      +        1.12.5
           
       
           
      @@ -70,14 +71,14 @@
               
                   org.testcontainers
                   testcontainers
      -            1.10.7
      +            ${testcontainers.version}
                   test
               
       
               
                   org.testcontainers
                   elasticsearch
      -            1.10.7
      +            ${testcontainers.version}
                   test
               
       
      
      From e5cec1c1536176fc9416c44f47023c7f2d8e7cc6 Mon Sep 17 00:00:00 2001
      From: Mathieu Rossignol 
      Date: Fri, 28 Feb 2020 16:58:03 +0100
      Subject: [PATCH 31/43] No more need of mock service in ES 6 and 7 integration
 tests: use the real client connection created by the real service itself
      
      ---
       .../service/elasticsearch/ESRule.java         |  7 +-
       .../Elasticsearch_6_6_2_ClientServiceIT.java  | 80 ++-----------------
       .../service/elasticsearch/ESRule.java         |  7 +-
       .../Elasticsearch_7_x_ClientServiceIT.java    | 80 ++-----------------
       4 files changed, 22 insertions(+), 152 deletions(-)
      
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_6_6_2-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESRule.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_6_6_2-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESRule.java
      index 59b604daf..f8792bf22 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_6_6_2-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESRule.java
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_6_6_2-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESRule.java
      @@ -36,6 +36,7 @@ public class ESRule implements TestRule {
            * The internal-transport client that talks to the local node.
            */
           private RestHighLevelClient client;
      +    private ElasticsearchContainer container;
       
           /**
            * Return a closure which starts an embedded ES docker container, executes the unit-test, then shuts down the
      @@ -46,7 +47,7 @@ public Statement apply(Statement base, Description description) {
               return new Statement() {
                   @Override
                   public void evaluate() throws Throwable {
      -                ElasticsearchContainer container = new ElasticsearchContainer("docker.elastic.co/elasticsearch/elasticsearch:6.6.2");
      +                container = new ElasticsearchContainer("docker.elastic.co/elasticsearch/elasticsearch:6.6.2");
                       container.start();
                       client = new RestHighLevelClient(RestClient.builder(HttpHost.create(container.getHttpHostAddress())));
       
      @@ -60,6 +61,10 @@ public void evaluate() throws Throwable {
               };
           }
       
+    public String getHostPortString() {
+        return container.getHttpHostAddress();
+    }
      +
           /**
            * Return the object through which operations can be performed on the ES cluster.
            */
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_6_6_2-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_6_6_2_ClientServiceIT.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_6_6_2-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_6_6_2_ClientServiceIT.java
      index f2104d133..6a772e0af 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_6_6_2-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_6_6_2_ClientServiceIT.java
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_6_6_2-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_6_6_2_ClientServiceIT.java
      @@ -48,7 +48,8 @@
       
       import java.io.IOException;
       import java.util.*;
      -import java.util.function.BiConsumer;
      +
      +import static com.hurence.logisland.service.elasticsearch.ElasticsearchClientService.HOSTS;
       
       public class Elasticsearch_6_6_2_ClientServiceIT {
       
      @@ -74,84 +75,13 @@ public void clean() throws IOException {
               }
           }
       
      -    private class MockElasticsearchClientService extends Elasticsearch_6_6_2_ClientService {
      -
      -        @Override
      -        protected void createElasticsearchClient(ControllerServiceInitializationContext context) throws ProcessException {
      -            if (esClient != null) {
      -                return;
      -            }
      -            esClient = esRule.getClient();
      -        }
      -
      -        @Override
      -        protected void createBulkProcessor(ControllerServiceInitializationContext context) {
      -
      -            if (bulkProcessor != null) {
      -                return;
      -            }
      -
      -            // create the bulk processor
      -
      -            BulkProcessor.Listener listener =
      -                    new BulkProcessor.Listener() {
      -                        @Override
      -                        public void beforeBulk(long l, BulkRequest bulkRequest) {
      -                            getLogger().debug("Going to execute bulk [id:{}] composed of {} actions", new Object[]{l, bulkRequest.numberOfActions()});
      -                        }
      -
      -                        @Override
      -                        public void afterBulk(long l, BulkRequest bulkRequest, BulkResponse bulkResponse) {
      -                            getLogger().debug("Executed bulk [id:{}] composed of {} actions", new Object[]{l, bulkRequest.numberOfActions()});
      -                            if (bulkResponse.hasFailures()) {
      -                                getLogger().warn("There was failures while executing bulk [id:{}]," +
      -                                                " done bulk request in {} ms with failure = {}",
      -                                        new Object[]{l, bulkResponse.getTook().getMillis(), bulkResponse.buildFailureMessage()});
      -                                for (BulkItemResponse item : bulkResponse.getItems()) {
      -                                    if (item.isFailed()) {
      -                                        errors.put(item.getId(), item.getFailureMessage());
      -                                    }
      -                                }
      -                            }
      -                        }
      -
      -                        @Override
      -                        public void afterBulk(long l, BulkRequest bulkRequest, Throwable throwable) {
      -                            getLogger().error("something went wrong while bulk loading events to es : {}", new Object[]{throwable.getMessage()});
      -                        }
      -
      -                    };
      -
      -            BiConsumer> bulkConsumer =
      -                    (request, bulkListener) -> esClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener);
      -            bulkProcessor = BulkProcessor.builder(bulkConsumer, listener)
      -                    .setBulkActions(1000)
      -                    .setBulkSize(new ByteSizeValue(10, ByteSizeUnit.MB))
      -                    .setFlushInterval(TimeValue.timeValueSeconds(1))
      -                    .setConcurrentRequests(2)
      -                    //.setBackoffPolicy(getBackOffPolicy(context))
      -                    .build();
      -
      -        }
      -
      -        @Override
      -        public List getSupportedPropertyDescriptors() {
      -
      -            List props = new ArrayList<>();
      -
      -            return Collections.unmodifiableList(props);
      -        }
      -
      -    }
      -
           private ElasticsearchClientService configureElasticsearchClientService(final TestRunner runner) throws InitializationException {
      -        final MockElasticsearchClientService elasticsearchClientService = new MockElasticsearchClientService();
      +        final Elasticsearch_6_6_2_ClientService elasticsearchClientService = new Elasticsearch_6_6_2_ClientService();
       
               runner.addControllerService("elasticsearchClient", elasticsearchClientService);
      -
      -        runner.enableControllerService(elasticsearchClientService);
               runner.setProperty(TestProcessor.ELASTICSEARCH_CLIENT_SERVICE, "elasticsearchClient");
      -        runner.assertValid(elasticsearchClientService);
      +        runner.setProperty(elasticsearchClientService, HOSTS, esRule.getHostPortString());
      +        runner.enableControllerService(elasticsearchClientService);
       
               // TODO : is this necessary ?
               final ElasticsearchClientService service = PluginProxy.unwrap(runner.getProcessContext().getPropertyValue(TestProcessor.ELASTICSEARCH_CLIENT_SERVICE).asControllerService());
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESRule.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESRule.java
      index 28a023c2c..ae1de5241 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESRule.java
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESRule.java
      @@ -36,6 +36,7 @@ public class ESRule implements TestRule {
            * The internal-transport client that talks to the local node.
            */
           private RestHighLevelClient client;
      +    private ElasticsearchContainer container;
       
           /**
            * Return a closure which starts an embedded ES docker container, executes the unit-test, then shuts down the
      @@ -46,7 +47,7 @@ public Statement apply(Statement base, Description description) {
               return new Statement() {
                   @Override
                   public void evaluate() throws Throwable {
      -                ElasticsearchContainer container = new ElasticsearchContainer("docker.elastic.co/elasticsearch/elasticsearch:7.1.1");
      +                container = new ElasticsearchContainer("docker.elastic.co/elasticsearch/elasticsearch:7.1.1");
                       container.start();
                       client = new RestHighLevelClient(RestClient.builder(HttpHost.create(container.getHttpHostAddress())));
       
      @@ -60,6 +61,10 @@ public void evaluate() throws Throwable {
               };
           }
       
+    public String getHostPortString() {
+        return container.getHttpHostAddress();
+    }
      +
           /**
            * Return the object through which operations can be performed on the ES cluster.
            */
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientServiceIT.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientServiceIT.java
      index 5e5b86d05..2be59f966 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientServiceIT.java
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientServiceIT.java
      @@ -54,6 +54,8 @@
       import java.util.*;
       import java.util.function.BiConsumer;
       
      +import static com.hurence.logisland.service.elasticsearch.ElasticsearchClientService.HOSTS;
      +
       public class Elasticsearch_7_x_ClientServiceIT {
       
           private static final String MAPPING1 = "{'properties':{'name':{'type': 'text'},'val':{'type':'integer'}}}";
      @@ -82,85 +84,13 @@ public void clean() throws IOException {
                   Assert.assertTrue(esRule.getClient().indices().delete(deleteRequest, RequestOptions.DEFAULT).isAcknowledged());
               }
           }
      -
      -    private class MockElasticsearchClientService extends Elasticsearch_7_x_ClientService {
      -
      -        @Override
      -        protected void createElasticsearchClient(ControllerServiceInitializationContext context) throws ProcessException {
      -            if (esClient != null) {
      -                return;
      -            }
      -            esClient = esRule.getClient();
      -        }
      -
      -        @Override
      -        protected void createBulkProcessor(ControllerServiceInitializationContext context) {
      -
      -            if (bulkProcessor != null) {
      -                return;
      -            }
      -
      -            // create the bulk processor
      -
      -            BulkProcessor.Listener listener =
      -                    new BulkProcessor.Listener() {
      -                        @Override
      -                        public void beforeBulk(long l, BulkRequest bulkRequest) {
      -                            getLogger().debug("Going to execute bulk [id:{}] composed of {} actions", new Object[]{l, bulkRequest.numberOfActions()});
      -                        }
      -
      -                        @Override
      -                        public void afterBulk(long l, BulkRequest bulkRequest, BulkResponse bulkResponse) {
      -                            getLogger().debug("Executed bulk [id:{}] composed of {} actions", new Object[]{l, bulkRequest.numberOfActions()});
      -                            if (bulkResponse.hasFailures()) {
      -                                getLogger().warn("There was failures while executing bulk [id:{}]," +
      -                                                " done bulk request in {} ms with failure = {}",
      -                                        new Object[]{l, bulkResponse.getTook().getMillis(), bulkResponse.buildFailureMessage()});
      -                                for (BulkItemResponse item : bulkResponse.getItems()) {
      -                                    if (item.isFailed()) {
      -                                        errors.put(item.getId(), item.getFailureMessage());
      -                                    }
      -                                }
      -                            }
      -                        }
      -
      -                        @Override
      -                        public void afterBulk(long l, BulkRequest bulkRequest, Throwable throwable) {
      -                            getLogger().error("something went wrong while bulk loading events to es : {}", new Object[]{throwable.getMessage()});
      -                        }
      -
      -                    };
      -
      -            BiConsumer> bulkConsumer =
      -                    (request, bulkListener) -> esClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener);
      -            bulkProcessor = BulkProcessor.builder(bulkConsumer, listener)
      -                    .setBulkActions(1000)
      -                    .setBulkSize(new ByteSizeValue(10, ByteSizeUnit.MB))
      -                    .setFlushInterval(TimeValue.timeValueSeconds(1))
      -                    .setConcurrentRequests(2)
      -                    //.setBackoffPolicy(getBackOffPolicy(context))
      -                    .build();
      -
      -        }
      -
      -        @Override
      -        public List getSupportedPropertyDescriptors() {
      -
      -            List props = new ArrayList<>();
      -
      -            return Collections.unmodifiableList(props);
      -        }
      -
      -    }
      -
           private ElasticsearchClientService configureElasticsearchClientService(final TestRunner runner) throws InitializationException {
      -        final MockElasticsearchClientService elasticsearchClientService = new MockElasticsearchClientService();
      +        final Elasticsearch_7_x_ClientService elasticsearchClientService = new Elasticsearch_7_x_ClientService();
       
               runner.addControllerService("elasticsearchClient", elasticsearchClientService);
      -
      -        runner.enableControllerService(elasticsearchClientService);
               runner.setProperty(TestProcessor.ELASTICSEARCH_CLIENT_SERVICE, "elasticsearchClient");
      -        runner.assertValid(elasticsearchClientService);
      +        runner.setProperty(elasticsearchClientService, HOSTS, esRule.getHostPortString());
      +        runner.enableControllerService(elasticsearchClientService);
       
               // TODO : is this necessary ?
               final ElasticsearchClientService service = PluginProxy.unwrap(runner.getProcessContext().getPropertyValue(TestProcessor.ELASTICSEARCH_CLIENT_SERVICE).asControllerService());
      
      From 2607facbe042d3e59f5c20275aca66c237e51f7a Mon Sep 17 00:00:00 2001
      From: Mathieu Rossignol 
      Date: Fri, 28 Feb 2020 18:57:54 +0100
      Subject: [PATCH 32/43] Support for HTTPS opendistro elasticsearch
      
      ---
       .../ElasticsearchClientService.java           | 12 +++-
       .../elasticsearch/ESOpenDistroRule.java       | 55 ++++++++++++++--
       .../ElasticsearchOpenDistroContainer.java     | 23 +++++--
       ...csearchOpenDistro_7_x_ClientServiceIT.java |  7 +-
       .../Elasticsearch_7_x_ClientServiceIT.java    | 10 ---
       .../Elasticsearch_7_x_ClientService.java      | 65 ++++++++++++++++---
       6 files changed, 137 insertions(+), 35 deletions(-)
      
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch-api/src/main/java/com/hurence/logisland/service/elasticsearch/ElasticsearchClientService.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch-api/src/main/java/com/hurence/logisland/service/elasticsearch/ElasticsearchClientService.java
      index 9a52cc701..2a2c0fa18 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch-api/src/main/java/com/hurence/logisland/service/elasticsearch/ElasticsearchClientService.java
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch-api/src/main/java/com/hurence/logisland/service/elasticsearch/ElasticsearchClientService.java
      @@ -15,7 +15,6 @@
        */
       package com.hurence.logisland.service.elasticsearch;
       
      -
       import com.hurence.logisland.annotation.documentation.CapabilityDescription;
       import com.hurence.logisland.annotation.documentation.Tags;
       import com.hurence.logisland.component.AllowableValue;
      @@ -31,7 +30,6 @@
       import java.util.Map;
       import java.util.Optional;
       
      -
       @Tags({"elasticsearch", "client"})
       @CapabilityDescription("A controller service for accessing an elasticsearch client.")
       public interface ElasticsearchClientService extends DatastoreClientService {
      @@ -154,6 +152,16 @@ public ValidationResult validate(final String subject, final String input) {
                   .addValidator(StandardValidators.FILE_EXISTS_VALIDATOR)
                   .build();
       
      +    PropertyDescriptor ENABLE_SSL = new PropertyDescriptor.Builder()
      +            .name("enable.ssl")
+            .description("Whether to enable (true) TLS/SSL connections or not (false). This can for instance be used" +
+                    " with opendistro. Defaults to false. Note that the current implementation does not try to validate" +
+                    " the server certificate.")
      +            .required(false)
      +            .addValidator(StandardValidators.BOOLEAN_VALIDATOR)
      +            .defaultValue("false")
      +            .build();
      +
           PropertyDescriptor USERNAME = new PropertyDescriptor.Builder()
                   .name("username")
                   .description("Username to access the Elasticsearch cluster")
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESOpenDistroRule.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESOpenDistroRule.java
      index d37aa1505..70190c817 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESOpenDistroRule.java
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ESOpenDistroRule.java
      @@ -27,6 +27,13 @@
       import org.junit.rules.TestRule;
       import org.junit.runner.Description;
       import org.junit.runners.model.Statement;
      +import org.slf4j.Logger;
      +import org.slf4j.LoggerFactory;
      +
      +import javax.net.ssl.SSLContext;
      +import javax.net.ssl.TrustManager;
      +import javax.net.ssl.X509TrustManager;
      +import java.security.cert.X509Certificate;
       
       /**
* A JUnit rule which starts an embedded opendistro elasticsearch docker container to test security features
      @@ -39,7 +46,9 @@ public class ESOpenDistroRule implements TestRule {
          private RestHighLevelClient client;
          private ElasticsearchOpenDistroContainer container;
          private String opendistroUsername;
      -    private String opendistroPassword;
      +   private String opendistroPassword;
      +
      +    private static Logger logger = LoggerFactory.getLogger(ESOpenDistroRule.class);
       
           public ESOpenDistroRule(String opendistroUsername, String opendistroPassword) {
               this.opendistroUsername = opendistroUsername;
      @@ -59,20 +68,52 @@ public void evaluate() throws Throwable {
                              opendistroUsername, opendistroPassword);
                      container.start();
       
+               // TODO: if testcontainers supported skipping SSL server certificate validation, one could use the wait
+               // strategy in ElasticsearchOpenDistroContainer instead. See inside ElasticsearchOpenDistroContainer.
      +               long wait = 10000L;
      +               logger.info("Waiting for ES open distro container to start for " + wait/1000 + " seconds");
      +               Thread.sleep(wait);
      +
                      /**
                       * Inspired from https://github.com/opendistro-for-elasticsearch/community/issues/64
                       */
       
      -               final CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
      -               credentialsProvider.setCredentials(AuthScope.ANY,
      -                       new UsernamePasswordCredentials(opendistroUsername, opendistroPassword));
      -
                      RestClientBuilder builder = RestClient.builder(
      -                       new HttpHost(container.getHostAddress(), container.getPort(), "http"))
      +                       new HttpHost(container.getHostAddress(), container.getPort(), "https"))
                              .setHttpClientConfigCallback(new RestClientBuilder.HttpClientConfigCallback() {
                                  @Override
                                  public HttpAsyncClientBuilder customizeHttpClient(HttpAsyncClientBuilder httpClientBuilder) {
      -                               return httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider);
      +
      +                               // Set user/password basic auth credentials
      +                               final CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
      +                               credentialsProvider.setCredentials(AuthScope.ANY,
      +                                       new UsernamePasswordCredentials(opendistroUsername, opendistroPassword));
      +                               httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider);
      +
      +                               // Set SSL trust manager and context
      +                               // Create and use a trust manager accepting all server certificates
      +                               TrustManager[] acceptAllTrustManager = new TrustManager[] { new X509TrustManager() {
      +                                   public java.security.cert.X509Certificate[] getAcceptedIssuers() {
      +                                       return null;
      +                                   }
      +                                   public void checkClientTrusted(X509Certificate[] certs, String authType) {
      +                                   }
      +
      +                                   public void checkServerTrusted(X509Certificate[] certs, String authType) {
      +                                   }
      +                               } };
      +
      +                               SSLContext sslContext = null;
      +                               try {
      +                                   sslContext = SSLContext.getInstance("SSL");
      +                                   sslContext.init(null, acceptAllTrustManager, new java.security.SecureRandom());
      +                               } catch (Exception e) {
      +                                   e.printStackTrace();
      +                                   throw new RuntimeException(e);
      +                               }
      +                               httpClientBuilder.setSSLContext(sslContext);
      +
      +                               return httpClientBuilder;
                                  }
                              });
                      client = new RestHighLevelClient(builder);
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistroContainer.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistroContainer.java
      index 0db51bdef..87fc2d410 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistroContainer.java
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistroContainer.java
      @@ -64,21 +64,34 @@ public ElasticsearchOpenDistroContainer(String dockerImageName, String user, Str
               logger().info("Starting an opendistro elasticsearch container using [{}]", dockerImageName);
               withNetworkAliases("elasticsearch-opendistro-" + Base58.randomString(6));
               withEnv("discovery.type", "single-node");
      -        withEnv("opendistro_security.ssl.http.enabled", "false"); // Disable https
      +        // With enforce_hostname_verification enabled, the Security plugin verifies that the hostname of the
      +        // communication partner matches the hostname in the certificate
      +//        withEnv("opendistro_security.ssl.transport.enforce_hostname_verification", "false");
      +        // Do the clients (typically the browser or the proxy) have to authenticate themselves to the http server,
      +        // default is OPTIONAL. To enforce authentication use REQUIRE, to completely disable client certificates use
      +        // NONE.
      +        withEnv("opendistro_security.ssl.http.clientauth_mode", "NONE");
      +//        withEnv("opendistro_security.ssl.http.enabled", "false"); // Disable https
       //        withEnv("opendistro_security.disabled", "true"); // Completely disable security (https; authentication...)
               addExposedPorts(ELASTICSEARCH_OPENDISTRO_DEFAULT_PORT, ELASTICSEARCH_OPENDISTRO_DEFAULT_TCP_PORT);
               HttpWaitStrategy httpWaitStrategy = new HttpWaitStrategy()
                       .forPort(ELASTICSEARCH_OPENDISTRO_DEFAULT_PORT)
      -                .forStatusCodeMatching(response -> response == HTTP_OK);
      -//                .usingTls()
      +                .forStatusCodeMatching(response -> response == HTTP_OK)
      +                .usingTls();
       
      -        // Ideally we woul like to be able to setup the user with the passed one. For the moment we only support the
      +        // Ideally we would like to be able to setup the user with the passed one. For the moment we only support the
               // out of the box admin/admin user
               if ( (user != null) && (password != null) ) {
                   httpWaitStrategy.withBasicCredentials(user, password);
               }
+        // TODO: if we use the wait strategy then this fails as it not only connects with SSL but it
+        // also tries to validate the server SSL certificate. We do not want that and the testcontainers API currently
+        // offers no option to disable it. We could maybe use system properties but this would impact
+        // the whole VM in which the IT test runs. We prefer for the moment not to use the wait strategy and to replace
+        // it with a dummy sleep in the caller ESOpenDistroRule to let the docker container initialize. That is why it
+        // is commented out hereafter.
       //        setWaitStrategy(httpWaitStrategy.withStartupTimeout(Duration.ofMinutes(2)));
      -        setWaitStrategy(httpWaitStrategy.withStartupTimeout(Duration.ofSeconds(30)));
      +//        setWaitStrategy(httpWaitStrategy.withStartupTimeout(Duration.ofSeconds(10)));
           }
       
           public String getHostPortString() {
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistro_7_x_ClientServiceIT.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistro_7_x_ClientServiceIT.java
      index 1ce24573f..536067766 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistro_7_x_ClientServiceIT.java
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/ElasticsearchOpenDistro_7_x_ClientServiceIT.java
      @@ -43,6 +43,11 @@
       
       import static com.hurence.logisland.service.elasticsearch.ElasticsearchClientService.*;
       
      +/**
      + * The current implementation uses HTTPS with no server certificate validation (like the ES service does) as well as
+ * user/password HTTP basic auth, which is currently only admin/admin, as configured by default in the opendistro
      + * ES docker image we currently use.
      + */
       public class ElasticsearchOpenDistro_7_x_ClientServiceIT {
       
           private static final String MAPPING1 = "{'properties':{'name':{'type': 'text'},'val':{'type':'integer'}}}";
      @@ -92,11 +97,11 @@ private ElasticsearchClientService configureElasticsearchOpenDistroClientService
               final Elasticsearch_7_x_ClientService elasticsearchClientService = new Elasticsearch_7_x_ClientService();
       
               runner.addControllerService("elasticsearchClient", elasticsearchClientService);
      -
               runner.setProperty(TestProcessor.ELASTICSEARCH_CLIENT_SERVICE, "elasticsearchClient");
               runner.setProperty(elasticsearchClientService, HOSTS, esOpenDistroRule.getHostPortString());
               runner.setProperty(elasticsearchClientService, USERNAME, OPENDISTRO_USERNAME);
               runner.setProperty(elasticsearchClientService, PASSWORD, OPENDISTRO_PASSWORD);
      +        runner.setProperty(elasticsearchClientService, ENABLE_SSL, "true");
               runner.enableControllerService(elasticsearchClientService);
       
               // TODO : is this necessary ?
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientServiceIT.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientServiceIT.java
      index 2be59f966..3c30206bd 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientServiceIT.java
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/integration-test/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientServiceIT.java
      @@ -29,19 +29,10 @@
       import com.hurence.logisland.util.runner.TestRunner;
       import com.hurence.logisland.util.runner.TestRunners;
       import org.elasticsearch.ElasticsearchStatusException;
      -import org.elasticsearch.action.ActionListener;
      -import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest;
      -import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
       import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
      -import org.elasticsearch.action.bulk.BulkItemResponse;
      -import org.elasticsearch.action.bulk.BulkProcessor;
      -import org.elasticsearch.action.bulk.BulkRequest;
      -import org.elasticsearch.action.bulk.BulkResponse;
       import org.elasticsearch.client.RequestOptions;
       import org.elasticsearch.client.indices.GetIndexRequest;
       import org.elasticsearch.client.indices.GetIndexResponse;
      -import org.elasticsearch.common.unit.ByteSizeUnit;
      -import org.elasticsearch.common.unit.ByteSizeValue;
       import org.elasticsearch.common.unit.TimeValue;
       import org.junit.After;
       import org.junit.Assert;
      @@ -52,7 +43,6 @@
       
       import java.io.IOException;
       import java.util.*;
      -import java.util.function.BiConsumer;
       
       import static com.hurence.logisland.service.elasticsearch.ElasticsearchClientService.HOSTS;
       
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/main/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientService.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/main/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientService.java
      index 1a93e0220..6282e8b47 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/main/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientService.java
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/main/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientService.java
      @@ -59,14 +59,17 @@
       import org.elasticsearch.common.xcontent.XContentType;
       import org.elasticsearch.index.query.QueryBuilders;
       import org.elasticsearch.index.reindex.ReindexRequest;
      -import org.elasticsearch.search.SearchHit;
       import org.elasticsearch.search.builder.SearchSourceBuilder;
       import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
       
      +import javax.net.ssl.SSLContext;
      +import javax.net.ssl.TrustManager;
      +import javax.net.ssl.X509TrustManager;
      +//import javax.security.cert.X509Certificate;
       import java.io.IOException;
      +import java.security.cert.X509Certificate;
       import java.util.*;
       import java.util.concurrent.TimeUnit;
      -import java.util.concurrent.atomic.AtomicBoolean;
       import java.util.function.BiConsumer;
       
       @Tags({ "elasticsearch", "client"})
      @@ -98,6 +101,7 @@ public List getSupportedPropertyDescriptors() {
               props.add(SAMPLER_INTERVAL);
               props.add(USERNAME);
               props.add(PASSWORD);
      +        props.add(ENABLE_SSL);
               props.add(PROP_SHIELD_LOCATION);
               props.add(HOSTS);
               props.add(PROP_SSL_CONTEXT_SERVICE);
      @@ -137,20 +141,60 @@ protected void createElasticsearchClient(ControllerServiceInitializationContext
                   final String username = context.getPropertyValue(USERNAME).asString();
                   final String password = context.getPropertyValue(PASSWORD).asString();
                   final String hosts = context.getPropertyValue(HOSTS).asString();
      +            final boolean enableSsl = context.getPropertyValue(ENABLE_SSL).asBoolean();
       
      -            esHosts = getEsHosts(hosts);
      +            esHosts = getEsHosts(hosts, enableSsl);
       
                   if (esHosts != null) {
       
                       RestClientBuilder builder = RestClient.builder(esHosts);
       
      -                if (!StringUtils.isEmpty(username) && !StringUtils.isEmpty(password)) {
      -                    final CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
      -                    credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(username, password));
      +                /**
      +                 * TODO use those link to support SSL
      +                 * https://www.elastic.co/guide/en/elasticsearch/client/java-rest/current/_encrypted_communication.html
      +                 *
      +                 * https://github.com/opendistro-for-elasticsearch/community/issues/64
      +                 */
      +
      +                if ((!StringUtils.isEmpty(username) && !StringUtils.isEmpty(password)) || enableSsl) {
       
                           builder.setHttpClientConfigCallback(httpClientBuilder -> {
      -                                    return httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider);
      -                                });
      +
      +                        if (!StringUtils.isEmpty(username) && !StringUtils.isEmpty(password)) {
      +                            // Support user/password basic auth
      +                            final CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
      +                            credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(username, password));
      +                            httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider);
      +                        }
      +                        if (enableSsl) {
      +                            // Support SSL
      +
      +                            // Create and use a trust manager accepting all server certificates
      +                            TrustManager[] acceptAllTrustManager = new TrustManager[] { new X509TrustManager() {
      +                                public java.security.cert.X509Certificate[] getAcceptedIssuers() {
      +                                    return null;
      +                                }
      +                                public void checkClientTrusted(X509Certificate[] certs, String authType) {
      +                                }
      +
      +                                public void checkServerTrusted(X509Certificate[] certs, String authType) {
      +                                }
      +                            } };
      +
      +                            SSLContext sslContext = null;
      +                            try {
      +                                sslContext = SSLContext.getInstance("SSL");
      +                                sslContext.init(null, acceptAllTrustManager, new java.security.SecureRandom());
      +                            } catch (Exception e) {
      +                                getLogger().error("Failed to create Elasticsearch client SSLContext due to {}",
      +                                        new Object[]{e}, e);
      +                                throw new RuntimeException(e);
      +                            }
      +
      +                            httpClientBuilder.setSSLContext(sslContext);
      +                        }
      +                        return httpClientBuilder;
      +                    });
                       }
       
                       esClient = new RestHighLevelClient(builder);
      @@ -166,9 +210,10 @@ protected void createElasticsearchClient(ControllerServiceInitializationContext
            * Get the ElasticSearch hosts.
            *
            * @param hosts A comma-separated list of ElasticSearch hosts (host:port,host2:port2, etc.)
      +     * @param enableSsl Enable ssl or not
            * @return List of HttpHost for the ES hosts
            */
      -    private HttpHost[]  getEsHosts(String hosts) {
      +    private HttpHost[]  getEsHosts(String hosts, boolean enableSsl) {
       
               if (hosts == null) {
                   return null;
      @@ -182,7 +227,7 @@ private HttpHost[]  getEsHosts(String hosts) {
                   final String hostName = addresses[0].trim();
                   final int port = Integer.parseInt(addresses[1].trim());
       
      -            esHosts[indHost] = new HttpHost(hostName, port);
      +            esHosts[indHost] = new HttpHost(hostName, port, enableSsl ? "https" : "http");
                   indHost++;
               }
               return esHosts;
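
A side note on the fixed Thread.sleep() introduced above in ESOpenDistroRule: it only works around the fact that the
testcontainers HttpWaitStrategy cannot skip server-certificate validation. A possible replacement is a small readiness
probe that polls the container's HTTPS endpoint with the same permissive SSL settings until it answers. The sketch
below is only an illustration and is not part of this patch; the class name, timeout values and polling interval are
made up.

    import javax.net.ssl.HttpsURLConnection;
    import javax.net.ssl.SSLContext;
    import javax.net.ssl.TrustManager;
    import javax.net.ssl.X509TrustManager;
    import java.net.URL;
    import java.security.SecureRandom;
    import java.security.cert.X509Certificate;

    public final class OpenDistroReadinessProbe {

        /** Polls https://host:port/ until the server answers or the timeout elapses (integration tests only). */
        public static void waitForHttps(String host, int port, long timeoutMs) throws Exception {
            // Same accept-all trust manager as the one used by the rule: fine for tests, never for production.
            TrustManager[] acceptAll = new TrustManager[]{ new X509TrustManager() {
                public X509Certificate[] getAcceptedIssuers() { return null; }
                public void checkClientTrusted(X509Certificate[] certs, String authType) { }
                public void checkServerTrusted(X509Certificate[] certs, String authType) { }
            }};
            SSLContext sslContext = SSLContext.getInstance("TLS");
            sslContext.init(null, acceptAll, new SecureRandom());

            long deadline = System.currentTimeMillis() + timeoutMs;
            while (System.currentTimeMillis() < deadline) {
                try {
                    HttpsURLConnection conn =
                            (HttpsURLConnection) new URL("https://" + host + ":" + port + "/").openConnection();
                    conn.setSSLSocketFactory(sslContext.getSocketFactory());
                    conn.setHostnameVerifier((hostname, session) -> true); // do not check the hostname either
                    conn.setConnectTimeout(1000);
                    conn.setReadTimeout(1000);
                    conn.getResponseCode(); // any HTTP status (200, 401, ...) means the server is up
                    return;
                } catch (Exception notReadyYet) {
                    Thread.sleep(500); // container not listening yet, retry
                }
            }
            throw new IllegalStateException("OpenDistro container not ready after " + timeoutMs + " ms");
        }
    }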
      
      From e66b83a023a7bbe75c180efa684939514159dea7 Mon Sep 17 00:00:00 2001
      From: Mathieu Rossignol 
      Date: Tue, 10 Mar 2020 12:36:48 +0100
      Subject: [PATCH 33/43] Added comment in ES7 service regarding SSL
      
      ---
       .../Elasticsearch_7_x_ClientService.java             | 12 +++++++++---
       1 file changed, 9 insertions(+), 3 deletions(-)
      
      diff --git a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/main/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientService.java b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/main/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientService.java
      index 6282e8b47..133dd8dd7 100644
      --- a/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/main/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientService.java
      +++ b/logisland-components/logisland-services/logisland-service-elasticsearch/logisland-service-elasticsearch_7_x-client/src/main/java/com/hurence/logisland/service/elasticsearch/Elasticsearch_7_x_ClientService.java
      @@ -150,9 +150,8 @@ protected void createElasticsearchClient(ControllerServiceInitializationContext
                       RestClientBuilder builder = RestClient.builder(esHosts);
       
                       /**
      -                 * TODO use those link to support SSL
      +                 * Inspired from:
                        * https://www.elastic.co/guide/en/elasticsearch/client/java-rest/current/_encrypted_communication.html
      -                 *
                        * https://github.com/opendistro-for-elasticsearch/community/issues/64
                        */
       
      @@ -167,7 +166,14 @@ protected void createElasticsearchClient(ControllerServiceInitializationContext
                                   httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider);
                               }
                               if (enableSsl) {
      -                            // Support SSL
      +                            // Support SSL (ES Shield or OpenDistro)
      +
      +                            /**
+                             * TODO: The current implementation does not verify the server certificate. This could be
+                             * improved by supporting a local truststore against which the server certificate is checked.
+                             * Likewise, client authentication to the server through a local certificate is not
+                             * supported yet.
      +                             */
       
                                   // Create and use a trust manager accepting all server certificates
                                   TrustManager[] acceptAllTrustManager = new TrustManager[] { new X509TrustManager() {
      
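For reference, the truststore-based verification that the new TODO suggests would look roughly like the sketch below. It is not part of the patch; the JKS keystore type and the path/password parameters are assumptions.

    import javax.net.ssl.SSLContext;
    import javax.net.ssl.TrustManagerFactory;
    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.security.KeyStore;

    public class TruststoreSslContextSketch {

        // Builds an SSLContext that trusts only the CAs present in the given truststore
        public static SSLContext fromTruststore(String truststorePath, char[] password) throws Exception {
            KeyStore trustStore = KeyStore.getInstance("JKS"); // assumed JKS; could also be PKCS12
            try (InputStream in = Files.newInputStream(Paths.get(truststorePath))) {
                trustStore.load(in, password);
            }
            TrustManagerFactory tmf =
                    TrustManagerFactory.getInstance(TrustManagerFactory.getDefaultAlgorithm());
            tmf.init(trustStore);

            SSLContext sslContext = SSLContext.getInstance("TLS");
            sslContext.init(null, tmf.getTrustManagers(), null);
            return sslContext;
        }
    }

The resulting SSLContext would simply replace the accept-all one passed to setSSLContext() above; client-certificate authentication would additionally require a KeyManagerFactory initialised from a keystore.
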
      From 0b36c05e29381c564f278ceec1e6fecad7691cef Mon Sep 17 00:00:00 2001
      From: Mathieu Rossignol 
      Date: Wed, 11 Mar 2020 17:55:22 +0100
Subject: [PATCH 34/43] Added support for Avro serializer/deserializer in
 structured streams via input/output Avro schema configuration
 properties
      
      ---
       .../logisland/stream/spark/structured/StructuredStream.scala    | 2 ++
       1 file changed, 2 insertions(+)
      
      diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala
      index e6b74d296..f90209ec4 100644
      --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala
      +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/src/main/scala/com/hurence/logisland/stream/spark/structured/StructuredStream.scala
      @@ -60,6 +60,8 @@ class StructuredStream extends AbstractRecordStream with SparkRecordStream {
           descriptors.add(GROUPBY)
           descriptors.add(STATE_TIMEOUT_MS)
           descriptors.add(CHUNK_SIZE)
      +    descriptors.add(AVRO_INPUT_SCHEMA)
      +    descriptors.add(AVRO_OUTPUT_SCHEMA)
       
           Collections.unmodifiableList(descriptors)
         }
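The AVRO_INPUT_SCHEMA and AVRO_OUTPUT_SCHEMA properties carry Avro schema definitions for the stream input and output. The standalone sketch below only illustrates what an Avro serializer/deserializer does with such a schema; it is not the Logisland serializer itself, and the Event schema is made up.

    import org.apache.avro.Schema;
    import org.apache.avro.generic.GenericData;
    import org.apache.avro.generic.GenericDatumReader;
    import org.apache.avro.generic.GenericDatumWriter;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.avro.io.BinaryDecoder;
    import org.apache.avro.io.BinaryEncoder;
    import org.apache.avro.io.DecoderFactory;
    import org.apache.avro.io.EncoderFactory;

    import java.io.ByteArrayOutputStream;

    public class AvroRoundTripSketch {

        public static void main(String[] args) throws Exception {
            // Hypothetical schema, similar in spirit to what an input/output schema property would hold
            String schemaJson = "{\"type\":\"record\",\"name\":\"Event\",\"fields\":["
                    + "{\"name\":\"id\",\"type\":\"string\"},"
                    + "{\"name\":\"value\",\"type\":\"long\"}]}";
            Schema schema = new Schema.Parser().parse(schemaJson);

            // Serialize a record to Avro binary
            GenericRecord record = new GenericData.Record(schema);
            record.put("id", "r1");
            record.put("value", 42L);
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
            new GenericDatumWriter<GenericRecord>(schema).write(record, encoder);
            encoder.flush();

            // Deserialize it back using the same schema
            BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(out.toByteArray(), null);
            GenericRecord back = new GenericDatumReader<GenericRecord>(schema).read(null, decoder);
            System.out.println(back);
        }
    }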
      
      From 557cbec90f0b3ef215f32389494988ac2c046315 Mon Sep 17 00:00:00 2001
      From: Mathieu Rossignol 
      Date: Wed, 11 Mar 2020 17:56:31 +0100
Subject: [PATCH 35/43] Upgrade Avro from 1.8.2 to 1.9.2 so that Avro schemas
 are serializable and thus usable in structured streams
      
      ---
       logisland-core/pom.xml | 2 +-
       1 file changed, 1 insertion(+), 1 deletion(-)
      
      diff --git a/logisland-core/pom.xml b/logisland-core/pom.xml
      index 2df9c55f7..b1c51e6e6 100644
      --- a/logisland-core/pom.xml
      +++ b/logisland-core/pom.xml
@@ -27,7 +27,7 @@
             <dependency>
                 <groupId>org.apache.avro</groupId>
                 <artifactId>avro</artifactId>
-                <version>1.8.2</version>
+                <version>1.9.2</version>
             </dependency>
             <dependency>
                 <groupId>com.fasterxml.jackson.module</groupId>
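A quick way to see what the bump buys, assuming (as this commit message states) that a parsed org.apache.avro.Schema now survives standard Java serialization, which is what lets it be captured in a Spark structured-streaming closure:

    import org.apache.avro.Schema;

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.ObjectInputStream;
    import java.io.ObjectOutputStream;

    public class SchemaSerializableCheck {

        public static void main(String[] args) throws Exception {
            Schema schema = new Schema.Parser().parse(
                    "{\"type\":\"record\",\"name\":\"Event\",\"fields\":[{\"name\":\"id\",\"type\":\"string\"}]}");

            // Round-trip through Java serialization, as Spark does when shipping closures to executors
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
                // would throw NotSerializableException on an Avro version without serializable schemas
                out.writeObject(schema);
            }
            try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
                Schema copy = (Schema) in.readObject();
                System.out.println(copy.getFullName()); // Event
            }
        }
    }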
      
      From 84cb63a8db4fe70d3d33d5f6a8f432f7ec0ab2a4 Mon Sep 17 00:00:00 2001
      From: Mathieu Rossignol 
      Date: Thu, 12 Mar 2020 17:12:54 +0100
Subject: [PATCH 36/43] Close a forgotten input stream when loading the Maxmind
 DB from a URI. Also upgraded versions of the iptogeo service and processor
 dependencies (Jackson/Hadoop)
      
      ---
       .../logisland-processor-enrichment/pom.xml     |  4 ++--
       .../pom.xml                                    | 18 +++++++++---------
       .../iptogeo/maxmind/MaxmindIpToGeoService.java |  3 ++-
       3 files changed, 13 insertions(+), 12 deletions(-)
      
      diff --git a/logisland-components/logisland-processors/logisland-processor-enrichment/pom.xml b/logisland-components/logisland-processors/logisland-processor-enrichment/pom.xml
      index b1c52772f..832361448 100644
      --- a/logisland-components/logisland-processors/logisland-processor-enrichment/pom.xml
      +++ b/logisland-components/logisland-processors/logisland-processor-enrichment/pom.xml
@@ -65,13 +65,13 @@
         <dependency>
             <groupId>com.fasterxml.jackson.core</groupId>
             <artifactId>jackson-databind</artifactId>
-            <version>2.9.3</version>
+            <version>2.10.3</version>
             <scope>test</scope>
         </dependency>
         <dependency>
             <groupId>com.maxmind.geoip2</groupId>
             <artifactId>geoip2</artifactId>
-            <version>2.11.0</version>
+            <version>2.13.1</version>
             <scope>test</scope>
         </dependency>
       
      diff --git a/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/pom.xml b/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/pom.xml
      index 7bd481334..507621d58 100644
      --- a/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/pom.xml
      +++ b/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/pom.xml
      @@ -41,28 +41,28 @@
                   3.7
                   true
               
      -        
      -
      -        
               
                   com.fasterxml.jackson.core
                   jackson-databind
      -            2.9.3
      +            2.10.3
      +            true
      +        
      +        
      +            com.fasterxml.jackson.core
      +            jackson-core
      +            2.10.3
                   true
               
               
                   com.maxmind.geoip2
                   geoip2
      -            2.11.0
      +            2.13.1
                   true
               
               
                   org.apache.hadoop
                   hadoop-client
      -            2.2.0
      +            3.2.1
                   provided
                    
                   true
      diff --git a/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/src/main/java/com/hurence/logisland/service/iptogeo/maxmind/MaxmindIpToGeoService.java b/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/src/main/java/com/hurence/logisland/service/iptogeo/maxmind/MaxmindIpToGeoService.java
      index 86722f917..c8559075a 100644
      --- a/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/src/main/java/com/hurence/logisland/service/iptogeo/maxmind/MaxmindIpToGeoService.java
      +++ b/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/src/main/java/com/hurence/logisland/service/iptogeo/maxmind/MaxmindIpToGeoService.java
      @@ -30,7 +30,6 @@
       import com.maxmind.geoip2.exception.GeoIp2Exception;
       import com.maxmind.geoip2.model.CityResponse;
       import com.maxmind.geoip2.record.*;
      -import com.hurence.logisland.component.PropertyValue;
       
       import java.io.File;
       import java.io.IOException;
      @@ -166,6 +165,8 @@ private void initFromUri(String dbUri) throws Exception
               long stop = System.currentTimeMillis();
               getLogger().info("Completed loading of Maxmind Geo Database in {} milliseconds.", new Object[]{stop - start});
               databaseReaderRef.set(databaseReader);
      +
      +        inputStream.close();
           }
       
           /**
      
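The explicit close() fixes the leak on the success path only. A slightly more defensive variant, shown here as an illustrative sketch rather than the actual service code, uses try-with-resources so the Hadoop stream is released even if building the Maxmind reader fails; DatabaseReader.Builder reads the database content from the stream, so closing it right after build() is safe.

    import com.maxmind.geoip2.DatabaseReader;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    import java.net.URI;

    public class MaxmindLoaderSketch {

        // Loads a Maxmind database from any Hadoop-compatible filesystem URI (HDFS, DBFS, ...)
        public static DatabaseReader load(String dbUri) throws Exception {
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(URI.create(dbUri), conf);
            try (FSDataInputStream in = fs.open(new Path(dbUri))) {
                return new DatabaseReader.Builder(in).build();
            }
        }
    }
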
      From dd876d0a1527caa0a960897a5245e099332deb8e Mon Sep 17 00:00:00 2001
      From: Mathieu Rossignol 
      Date: Thu, 12 Mar 2020 17:23:23 +0100
Subject: [PATCH 37/43] Changed log messages when loading the Maxmind DB from a URI
      
      ---
       .../service/iptogeo/maxmind/MaxmindIpToGeoService.java        | 4 ++--
       1 file changed, 2 insertions(+), 2 deletions(-)
      
      diff --git a/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/src/main/java/com/hurence/logisland/service/iptogeo/maxmind/MaxmindIpToGeoService.java b/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/src/main/java/com/hurence/logisland/service/iptogeo/maxmind/MaxmindIpToGeoService.java
      index c8559075a..2f7a57385 100644
      --- a/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/src/main/java/com/hurence/logisland/service/iptogeo/maxmind/MaxmindIpToGeoService.java
      +++ b/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/src/main/java/com/hurence/logisland/service/iptogeo/maxmind/MaxmindIpToGeoService.java
      @@ -145,7 +145,7 @@ private void initFromUri(String dbUri) throws Exception
               Configuration conf = new Configuration();
       
               String hdfsUri = conf.get("fs.defaultFS");
      -        getLogger().info("Default HDFS URI: " + hdfsUri);
      +        getLogger().info("Base default FS: " + hdfsUri);
       
        // Set HADOOP user to same as current user
               String hadoopUser = System.getProperty("user.name");
      @@ -157,7 +157,7 @@ private void initFromUri(String dbUri) throws Exception
       
               // Create a path to config file and init input stream
               Path hdfsReadpath = new Path(dbUri);
      -        getLogger().info("Reading Maxmind DB file from HDFS at: " + dbUri);
      +        getLogger().info("Reading Maxmind DB file from URI at: " + dbUri);
               FSDataInputStream inputStream = fs.open(hdfsReadpath);
       
               long start = System.currentTimeMillis();
      
      From 4343104907aed3642ca9e3c8943dd206298dd165 Mon Sep 17 00:00:00 2001
      From: Mathieu Rossignol 
      Date: Fri, 13 Mar 2020 14:46:20 +0100
Subject: [PATCH 38/43] Rolled back spark-sql-streaming-mqtt_2.11 to 2.3.2 to
 avoid a conflict with the underlying Bahir dependency, which pulls in Scala
 2.12 when version 2.4.0 is used (reason unknown)
      
      ---
       .../logisland-engine-spark_2_4/pom.xml        |  2 +-
       .../logisland-engine-spark_2_common/pom.xml   | 26 ++++++++-----------
       2 files changed, 12 insertions(+), 16 deletions(-)
      
      diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml
      index 0c8699875..fc69a1883 100644
      --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml
      +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_4/pom.xml
@@ -43,7 +43,7 @@ http://www.w3.org/2001/XMLSchema-instance ">
             <dependency>
                 <groupId>org.apache.bahir</groupId>
                 <artifactId>spark-sql-streaming-mqtt_2.11</artifactId>
-                <version>2.4.0</version>
+                <version>2.3.2</version>
                 <scope>runtime</scope>
                 <optional>true</optional>
             </dependency>
      diff --git a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml
      index 5311d3dee..5ac3d0822 100644
      --- a/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml
      +++ b/logisland-core/logisland-engines/logisland-engine-spark_2_X/logisland-engine-spark_2_common/pom.xml
      @@ -195,23 +195,10 @@ http://www.w3.org/2001/XMLSchema-instance ">
       
               
       
      -
               
                   org.apache.bahir
                   spark-sql-streaming-mqtt_${scala.binary.version}
      -            2.4.0
      -        
      -
      -        
      -        
      -            org.apache.bahir
      -            bahir-common_${scala.binary.version}
      -            2.4.0
      +            2.3.2
               
       
               
      @@ -397,10 +384,19 @@ http://www.w3.org/2001/XMLSchema-instance ">
       
           
               
      +            
      +                org.apache.maven.plugins
      +                maven-surefire-plugin
      +                2.22.2
      +                
      +                    
      +                    false
      +                
      +            
                   
                       net.alchim31.maven
                       scala-maven-plugin
      -                3.2.2
      +                4.3.1
                       
                           
                               scala-compile-first
      
      From 0e5a5fa1bb0ec1cdb822dbfa38adce494feac72b Mon Sep 17 00:00:00 2001
      From: Mathieu Rossignol 
      Date: Mon, 16 Mar 2020 16:06:09 +0100
Subject: [PATCH 39/43] Fixed a Record JSON deserialization problem after the
 Jackson version bump. Also aligned the bumped Jackson version wherever
 possible and switched to the jackson.version property
      
      ---
       .../logisland-processor-enrichment/pom.xml    |    2 +-
       .../pom.xml                                   |    2 +-
       .../logisland-engine-spark_1_6/pom.xml        |    1 +
       .../logisland-engine-vanilla/pom.xml          |    6 +-
       .../logisland-utils/pom.xml                   |    3 +
       .../logisland/serializer/JsonSerializer.java  |    8 +-
       logisland-documentation/pom.xml               |    4 +-
       .../user/components/other-processors.rst      | 2246 +++++++++++++++++
       .../user/components/services.rst              | 1068 ++++++++
       pom.xml                                       |    2 +-
       10 files changed, 3331 insertions(+), 11 deletions(-)
      
      diff --git a/logisland-components/logisland-processors/logisland-processor-enrichment/pom.xml b/logisland-components/logisland-processors/logisland-processor-enrichment/pom.xml
      index 832361448..f06889d65 100644
      --- a/logisland-components/logisland-processors/logisland-processor-enrichment/pom.xml
      +++ b/logisland-components/logisland-processors/logisland-processor-enrichment/pom.xml
@@ -65,7 +65,7 @@
         <dependency>
             <groupId>com.fasterxml.jackson.core</groupId>
             <artifactId>jackson-databind</artifactId>
-            <version>2.10.3</version>
+            <version>${jackson.version}</version>
             <scope>test</scope>
         </dependency>
         <dependency>
      diff --git a/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/pom.xml b/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/pom.xml
      index 507621d58..cbba578ca 100644
      --- a/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/pom.xml
      +++ b/logisland-components/logisland-services/logisland-service-ip-to-geo/logisland-service-ip-to-geo-maxmind/pom.xml
@@ -44,7 +44,7 @@
         <dependency>
             <groupId>com.fasterxml.jackson.core</groupId>
             <artifactId>jackson-databind</artifactId>
-            <version>2.10.3</version>
+            <version>${jackson.version}</version>
             <optional>true</optional>
         </dependency>
         <dependency>
      diff --git a/logisland-core/logisland-engines/logisland-engine-spark_1_6/pom.xml b/logisland-core/logisland-engines/logisland-engine-spark_1_6/pom.xml
      index 72c5afeee..0354112fb 100644
      --- a/logisland-core/logisland-engines/logisland-engine-spark_1_6/pom.xml
      +++ b/logisland-core/logisland-engines/logisland-engine-spark_1_6/pom.xml
@@ -110,6 +110,7 @@ http://www.w3.org/2001/XMLSchema-instance ">
         <dependency>
             <groupId>com.fasterxml.jackson.core</groupId>
             <artifactId>jackson-databind</artifactId>
+            <version>${jackson.version}</version>
             <scope>provided</scope>
         </dependency>
       
      diff --git a/logisland-core/logisland-engines/logisland-engine-vanilla/pom.xml b/logisland-core/logisland-engines/logisland-engine-vanilla/pom.xml
      index 7e9bd1081..c08f02c73 100644
      --- a/logisland-core/logisland-engines/logisland-engine-vanilla/pom.xml
      +++ b/logisland-core/logisland-engines/logisland-engine-vanilla/pom.xml
@@ -86,19 +86,19 @@
         <dependency>
             <groupId>com.fasterxml.jackson.core</groupId>
             <artifactId>jackson-databind</artifactId>
-            <version>2.9.8</version>
+            <version>${jackson.version}</version>
         </dependency>

         <dependency>
             <groupId>com.fasterxml.jackson.core</groupId>
             <artifactId>jackson-core</artifactId>
-            <version>2.9.8</version>
+            <version>${jackson.version}</version>
         </dependency>

         <dependency>
             <groupId>com.fasterxml.jackson.core</groupId>
             <artifactId>jackson-annotations</artifactId>
-            <version>2.9.8</version>
+            <version>${jackson.version}</version>