support log levels and location for data processor logging

gaia-adm · Aug 18, 2015 · fd91724 · fd91724
1 parent 0118f9d
commit fd91724
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -15,7 +15,8 @@ Result processing component consists of two parts:
 - receive parameters as environment variables prefixed with "P_". The following environment variables can be expected: "P_DATATYPE", "P_CONTENTTYPE". "P_CONTENTTYPE" represents the HTTP Content-Type header value. Custom metadata from data providers will be accessible with prefix "P_C_". Note that received parameter keys will always be uppercase regardless of the case used during data collection. This is to ensure compatibility between Windows (dev) and Linux (production) environments. Parameter values are case sensitive.
 - receive uploaded file on STDIN. The file can be binary or textual (e.g. XML, JSON) and in theory can be quite big. It is not recommended to read and parse it all at once. Processing ends when EOF is received from STDIN.
 - processed results are written to STDOUT in the form of JSON array containing JSON objects. JSON objects must have format expected by metrics-gateway-service ("/mgs/rest/v1/gateway/event"). It is recommended to write JSON objects to STDOUT while processing STDIN.
-- log can be written to STDERR (of all log levels, not just errors). It ends up in result upload service log under processor name.
+- log can be written to STDERR. It ends up in result upload service log under processor name.
+  - logging format is "LEVEL:LOCATION:MESSAGE", where LEVEL can be one of DEBUG,INFO,WARNING,ERROR,CRITICAL. Alternative format is just "MESSAGE", which is assumed to be an error (unexpected errors). Each message must be terminated by newline. The logging format matches Python logger basic configuration and levels match Python logger log levels.
 - must exit with 0 if there was no error, 1 if there was a general error
 - should support SIGTERM to terminate processing. After SIGTERM is sent, any output produced will be ignored by result processing service. When SIGTERM is received, STDIN will be closed as well (which may lead to parsing error due to incomplete input). SIGTERM is then a hint to application that this state is desired.
 - must support execution when STDIN is closed immediately, no "P_" parameters are present and exit with 0. This is used by result processing service to verify that the processor can be executed successfully.
@@ -94,5 +95,5 @@ Unless at least one processor is available the process will exit immediately. No
   - related to the fact we don't store processor execution state/result
 - currently there is no way for a processor to tell the service the version of the produced content. All result processors must thus produce data of the same version. Metrics gateway service may support multiple data format versions (e.g. v1, v2 on its REST API). In case of a change we have to update the code of all result processors.
   - if needed this could be solved by processor descriptor saying what data it produces on STDOUT and for whom
-- logging to STDERR always ends up as INFO on result-processing service - can be solved with format like "LEVEL:LOCATION:MESSAGE" (this is format used by Python logging). If LEVEL is missing, ERROR level would be assumed.
+- the configured log level is not passed to the data processor, so the processor may write unnecessary messages to STDERR which are then filtered out by the service
 - no support for chaining multiple processors after each other. Celery supports this. We could use more streams than STDOUT, and the processor descriptor could specify where the stream output should go (e.g. metrics-gateway or another processor).
diff --git a/service/processors.js b/service/processors.js
@@ -22,6 +22,18 @@ var logger = log4js.getLogger('processors.js');
  */
 var processorsMap = {};
 
+/**
+ * RegExp of allowed data processor STDERR format. Example: 'ERROR:HelloWorld.py:Something failed'. Format corresponds
+ * to basic Python logging configuration.
+ * @type {RegExp}
+ */
+var childLogRegExp = new RegExp('([A-Z]+):([^:]+):(.*)');
+
+/**
+ * Map of supported data processor log levels. Corresponds to Python logger log levels.
+ */
+var childLogLevelMap = {'DEBUG': 'debug', 'INFO': 'info', 'WARNING': 'warn', 'ERROR': 'error', 'CRITICAL': 'fatal'};
+
 /**
  * Verifies processor at given path with given descriptor. Performs verification of the descriptor and test execution of
  * the processor.
@@ -152,7 +164,17 @@ function onLogFromChild(processorDesc, str) {
     var logRecords = str.split(/\r?\n/);
     logRecords.forEach(function(logRecord) {
         if (logRecord.length > 0) {
-            processorDesc.logger.info(logRecord);
+            var parts = childLogRegExp.exec(logRecord);
+            if (parts) {
+                var logLevel = childLogLevelMap[parts[1]];
+                if (logLevel) {
+                    var logName = parts[2];
+                    var message = parts[3];
+                    processorDesc.logger.log(logLevel, logName + ' - ' + message);
+                    return;
+                }
+            }
+            processorDesc.logger.error(logRecord);
         }
     });
 }
@@ -224,7 +246,7 @@ function executeProcessor(processorDesc, processingMetadata, contentMetadata) {
     });
     var notifier = new ProcessingNotifier(processorDesc, child);
 
-    // log child stderr as info
+    // log child stderr into our logger
     var childErrStream = child.stderr;
     childErrStream.setEncoding('utf8');
     function onStdErrData(str) {