From e72fc0dc4b5da4a1c22b741dd12446e7dd9340e3 Mon Sep 17 00:00:00 2001
From: Georgi
Date: Tue, 17 Sep 2024 14:57:30 +0200
Subject: [PATCH] fix matching multiline regex in s3 handler

---
 .../steps/handlers/s3_handler.py             |  9 ++++-
 aws/logs_monitoring/tests/test_s3_handler.py | 39 +++++++++++++++++++
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/aws/logs_monitoring/steps/handlers/s3_handler.py b/aws/logs_monitoring/steps/handlers/s3_handler.py
index dc8d9766..6eabd19b 100644
--- a/aws/logs_monitoring/steps/handlers/s3_handler.py
+++ b/aws/logs_monitoring/steps/handlers/s3_handler.py
@@ -44,6 +44,11 @@ def __init__(self, context, metadata, cache_layer):
             if DD_MULTILINE_LOG_REGEX_PATTERN
             else None
         )
+        self.multiline_regex_pattern = (
+            re.compile("[\n\r\f]+(?={})".format(DD_MULTILINE_LOG_REGEX_PATTERN))
+            if DD_MULTILINE_LOG_REGEX_PATTERN
+            else None
+        )
         # a private data store for event attributes
         self.data_store = S3EventDataStore()
 
@@ -283,12 +288,12 @@ def _extract_cloudtrail_logs(self):
     def _extract_other_logs(self):
         # Check if using multiline log regex pattern
         # and determine whether line or pattern separated logs
-        if self.multiline_regex_start_pattern:
+        if self.multiline_regex_start_pattern and self.multiline_regex_pattern:
             # We'll do string manipulation, so decode bytes into utf-8 first
             self.data_store.data = self.data_store.data.decode("utf-8", errors="ignore")
 
             if self.multiline_regex_start_pattern.match(self.data_store.data):
-                self.data_store.data = self.multiline_regex_start_pattern.split(
+                self.data_store.data = self.multiline_regex_pattern.split(
                     self.data_store.data
                 )
             else:
diff --git a/aws/logs_monitoring/tests/test_s3_handler.py b/aws/logs_monitoring/tests/test_s3_handler.py
index df411987..f2d5a4ea 100644
--- a/aws/logs_monitoring/tests/test_s3_handler.py
+++ b/aws/logs_monitoring/tests/test_s3_handler.py
@@ -1,6 +1,8 @@
 import gzip
 import unittest
+import re
 from unittest.mock import MagicMock, patch
+from importlib import reload
 
 from approvaltests.combination_approvals import verify_all_combinations
 from caching.cache_layer import CacheLayer
@@ -108,6 +110,43 @@ def test_s3_handler(self):
         self.assertEqual(self.s3_handler.metadata["ddsource"], "s3")
         self.assertEqual(self.s3_handler.metadata["host"], "arn:aws:s3:::my-bucket")
 
+    def test_s3_handler_with_multiline_regex(self):
+        event = {
+            "Records": [
+                {
+                    "s3": {
+                        "bucket": {"name": "my-bucket"},
+                        "object": {"key": "my-key"},
+                    }
+                }
+            ]
+        }
+        data = "2022-02-08aaa\nbbbccc\n2022-02-09bbb\n2022-02-10ccc\n"
+        self.s3_handler.data_store.data = data.encode("utf-8")
+        self.s3_handler.multiline_regex_start_pattern = re.compile("^\d{4}-\d{2}-\d{2}")
+        self.s3_handler.multiline_regex_pattern = re.compile(
+            "[\n\r\f]+(?=\d{4}-\d{2}-\d{2})"
+        )
+        self.s3_handler._extract_data = MagicMock()
+        structured_lines = list(self.s3_handler.handle(event))
+        self.assertEqual(
+            structured_lines,
+            [
+                {
+                    "aws": {"s3": {"bucket": "my-bucket", "key": "my-key"}},
+                    "message": "2022-02-08aaa\nbbbccc",
+                },
+                {
+                    "aws": {"s3": {"bucket": "my-bucket", "key": "my-key"}},
+                    "message": "2022-02-09bbb",
+                },
+                {
+                    "aws": {"s3": {"bucket": "my-bucket", "key": "my-key"}},
+                    "message": "2022-02-10ccc\n",
+                },
+            ],
+        )
+
     def test_s3_handler_with_sns(self):
         event = {
             "Records": [
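
For context (illustration only, not part of the patch): the fix splits on the line
separator *before* each occurrence of DD_MULTILINE_LOG_REGEX_PATTERN via a lookahead,
so the matched prefix stays attached to the chunk it starts, while the start-anchored
pattern is only used to confirm the payload begins with that prefix. A minimal sketch
of that behaviour, using a date regex as a stand-in for the configured pattern:

    import re

    data = "2022-02-08aaa\nbbbccc\n2022-02-09bbb\n"

    # Start-anchored check: does the payload begin with the configured prefix?
    start = re.compile(r"^\d{4}-\d{2}-\d{2}")
    # Split on the newline(s) preceding each prefix; the lookahead keeps the
    # prefix inside the following chunk instead of consuming it.
    split = re.compile(r"[\n\r\f]+(?=\d{4}-\d{2}-\d{2})")

    if start.match(data):
        print(split.split(data))
        # ['2022-02-08aaa\nbbbccc', '2022-02-09bbb\n']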