From 42bd174c3543afa5139aed17308e5ae00a4f537a Mon Sep 17 00:00:00 2001 From: Jonathan Poelen Date: Thu, 5 Sep 2024 11:06:35 +0200 Subject: [PATCH] [tool] log_siem_extractor.py: use new dita doc for check --- tools/log_siem_extractor.py | 83 +++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/tools/log_siem_extractor.py b/tools/log_siem_extractor.py index 266574358e..d8ed9e9024 100755 --- a/tools/log_siem_extractor.py +++ b/tools/log_siem_extractor.py @@ -2,6 +2,7 @@ import os import re import sys +import glob from typing import Iterable, TextIO, Callable, Any, NewType, TypeVar, NamedTuple from itertools import chain from os.path import join as path_join @@ -26,7 +27,7 @@ def identity(x: T) -> T: class DataLog(NamedTuple): formated_logs: set[str] - used_params: list[list[str]] + used_params: set[tuple[str, ...]] optional_params: set[str] LogFormatType = NewType('LogFormatType', dict[str, DataLog]) @@ -34,7 +35,7 @@ class DataLog(NamedTuple): def create_data_log(): return DataLog(formated_logs=set(), - used_params=[], + used_params=set(), optional_params=set()) @@ -67,7 +68,7 @@ def print_alert_on_list(msg: str, texts: Iterable[str], color: bool) -> None: def inject_optional_param_from_used_params(d: LogFormatType): - for _, datalog in d.items(): + for datalog in d.values(): if not datalog.used_params: continue counter_param = Counter(p for params in datalog.used_params for p in params) @@ -90,7 +91,7 @@ def update_dict(d: LogFormatType, data = ''.join(f' {colored(k)}={v}' for k, v in kvs_list) data = f'{cat}="{logid}"{data}' d[logid].formated_logs.add(data) - d[logid].used_params.append([k for k, v in kvs_list]) + d[logid].used_params.add(tuple(k for k, v in kvs_list)) proxy_logs = log_format_builder() vnc_logs = log_format_builder() @@ -166,7 +167,7 @@ def update(d, process, filenames): # noqa: ANN001, ANN202 for (logid, desc) in server_cert_regex.findall(text): log = rdp_logs[logid] log.formated_logs.add(f'{cat}="{logid}" {colored("description")}={desc}') - log.used_params.append(['description']) + log.used_params.add(('description',)) elif dirpath.startswith(f'{src_path}/mod/vnc'): update(vnc_logs, log6_process, filenames) @@ -214,8 +215,8 @@ def copy_in_rdp_and_vnc(logs): for k, d in logs.items(): rdp_logs[k].formated_logs.update(d.formated_logs) vnc_logs[k].formated_logs.update(d.formated_logs) - rdp_logs[k].used_params.extend(d.used_params) - vnc_logs[k].used_params.extend(d.used_params) + rdp_logs[k].used_params.update(d.used_params) + vnc_logs[k].used_params.update(d.used_params) copy_in_rdp_and_vnc(rdp_and_vnc_logs) @@ -247,31 +248,32 @@ def copy_in_rdp_and_vnc(logs): return proxy_logs, rdp_logs, vnc_logs -def extract_doc_siem(docfile: str) -> tuple[LogFormatType, # proxy - LogFormatType, # rdp - LogFormatType]: # vnc +def extract_doc_siem(docdir: str) -> tuple[LogFormatType, # proxy + LogFormatType, # rdp + LogFormatType]: # vnc # Format: #
# ( - # ( ... )+ # contains log - # ( ... )* # contains optional value (...) + # ( ... )+ # contains log + # ( ... )* # contains optional value (...) # )+ #
- reg_base = ( - r'(rdpproxy: \[rdpproxy\]|(?:rdpproxy: )?\[(?:RDP|VNC) Session\])' - r' (?:(?!type=|<).)*type=["”]([^"”]+)["”][^<]*' - ) - block_regex = re.compile(reg_base + r'((?:(?!).)*)', re.DOTALL) - log_regex = re.compile(reg_base + r'|((?:(?!).)*)', re.DOTALL) - optional_values_regex = re.compile(r'(\w+)') - + cat = r'rdpproxy: \[rdpproxy\]|(?:rdpproxy: )?\[(?:RDP|VNC) Session\]' + block_regex = re.compile(fr']*>(?:{cat}) [^<]*((?:(?!).)*)', re.DOTALL) + # split by rdpproxy: [rdpproxy] / wabengine: [wabauth] / others... / + split_part_regex = re.compile(r'(?=(?:(?:^|\n|]*>)(?:\w+: \[\w+\]|(?:rdpproxy: )?\[(?:RDP|VNC) Session\]) |]))', re.DOTALL | re.MULTILINE) + siem_log_regex = re.compile(fr'^(?:\n|]*>)?(({cat}) (?:(?!type=|<).)*type=["”]([^"”]+)["”][^<]*)', re.DOTALL | re.MULTILINE) + + # accept h, h and h is optional + # reject 'h :' and 'h.' + optional_values_regex = re.compile(r'(\w+)(?:,|[^[:symbol:] ]|\s+\w)') kv_siem_cpp_regex = re.compile(r'(\w+)=["”](?:[^"”\\]|\\.)+["”]') proxy_logs = log_format_builder() rdp_logs = log_format_builder() vnc_logs = log_format_builder() - d = { + dmap = { 'rdpproxy: [rdpproxy]': proxy_logs, 'rdpproxy: [RDP Session]': rdp_logs, 'rdpproxy: [VNC Session]': vnc_logs, @@ -279,26 +281,25 @@ def extract_doc_siem(docfile: str) -> tuple[LogFormatType, # proxy '[VNC Session]': vnc_logs, } - for m in block_regex.finditer(read_xmlfile(docfile)): - previous_logkey = None - for m in log_regex.finditer(m.group(0)): - note = m.group(3) - if not note: - data = m.group(0) - cat = m.group(1) - t = m.group(2) - previous_logkey = (cat, t) - kvs = kv_siem_cpp_regex.findall(data.replace('”', '"')) - datalog = d[cat][t] - datalog.formated_logs.add(data) - datalog.used_params.append(kvs) - else: - optional_values = optional_values_regex.findall(note) - if optional_values: - datalog = d[previous_logkey[0]][previous_logkey[1]] - datalog.optional_params.update(optional_values) - - for logs in d.values(): + for docfile in glob.glob('**/*.dita', root_dir=docdir, recursive=True): + for m in block_regex.finditer(read_xmlfile(f'{docdir}/{docfile}')): + previous_datalog = None + for part in split_part_regex.split(m.group(0)): + if m := siem_log_regex.match(part): + data = m.group(1) + cat = m.group(2) + ty = m.group(3) + kvs = kv_siem_cpp_regex.findall(data.replace('”', '"')) + datalog = dmap[cat][ty] + datalog.formated_logs.add(data) + datalog.used_params.add(tuple(kvs)) + previous_datalog = datalog + elif part.startswith('<'): # is + optional_values = optional_values_regex.findall(part) + if optional_values: + previous_datalog.optional_params.update(optional_values) + + for logs in dmap.values(): inject_optional_param_from_used_params(logs) return proxy_logs, rdp_logs, vnc_logs