From 718ab683679c0ce6603937b890c40041d8154ade Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Thu, 10 Aug 2023 16:07:46 +0000 Subject: [PATCH 01/21] Create a script called match-2-yara --- scripts/match-2-yar.py | 802 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 802 insertions(+) create mode 100644 scripts/match-2-yar.py diff --git a/scripts/match-2-yar.py b/scripts/match-2-yar.py new file mode 100644 index 000000000..f606a3021 --- /dev/null +++ b/scripts/match-2-yar.py @@ -0,0 +1,802 @@ +#!/usr/bin/env python2 +""" +match-2-yar + +Invoke capa to extract the capabilities of the given sample or list of samples, +and emit the matches as yara rules. + +When providing multiple samples or directories the tool will attempt to create +"super rules" based on overlapping signatures + + +Example:: + + $ python scripts/match-2-yar.py /tmp/suspicious.dll_ + ... + +Example:: + + $ python scripts/match-2-yar.py /tmp/suspicious.dll_ /tmp/suspicious2.dll_ + ... + +""" +import os +import sys +import logging +import argparse +import collections +import multiprocessing +import multiprocessing.pool +from datetime import date +from pathlib import Path + +import capa.main +import capa.rules +import capa.engine +import capa.helpers +import capa.features +import capa.exceptions +import capa.render.utils as rutils +import capa.render.verbose +import capa.features.freeze +import capa.render.result_document as rd +from capa.features.common import OS_AUTO +from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor + +import dnfile +from dncil.clr.token import Token + +from envi.memcanvas import MemoryCanvas +from vivisect.renderers import WorkspaceRenderer + +try: + from capstone import Cs, CS_ARCH_X86, CS_MODE_32, CS_MODE_64, CS_OPT_SYNTAX_INTEL + from mkyara import YaraGenerator + import yaramod +except ImportError: + print("""\nFailed to import a module try installing required Python libraries with the following: +pip install mkyara yaramod +""" ) + sys.exit(1) + + +logger = logging.getLogger("capa.match-2-yar") + + +######## Vivisect Related Classes and Functions ######## + +class BufferCanvas(MemoryCanvas): + """Subclass of Vivisect Memory canvas that captures + disassemlby output as a string rather than printing to std.out + """ + output = "" + + def addText(self, text, tag=None): + """Overwriting the method responsible for writing to std.out + """ + self.output += text + +def get_disassembly_output(vw, va, size): + """Get Vivisect's disassembly view for a given virtual addresss and size + + Args: + vw: Vivisect Workspace + va: Virtual Address to start disassembling from + size: size in bytes to disassemble + + Returns: + str: String containing vivisect's disassembly output + """ + rend = WorkspaceRenderer(vw) + mcav = BufferCanvas(vw) + mcav.renderMemory(va, size, rend=rend) + return mcav.output + + +def get_comment_for_func(vw, funcva): + """Get a CodeFeature comment for a function + + This function gets the size of a function and + uses that to get a dump of the function disassembly + with get_dissasembly_output + + Args: + vw: Vivisect Workspace + funcva: Virtual Address of function to analyze + + Returns: + str: String containing disassembly output for a function + """ + funcsize = get_function_size(vw, funcva) + return get_disassembly_output(vw, funcva, funcsize) + +def get_comment_for_cb(vw, va): + """Get a CodeFeature comment for a Code Block + + This function gets the size of a code block and + uses that to get a dump of the code block disassembly + with 
get_dissasembly_output + + Args: + vw: Vivisect Workspace + va: Virtual Address of Codeblock to analyze + + Returns: + str: String containing disassembly output for a function + """ + cb = vw.getCodeBlock(va) + cbva, cbsize, cbfunc = cb + return get_disassembly_output(vw, cbva, cbsize) + +def get_function_size(vw, funcva): + """Return the size of a function based on vivisect analysis + + Args: + vw: Vivisect Workspace + funcva: Virtual Address of function to analyze + + Returns: + int: size of the function + """ + fsize = 0 + if funcva not in vw.getFunctions(): + funcva = vw.getFunction(funcva) + if funcva is None: + raise Exception('Given funcva not a function or within a known function') + func_blocks = [cbva for cbva, _, _ in vw.getFunctionBlocks(funcva)] + # Figure out the size of the first linear chunk + # in this function... + cb = vw.getCodeBlock(funcva) + if cb[0] not in func_blocks: + raise Exception("funcva not in given func") + while cb is not None: + cbva, cbsize, cbfunc = cb + if cbfunc != funcva: + break + fsize += cbsize + cb = vw.getCodeBlock(cbva+cbsize) + + if fsize == 0: + raise Exception("0 length function??!?1") + + return fsize + +def get_function_bytes(vw, funcva): + """Return the bytes from a function + + Args: + vw: Vivisect Workspace + funcva: Virtual Address of function to analyze + + Returns: + bytes: bytes of a function + """ + fsize = get_function_size(vw, funcva) + return vw.readMemory(funcva, fsize) + +def get_cb_bytes(vw, va): + """Return the bytes from a code block + + Args: + vw: Vivisect Workspace + va: Virtual Address to analyze + + Returns: + int: size of the function + """ + cb = vw.getCodeBlock(va) + cbva, cbsize, cbfunc = cb + return vw.readMemory(cbva, cbsize) + + +######## Capstone Related Classes and Functions ######## + +VIVI_ARCH_TO_CAPSTONE = { + 'i386': (CS_ARCH_X86, CS_MODE_32), + 'amd64': (CS_ARCH_X86, CS_MODE_64) +} + +def mkyara_sig_generation(start_va, bytez, arch, mode): + """Mask x86/x64 instructions and generate a signature + + This uses mkyara's logic for now, but an area for research to + build out the system to test resiliency. 
+ + Args: + start_va: virtual address of first instruction + bytez: byte string containing raw bytes of the function + arch: Capstone Architecture to use (CS_ARCH_X86 covers 32 and 64bit x86) + mode: Capstone mode to choose between 32 and 64 bit + + Returns: + str: signature string in the form of "AA BB CC DD" + """ + gen = YaraGenerator("normal", arch, mode) + gen.add_chunk(bytez, offset=start_va) + + md = Cs(arch, mode) + md.detail = True + md.syntax = CS_OPT_SYNTAX_INTEL + + sig = "" + disasm = md.disasm(bytez, start_va) + for ins in disasm: + rule_part, comment = gen._process_instruction(ins) + rule_part = gen.format_hex(rule_part) + sig += rule_part + " " + + return sig + + +def genSigAndMask(start_va, bytez, vivi_arch='i386'): + """Generate a signature and masked signature for a fuction virtual address + + This function performs the translation from vivisect arch + to the mode and arch needed by capstone + + Args: + start_va: virtual address of first instruction + bytez: byte string containing raw bytes of the function + vivi_arch: Vivisect architecture + + Returns: + str: signature string in the form of "AA BB CC DD" + """ + + arch, mode = VIVI_ARCH_TO_CAPSTONE[vivi_arch] + + # Other option for normal is loose, but we won't use those here + return mkyara_sig_generation(start_va, bytez, arch, mode) + +######## .NET Related Classes and Functions ######## + +def format_operand(pe, op): + """Return a string representation of a .NET operand + + Use a dnfile object to reference .NET tables to understand + methods, classes, and strings + + Args: + pe: dnfile object for a .NET PE + op: dncil operand from an instruction + Returns: + str: string representation of an operand + """ + if isinstance(op, Token): + op = capa.features.extractors.dnfile.helpers.resolve_dotnet_token(pe, op) + + if isinstance(op, str): + return f'"{op}"' + elif isinstance(op, int): + return hex(op) + elif isinstance(op, list): + return f"[{', '.join(['({:04X})'.format(x) for x in op])}]" + elif isinstance(op, dnfile.mdtable.MemberRefRow) and not isinstance(op.Class.row, dnfile.mdtable.TypeSpecRow): + return f"{str(op.Class.row.TypeNamespace)}.{op.Class.row.TypeName}::{op.Name}" + elif isinstance(op, (dnfile.mdtable.FieldRow, dnfile.mdtable.MethodDefRow)): + return f"{op.Name}" + else: + return "" if op is None else str(op) + +def get_sig_and_mask_for_dotnet_func(dnpe, body): + """Return the comment, sig, and bytes of a .NET Method + + Iterate a method body to get IL bytes and mask the operand + values to create a more flexible signature + + Args: + dnpe: dnfile object for a .NET PE + body: dncil method body + Returns: + str comment: Comment string with formatted .NET IL disassembly + str formatted_sig: signature as string with hex and wildcards + str func_bytes: hex bytes of a .NET method + """ + + comment = "" + sig = "" + func_bytes = "" + for insn in body.instructions: + comment += ( + "{:04X}".format(insn.offset) + + " " + + f"{' '.join('{:02x}'.format(b) for b in insn.get_bytes()) : <20}" + + f"{str(insn.opcode) : <15}" + + format_operand(dnpe, insn.operand) + + "\n" + ) + + sig += insn.get_opcode_bytes().hex() + func_bytes += insn.get_opcode_bytes().hex() + + if insn.operand: + sig += '??' 
* len(insn.get_operand_bytes()) + func_bytes += insn.get_operand_bytes().hex() + + # Format the sig to be in the same style as the vivi portion (bytes seperated by spaces) + formatted_sig = "" + for idx, val in enumerate(sig): + if idx > 0 and idx % 2 == 0: + formatted_sig += " " + formatted_sig += val + + + return comment, formatted_sig, func_bytes + +######## CodeFeature Extractor Related Classes and Functions ######## + +class CodeFeature(): + """Basic object that that will be used to create yara rules + """ + def __init__(self, sig: str, comment: str, bytez: bytes, filemd5:str): + self.sig = sig + self.comment = comment + self.bytez = bytez + self.filemd5 = filemd5 + +def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor): + """Returns a dictionary mapping a filemd5 to a list of CodeFeatures + + This function operates on x86/x64 PE files and creates + CodeFeatures based on basic block and function CAPA matches + + Args: + doc (rd.ResultDocument): CAPA result docs + extractor: CAPA analysis extractor object + Returns: + dict: dictionary with a key of the filemd5 mapped to a list of CodeFeatures + """ + # Grab the vivisect workspace object + try: + file_vw = extractor.vw + except: + print("No extractor workspace") + file_vw = None + raise + + # Get the filemd5 + filemd5 = doc.meta.sample.md5 + + + cb_matches = collections.defaultdict(set) + func_matches = collections.defaultdict(set) + + for rule in rutils.capability_rules(doc): + if rule.meta.scope == capa.rules.FUNCTION_SCOPE: + for addr, _ in rule.matches: + func_matches[addr.value].add(rule.meta.name) + elif rule.meta.scope == capa.rules.BASIC_BLOCK_SCOPE: + for addr, _ in rule.matches: + cb_matches[addr.value].add(rule.meta.name) + else: + # file scope + pass + + code_features = [] + + for addr, rules in cb_matches.items(): + comment = f"Basic Block at 0x{addr:08x}@{filemd5} with {len(rules)} features:\n" + for rule_name in sorted(rules): + comment += f" - {rule_name}\n" + comment += get_comment_for_cb(file_vw, addr) + + bytez = get_cb_bytes(file_vw, addr) + sig = genSigAndMask(addr, bytez, doc.meta.analysis.arch) + code_features.append(CodeFeature(sig,comment,bytez,filemd5)) + + for addr, rules in func_matches.items(): + comment = f"function at 0x{addr:08x}@{filemd5} with {len(rules)} features:\n" + for rule_name in sorted(rules): + comment += f" - {rule_name}\n" + comment += get_comment_for_func(file_vw, addr) + + bytez = get_function_bytes(file_vw, addr) + sig = genSigAndMask(addr, bytez, doc.meta.analysis.arch) + code_features.append(CodeFeature(sig,comment,bytez,filemd5)) + + + if len(code_features) == 0: + logger.warning("No code features found for %s", filemd5) + return {filemd5: code_features} + +def get_code_features_for_dotnet_doc(doc: rd.ResultDocument, extractor): + """Returns a dictionary mapping a filemd5 to a list of CodeFeatures + + This function operates on .NET PE files and creates + CodeFeatures based on .NET method CAPA matches + + Args: + doc (rd.ResultDocument): CAPA result docs + extractor: CAPA analysis extractor object + Returns: + dict: dictionary with a key of the filemd5 mapped to a list of CodeFeatures + """ + # Grab the vivisect workspace object + try: + dnpe = extractor.pe + except: + print("No dnpe file found") + raise + + filemd5 = doc.meta.sample.md5 + + func_matches = collections.defaultdict(set) + + for rule in rutils.capability_rules(doc): + if rule.meta.scope == capa.rules.FUNCTION_SCOPE: + for addr, _ in rule.matches: + func_matches[addr.value].add(rule.meta.name) + else: + # 
file scope + pass + + # Funcs is the cache of functions we need to reference to get + # the underlying dnfile object + funcs = list(extractor.get_functions()) + + # Return list of CodeFeature objects + code_features = [] + + logger.debug(f"Building CodeFeatures for {len(func_matches.keys())} functions in {filemd5}") + for addr, rules in func_matches.items(): + func_name = extractor.token_cache.get_method(addr) + comment = f"function {func_name} 0x{addr:08x}@{filemd5} with {len(rules)} features:\n" + for rule_name in sorted(rules): + comment += f" - {rule_name}\n" + + # Get the CILMethodBody object by working with the function + # collection we grabbed earlier + f = [x for x in funcs if x.address.real == addr][0] + func_comment, sig, bytez = get_sig_and_mask_for_dotnet_func(dnpe, f.inner) + comment += func_comment + + code_features.append(CodeFeature(sig,comment,bytez,filemd5)) + + + if len(code_features) == 0: + logger.warning("No code features found for %s", filemd5) + return {filemd5: code_features} + +######## CAPA Entrypoints ######## + +def run_capa_and_get_features(args): + """Main CAPA analysis entrypoint + + This function kicks off CAPA analysis and builds CodeFeatures that + will be used to build yara rules in the main thread. + + Args: + args: Tuple containing the following + - rules: CAPA rules loaded from a repo + - sig_paths: Path to signatures used for library identification + - format: Format for processing (dotnet or auto are the expected values) + - os_: Operating system specified + - path: Path to file for analyis + Returns: + dict: dictionary with the following keys + - path: Path to file that was analyzed + - status: Status of analysis (error or ok) + - error (Optional): Details of errors that occured + - ok (Optional): Dictionary mapping the filemd5 to a list of CodeFeatures + """ + + rules, sig_paths, format, os_, path = args + should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) + + try: + extractor = capa.main.get_extractor( + path, format, os_, capa.main.BACKEND_VIV, sig_paths, should_save_workspace, disable_progress=True + ) + except capa.main.UnsupportedFormatError: + # i'm 100% sure if multiprocessing will reliably raise exceptions across process boundaries. + # so instead, return an object with explicit success/failure status. + # + # if success, then status=ok, and results found in property "ok" + # if error, then status=error, and human readable message in property "error" + return { + "path": path, + "status": "error", + "error": f"input file does not appear to be a PE file: {path}", + } + except capa.main.UnsupportedRuntimeError: + return { + "path": path, + "status": "error", + "error": "unsupported runtime or Python interpreter", + } + except Exception as e: + return { + "path": path, + "status": "error", + "error": f"unexpected error: {e}", + } + + meta = capa.main.collect_metadata([], path, format, os_, [], extractor) + logger.info(f"Collecting capabilities for {path}") + capabilities, counts = capa.main.find_capabilities(rules, extractor, disable_progress=True) + + meta.analysis.feature_counts = counts["feature_counts"] + meta.analysis.library_functions = counts["library_functions"] + meta.analysis.layout = capa.main.compute_layout(rules, extractor, capabilities) + + if capa.main.has_file_limitation(rules, capabilities): + # bail if capa encountered file limitation e.g. a packed binary + # do show the output in verbose mode, though. 
+ if not (args.verbose or args.vverbose or args.json): + return { + "path": path, + "status": "error", + "error": f"Encountered file limitation", + } + + doc = rd.ResultDocument.from_capa(meta, rules, capabilities) + logger.info(f"Building code features for {path}") + if type(extractor) == DnfileFeatureExtractor: + # Handle .NET files + features = get_code_features_for_dotnet_doc(doc, extractor) + else: + # Handle other files + features = get_code_features_for_capa_doc(doc, extractor) + return {"path": path, "status": "ok", "ok": features} + + +def multi_process_capa(argv=None): + """CAPA argument handler and multiprocessing manager + + This function processes CLI arguments and kicks of capa analysis + and extacts CodeFeatures into a dictionary that maps filemd5s + to a list of CodeFeatures that will be used to build yara rules + + Args: + argv: + Returns: + dict: dictionary mapping filemd5's processed to a list of CodeFeatures + """ + if argv is None: + argv = sys.argv[1:] + + parser = argparse.ArgumentParser(description="Build YARA rules for CAPA matches") + capa.main.install_common_args(parser, wanted={"rules", "signatures", "format", "os"}) + parser.add_argument("input", type=str, nargs="+", help="Path to directory or files to analyze") + parser.add_argument( + "-n", "--parallelism", type=int, default=multiprocessing.cpu_count(), help="parallelism factor" + ) + parser.add_argument("--no-mp", action="store_true", help="disable subprocesses") + args = parser.parse_args(args=argv) + capa.main.handle_common_args(args) + + try: + rules = capa.main.get_rules(args.rules) + logger.info("successfully loaded %s rules", len(rules)) + except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: + logger.error("%s", str(e)) + return -1 + + try: + sig_paths = capa.main.get_signatures(args.signatures) + except IOError as e: + logger.error("%s", str(e)) + return -1 + + samples = [] + for p in args.input: + path = Path(p) + if not path.exists(): + raise ValueError("Invalid path {p}") + if path.is_dir(): + samples.extend([x for x in path.rglob("*")]) + elif path.is_file(): + samples.append(path) + logger.info("Starting to process %s files", len(samples)) + + + cpu_count = multiprocessing.cpu_count() + + def pmap(f, args, parallelism=cpu_count): + """apply the given function f to the given args using subprocesses""" + return multiprocessing.Pool(parallelism).imap(f, args) + + def tmap(f, args, parallelism=cpu_count): + """apply the given function f to the given args using threads""" + return multiprocessing.pool.ThreadPool(parallelism).imap(f, args) + + def map(f, args, parallelism=None): + """apply the given function f to the given args in the current thread""" + for arg in args: + yield f(arg) + + if args.no_mp: + if args.parallelism == 1: + logger.debug("using current thread mapper") + mapper = map + else: + logger.debug("using threading mapper") + mapper = tmap + else: + logger.debug("using process mapper") + mapper = pmap + + results = {} + for result in mapper( + run_capa_and_get_features, + [(rules, sig_paths, args.format, OS_AUTO, sample) for sample in samples], + parallelism=args.parallelism, + ): + if result["status"] == "error": + logger.warning(f'{result["path"]}: {result["error"]}') + elif result["status"] == "ok": + results.update(result["ok"]) + else: + raise ValueError(f"unexpected status: {result['status']}") + + logger.info(f"Done processing {len(samples)} samples") + + return results + +######## YARA related functions ######## + +CODE_FEATURES_REFERENCED = [] + +def 
build_rule_from_combo(combo_dict: dict, **kwargs): + """Build a yaramod yara rule using a combination dictionary + + Args: + combo_dict: Dictionary of features that all matched on a group of files + Returns: + yaramod.Rule: yaramod representation of a yara rule generated for the file combination + """ + + # we're going to use this to create unique code features to insert the comment strings + global CODE_FEATURES_REFERENCED + + + # Build metadata for the rule + rule_name = "super_rule_" + "_".join([x[:5] for x in sorted(combo_dict["files"])]) + metadict = dict( + author=kwargs.get("author", "CAPA Matches"), + date_created=kwargs.get("date_created", date.today().isoformat()), + date_modified=kwargs.get("date_modified", date.today().isoformat()), + description=kwargs.get("description", ""), + ) + + rule = yaramod.YaraRuleBuilder().with_name(rule_name) + for metakey, metavalue in metadict.items(): + if metavalue is not None: + rule = rule.with_string_meta(metakey, metavalue) + + # Add in hash meta + rule = rule.with_name(rule_name) + for hsh in combo_dict["files"]: + rule = rule.with_string_meta("md5", hsh) + + conditions = [yaramod.of(yaramod.all(), yaramod.them())] + for codefeature in combo_dict['features']: + idx = len(CODE_FEATURES_REFERENCED) + hexstr = yaramod.YaraHexStringBuilder() + for byte in codefeature.sig.split(" "): + if byte == "??": + hexstr = hexstr.add(yaramod.wildcard()) + elif byte == '': + continue + else: + hexstr = hexstr.add(yaramod.YaraHexStringBuilder(int(byte, 16))) + rule = rule.with_hex_string(f"$c{idx}", hexstr.get()) + CODE_FEATURES_REFERENCED.append(codefeature) + + + if len(conditions) == 1: + # No fancy expression needed + rule = rule.with_condition(conditions[0].get()) + else: + rule = rule.with_condition( + yaramod.conjunction(conditions, linebreaks=True).get() + ) + return rule.get() + +TAB_CHAR = " "*4 + +def replace_tabs_with_spaces(yara_text): + """Replacing tabs with spaces in yara rule + + Args: + yara_text: string of full yara rules text + Returns: + str: formatted yara rules text + """ + return yara_text.replace("\t", TAB_CHAR) + +def add_comments_to_yara_file(yara_text): + """Add comments to yara file text + + Args: + yara_text: string of full yara rules text + Returns: + str: formatted yara rules text + """ + + for idx, feature in enumerate(CODE_FEATURES_REFERENCED): + # Find the str in yara_text + # replace it with the comment + search_str = f"$c{idx} =" + comment_str = "/*\n" + comment_str += ("\n"+2*TAB_CHAR).join(feature.comment.split("\n")) + comment_str += "*/\n" + 2*TAB_CHAR + search_str + yara_text = yara_text.replace(search_str, comment_str) + return yara_text + +def build_yara_ruleset(files_dict, **kwargs): + """Build a YARA ruleset string based on CodeFeatures + + Args: + files_dict: dictionary mapping filemd5s to list of CodeFeatures + Returns: + str: YARA ruleset + """ + + # First we'll build a dict with a key based on the masked bytes from each + # Code feature + similarity_dict = {} + for filemd5, features in files_dict.items(): + for value in features: + if value.sig not in similarity_dict: + similarity_dict[value.sig] = { + "values":[value], + "files":set([value.filemd5]) + } + else: + similarity_dict[value.sig]['values'].append(value) + similarity_dict[value.sig]['files'].add(value.filemd5) + + # Next we build out a combodict and track which files have which combos of features + file_combinations = {} + for feature, result_dict in similarity_dict.items(): + sample_combo_key = ":".join(list(sorted(result_dict["files"]))) + if 
sample_combo_key not in file_combinations: + file_combinations[sample_combo_key] = dict() + file_combinations[sample_combo_key]["files"] = sorted( + result_dict["files"] + ) + file_combinations[sample_combo_key]["feature_count"] = 0 + file_combinations[sample_combo_key]["features"] = [] + + # Use the full code feature from the alphabetical match + chosen_code_version = sorted(result_dict['values'], key=lambda x: x.filemd5)[0] + file_combinations[sample_combo_key]["features"].append( + chosen_code_version + ) + file_combinations[sample_combo_key]["feature_count"] += 1 + + # Create a list of combo keys and sort them so we get deterministic output + combo_keys = sorted(file_combinations.keys(), key=lambda x: (len(x), x)) + + # Build the YARA rule set based on the grouping + yara_file = yaramod.YaraFileBuilder() + observed_files = [] + + for key in combo_keys: + combo_dict = file_combinations[key] + rule = build_rule_from_combo( + combo_dict, **kwargs + ) + if rule is not None: + observed_files.extend(combo_dict["files"]) + yara_file = yara_file.with_rule(rule) + + # Turn the yaramod "file" into a string + yara_text = yara_file.get().text_formatted + + yara_text = replace_tabs_with_spaces(yara_text) + + # Add our comments to the file + yara_text = add_comments_to_yara_file(yara_text) + + return yara_text + + + +def main(argv=None): + all_features = multi_process_capa(argv) + print(build_yara_ruleset(all_features)) + +if __name__ == "__main__": + sys.exit(main()) From 44104f2525c5be5c6bdebd63bd9baf21d0cf2791 Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Fri, 11 Aug 2023 16:47:45 -0400 Subject: [PATCH 02/21] Add tests and address sources of non-deterministic output in match-2-yar --- scripts/match-2-yar.py | 32 ++++++++++------ tests/test_scripts.py | 84 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 11 deletions(-) diff --git a/scripts/match-2-yar.py b/scripts/match-2-yar.py index f606a3021..8dcffed9d 100644 --- a/scripts/match-2-yar.py +++ b/scripts/match-2-yar.py @@ -273,8 +273,12 @@ def format_operand(pe, op): return f"[{', '.join(['({:04X})'.format(x) for x in op])}]" elif isinstance(op, dnfile.mdtable.MemberRefRow) and not isinstance(op.Class.row, dnfile.mdtable.TypeSpecRow): return f"{str(op.Class.row.TypeNamespace)}.{op.Class.row.TypeName}::{op.Name}" - elif isinstance(op, (dnfile.mdtable.FieldRow, dnfile.mdtable.MethodDefRow)): + elif isinstance(op, (dnfile.mdtable.FieldRow, dnfile.mdtable.MethodDefRow, dnfile.mdtable.MemberRefRow)): return f"{op.Name}" + elif isinstance(op, (dnfile.mdtable.TypeDefRow, dnfile.mdtable.TypeRefRow)): + return f"{op.TypeNamespace}.{op.TypeName}" + elif isinstance(op, (dnfile.mdtable.TypeSpecRow, dnfile.mdtable.MethodSpecRow)): + return f"{str(op.struct)}" else: return "" if op is None else str(op) @@ -522,21 +526,27 @@ def run_capa_and_get_features(args): if capa.main.has_file_limitation(rules, capabilities): # bail if capa encountered file limitation e.g. a packed binary # do show the output in verbose mode, though. 
- if not (args.verbose or args.vverbose or args.json): return { "path": path, "status": "error", "error": f"Encountered file limitation", } - doc = rd.ResultDocument.from_capa(meta, rules, capabilities) - logger.info(f"Building code features for {path}") - if type(extractor) == DnfileFeatureExtractor: - # Handle .NET files - features = get_code_features_for_dotnet_doc(doc, extractor) - else: - # Handle other files - features = get_code_features_for_capa_doc(doc, extractor) + try: + doc = rd.ResultDocument.from_capa(meta, rules, capabilities) + logger.info(f"Building code features for {path}") + if type(extractor) == DnfileFeatureExtractor: + # Handle .NET files + features = get_code_features_for_dotnet_doc(doc, extractor) + else: + # Handle other files + features = get_code_features_for_capa_doc(doc, extractor) + except Exception as e: + return { + "path": path, + "status": "error", + "error": f"unexpected error: {e}", + } return {"path": path, "status": "ok", "ok": features} @@ -582,7 +592,7 @@ def multi_process_capa(argv=None): for p in args.input: path = Path(p) if not path.exists(): - raise ValueError("Invalid path {p}") + raise ValueError(f"Invalid path {p}") if path.is_dir(): samples.extend([x for x in path.rglob("*")]) elif path.is_file(): diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 7c91bc573..cef5e4b4f 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -10,6 +10,7 @@ import logging import textwrap import subprocess +from datetime import date from pathlib import Path import pytest @@ -26,6 +27,8 @@ def get_script_path(s: str): def get_file_path(): return str(CD / "data" / "9324d1a8ae37a36ae560c37448c9705a.exe_") +def get_data_path(p: str): + return str(CD / "data" / p ) def get_rules_path(): return str(CD / ".." 
/ "rules") @@ -68,6 +71,87 @@ def test_bulk_process(tmp_path): p = run_program(get_script_path("bulk-process.py"), [str(t.parent)]) assert p.returncode == 0 +@pytest.mark.parametrize( + "script,args,expected_output_path", + [ + # Test match-2-yar x86 EXE + pytest.param( + "match-2-yar.py", + [ + get_data_path("9324d1a8ae37a36ae560c37448c9705a.exe_") + ], + "yara/expected_9324d1a8ae37a36ae560c37448c9705a.exe_.yar" + ), + # Test match-2-yar x64 EXE + pytest.param( + "match-2-yar.py", + [ + get_data_path("c2bb17c12975ea61ff43a71afd9c3ff111d018af161859abae0bdb0b3dae98f9.exe_") + ], + "yara/expected_c2bb17c12975e.yar" + ), + # Test match-2-yar x86 .NET EXE + pytest.param( + "match-2-yar.py", + [ + "-f", + "dotnet", + get_data_path("dotnet/1c444ebeba24dcba8628b7dfe5fec7c6.exe_"), + + ], + "yara/expected_1c444ebeba24dcba8628b7dfe5fec7c6.exe_.yar" + ), + # Test match-2-yar files with multiple X86 PEs + pytest.param( + "match-2-yar.py", + [ + get_data_path("Practical Malware Analysis Lab 03-04.exe_"), + get_data_path("Practical Malware Analysis Lab 11-03.exe_"), + get_data_path("Practical Malware Analysis Lab 16-01.exe_") + ], + "yara/expected_pma_03-04.exe_11-03.exe_16-01.exe" + ), + # Test match-2-yar files with CAPA file limitations are filtered out of multi sample + pytest.param( + "match-2-yar.py", + [ + get_data_path("Practical Malware Analysis Lab 01-01.exe_"), + get_data_path("Practical Malware Analysis Lab 01-02.exe_") + ], + "yara/expected_pma_01-01.exe_01-01.exe" + ), + + # Test match-2-yar multiple x86 .NET PE + pytest.param( + "match-2-yar.py", + [ + "-f", + "dotnet", + get_data_path("dotnet/1c444ebeba24dcba8628b7dfe5fec7c6.exe_"), + get_data_path("dotnet/692f7fd6d198e804d6af98eb9e390d61.exe_"), + + ], + "yara/expected_1c444ebe_692f7fd6.yar" + ), + ], +) +def test_script_expected_output(script, args, expected_output_path): + script_path = get_script_path(script) + with open(get_data_path(expected_output_path), 'rb') as f: + expected_output = f.read() + + # Update dates in expected output to be todays date + dates_to_replace = [ + b"2023-08-10", + ] + for dt in dates_to_replace: + expected_output = expected_output.replace(dt, date.today().isoformat().encode('utf8')) + + p = run_program(script_path, args) + + assert p.returncode == 0 + assert p.stdout.decode('utf8') == expected_output.decode('utf8') + def run_program(script_path, args): args = [sys.executable] + [script_path] + args From 913084a973556e391696d354b92fffa91e5be5cc Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Fri, 11 Aug 2023 16:56:33 -0400 Subject: [PATCH 03/21] Update the test repo version to include yara test files --- tests/data | 2 +- tests/test_scripts.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data b/tests/data index faf741a53..6fc8f093b 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit faf741a538224f52d4412468f910d52a70911662 +Subproject commit 6fc8f093b86f7213cbffd510ed029dbf28e439d0 diff --git a/tests/test_scripts.py b/tests/test_scripts.py index cef5e4b4f..91a1ae340 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -118,7 +118,7 @@ def test_bulk_process(tmp_path): get_data_path("Practical Malware Analysis Lab 01-01.exe_"), get_data_path("Practical Malware Analysis Lab 01-02.exe_") ], - "yara/expected_pma_01-01.exe_01-01.exe" + "yara/expected_pma_01-01.exe_01-02.exe" ), # Test match-2-yar multiple x86 .NET PE From 31789361741468c905bef1e2d6d3264adf2f88df Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Fri, 11 Aug 2023 16:59:44 -0400 
Subject: [PATCH 04/21] Update changelog for match-2-yar --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bd9b6e7f..e6d9034ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ - ELF: implement file import and export name extractor #1607 #1608 @Aayush-Goel-04 - bump pydantic from 1.10.9 to 2.1.1 #1582 @Aayush-Goel-04 - develop script to highlight the features that are not used during matching #331 @Aayush-Goel-04 +- add script to create code-based YARA based on CAPA match details called match-2-yar @jconnor0426 + ### Breaking Changes From e6edf434e3aa082a7f722284f8cbf462e20035bf Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Tue, 22 Aug 2023 15:15:24 -0400 Subject: [PATCH 05/21] Updating to be compliant with code style of project --- scripts/match-2-yar.py | 326 ++++++++++++++++++++++------------------- tests/test_scripts.py | 83 +++++------ 2 files changed, 220 insertions(+), 189 deletions(-) diff --git a/scripts/match-2-yar.py b/scripts/match-2-yar.py index 8dcffed9d..08a095e8f 100644 --- a/scripts/match-2-yar.py +++ b/scripts/match-2-yar.py @@ -1,8 +1,16 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 """ +Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. +You may obtain a copy of the License at: [package root]/LICENSE.txt +Unless required by applicable law or agreed to in writing, software distributed under the License + is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and limitations under the License. + match-2-yar -Invoke capa to extract the capabilities of the given sample or list of samples, +Invoke capa to extract the capabilities of the given sample or list of samples, and emit the matches as yara rules. 
When providing multiple samples or directories the tool will attempt to create @@ -27,8 +35,14 @@ import collections import multiprocessing import multiprocessing.pool -from datetime import date +from typing import Set, Dict, List from pathlib import Path +from datetime import date + +import dnfile +from envi.memcanvas import MemoryCanvas +from dncil.clr.token import Token +from vivisect.renderers import WorkspaceRenderer import capa.main import capa.rules @@ -43,39 +57,37 @@ from capa.features.common import OS_AUTO from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor -import dnfile -from dncil.clr.token import Token - -from envi.memcanvas import MemoryCanvas -from vivisect.renderers import WorkspaceRenderer - try: - from capstone import Cs, CS_ARCH_X86, CS_MODE_32, CS_MODE_64, CS_OPT_SYNTAX_INTEL - from mkyara import YaraGenerator import yaramod + from mkyara import YaraGenerator + from capstone import CS_MODE_32, CS_MODE_64, CS_ARCH_X86, CS_OPT_SYNTAX_INTEL, Cs except ImportError: - print("""\nFailed to import a module try installing required Python libraries with the following: + print( + """\nFailed to import a module try installing required Python libraries with the following: pip install mkyara yaramod -""" ) +""" + ) sys.exit(1) logger = logging.getLogger("capa.match-2-yar") -######## Vivisect Related Classes and Functions ######## +# Vivisect Related Classes and Functions + class BufferCanvas(MemoryCanvas): """Subclass of Vivisect Memory canvas that captures disassemlby output as a string rather than printing to std.out """ + output = "" def addText(self, text, tag=None): - """Overwriting the method responsible for writing to std.out - """ + """Overwriting the method responsible for writing to std.out""" self.output += text + def get_disassembly_output(vw, va, size): """Get Vivisect's disassembly view for a given virtual addresss and size @@ -83,7 +95,7 @@ def get_disassembly_output(vw, va, size): vw: Vivisect Workspace va: Virtual Address to start disassembling from size: size in bytes to disassemble - + Returns: str: String containing vivisect's disassembly output """ @@ -95,28 +107,29 @@ def get_disassembly_output(vw, va, size): def get_comment_for_func(vw, funcva): """Get a CodeFeature comment for a function - - This function gets the size of a function and - uses that to get a dump of the function disassembly + + This function gets the size of a function and + uses that to get a dump of the function disassembly with get_dissasembly_output Args: vw: Vivisect Workspace funcva: Virtual Address of function to analyze - + Returns: str: String containing disassembly output for a function """ funcsize = get_function_size(vw, funcva) return get_disassembly_output(vw, funcva, funcsize) + def get_comment_for_cb(vw, va): """Get a CodeFeature comment for a Code Block - - This function gets the size of a code block and - uses that to get a dump of the code block disassembly + + This function gets the size of a code block and + uses that to get a dump of the code block disassembly with get_dissasembly_output - + Args: vw: Vivisect Workspace va: Virtual Address of Codeblock to analyze @@ -128,13 +141,14 @@ def get_comment_for_cb(vw, va): cbva, cbsize, cbfunc = cb return get_disassembly_output(vw, cbva, cbsize) + def get_function_size(vw, funcva): """Return the size of a function based on vivisect analysis Args: vw: Vivisect Workspace funcva: Virtual Address of function to analyze - + Returns: int: size of the function """ @@ -142,7 +156,7 @@ def get_function_size(vw, 
funcva): if funcva not in vw.getFunctions(): funcva = vw.getFunction(funcva) if funcva is None: - raise Exception('Given funcva not a function or within a known function') + raise Exception("Given funcva not a function or within a known function") func_blocks = [cbva for cbva, _, _ in vw.getFunctionBlocks(funcva)] # Figure out the size of the first linear chunk # in this function... @@ -154,33 +168,35 @@ def get_function_size(vw, funcva): if cbfunc != funcva: break fsize += cbsize - cb = vw.getCodeBlock(cbva+cbsize) + cb = vw.getCodeBlock(cbva + cbsize) if fsize == 0: raise Exception("0 length function??!?1") - + return fsize + def get_function_bytes(vw, funcva): """Return the bytes from a function - + Args: vw: Vivisect Workspace funcva: Virtual Address of function to analyze - + Returns: bytes: bytes of a function """ fsize = get_function_size(vw, funcva) return vw.readMemory(funcva, fsize) + def get_cb_bytes(vw, va): """Return the bytes from a code block - + Args: vw: Vivisect Workspace va: Virtual Address to analyze - + Returns: int: size of the function """ @@ -189,12 +205,10 @@ def get_cb_bytes(vw, va): return vw.readMemory(cbva, cbsize) -######## Capstone Related Classes and Functions ######## +# Capstone Related Classes and Functions + +VIVI_ARCH_TO_CAPSTONE = {"i386": (CS_ARCH_X86, CS_MODE_32), "amd64": (CS_ARCH_X86, CS_MODE_64)} -VIVI_ARCH_TO_CAPSTONE = { - 'i386': (CS_ARCH_X86, CS_MODE_32), - 'amd64': (CS_ARCH_X86, CS_MODE_64) -} def mkyara_sig_generation(start_va, bytez, arch, mode): """Mask x86/x64 instructions and generate a signature @@ -207,7 +221,7 @@ def mkyara_sig_generation(start_va, bytez, arch, mode): bytez: byte string containing raw bytes of the function arch: Capstone Architecture to use (CS_ARCH_X86 covers 32 and 64bit x86) mode: Capstone mode to choose between 32 and 64 bit - + Returns: str: signature string in the form of "AA BB CC DD" """ @@ -226,9 +240,9 @@ def mkyara_sig_generation(start_va, bytez, arch, mode): sig += rule_part + " " return sig - -def genSigAndMask(start_va, bytez, vivi_arch='i386'): + +def genSigAndMask(start_va, bytez, vivi_arch="i386"): """Generate a signature and masked signature for a fuction virtual address This function performs the translation from vivisect arch @@ -238,24 +252,26 @@ def genSigAndMask(start_va, bytez, vivi_arch='i386'): start_va: virtual address of first instruction bytez: byte string containing raw bytes of the function vivi_arch: Vivisect architecture - + Returns: str: signature string in the form of "AA BB CC DD" """ - + arch, mode = VIVI_ARCH_TO_CAPSTONE[vivi_arch] # Other option for normal is loose, but we won't use those here return mkyara_sig_generation(start_va, bytez, arch, mode) -######## .NET Related Classes and Functions ######## + +# .NET Related Classes and Functions + def format_operand(pe, op): """Return a string representation of a .NET operand - + Use a dnfile object to reference .NET tables to understand methods, classes, and strings - + Args: pe: dnfile object for a .NET PE op: dncil operand from an instruction @@ -272,19 +288,27 @@ def format_operand(pe, op): elif isinstance(op, list): return f"[{', '.join(['({:04X})'.format(x) for x in op])}]" elif isinstance(op, dnfile.mdtable.MemberRefRow) and not isinstance(op.Class.row, dnfile.mdtable.TypeSpecRow): - return f"{str(op.Class.row.TypeNamespace)}.{op.Class.row.TypeName}::{op.Name}" + retstr = getattr(op.Class.row, "TypeNamespace", "") + if retstr != "": + retstr += "." 
+ retstr += getattr(op.Class.row, "TypeName", "") + if retstr != "": + retstr += "::" + retstr += op.Name + return retstr elif isinstance(op, (dnfile.mdtable.FieldRow, dnfile.mdtable.MethodDefRow, dnfile.mdtable.MemberRefRow)): return f"{op.Name}" elif isinstance(op, (dnfile.mdtable.TypeDefRow, dnfile.mdtable.TypeRefRow)): - return f"{op.TypeNamespace}.{op.TypeName}" + return f"{op.TypeNamespace}.{op.TypeName}" elif isinstance(op, (dnfile.mdtable.TypeSpecRow, dnfile.mdtable.MethodSpecRow)): return f"{str(op.struct)}" else: return "" if op is None else str(op) + def get_sig_and_mask_for_dotnet_func(dnpe, body): """Return the comment, sig, and bytes of a .NET Method - + Iterate a method body to get IL bytes and mask the operand values to create a more flexible signature @@ -302,45 +326,47 @@ def get_sig_and_mask_for_dotnet_func(dnpe, body): func_bytes = "" for insn in body.instructions: comment += ( - "{:04X}".format(insn.offset) - + " " - + f"{' '.join('{:02x}'.format(b) for b in insn.get_bytes()) : <20}" - + f"{str(insn.opcode) : <15}" - + format_operand(dnpe, insn.operand) - + "\n" - ) + "{:04X}".format(insn.offset) + + " " + + f"{' '.join('{:02x}'.format(b) for b in insn.get_bytes()) : <20}" + + f"{str(insn.opcode) : <15}" + + format_operand(dnpe, insn.operand) + + "\n" + ) sig += insn.get_opcode_bytes().hex() func_bytes += insn.get_opcode_bytes().hex() if insn.operand: - sig += '??' * len(insn.get_operand_bytes()) + sig += "??" * len(insn.get_operand_bytes()) func_bytes += insn.get_operand_bytes().hex() # Format the sig to be in the same style as the vivi portion (bytes seperated by spaces) formatted_sig = "" for idx, val in enumerate(sig): - if idx > 0 and idx % 2 == 0: + if idx > 0 and idx % 2 == 0: formatted_sig += " " formatted_sig += val - - + return comment, formatted_sig, func_bytes -######## CodeFeature Extractor Related Classes and Functions ######## -class CodeFeature(): - """Basic object that that will be used to create yara rules - """ - def __init__(self, sig: str, comment: str, bytez: bytes, filemd5:str): +# CodeFeature Extractor Related Classes and Functions + + +class CodeFeature: + """Basic object that that will be used to create yara rules""" + + def __init__(self, sig: str, comment: str, bytez: bytes, filemd5: str): self.sig = sig self.comment = comment self.bytez = bytez self.filemd5 = filemd5 + def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor): - """Returns a dictionary mapping a filemd5 to a list of CodeFeatures - + """Returns a dictionary mapping a filemd5 to a list of CodeFeatures + This function operates on x86/x64 PE files and creates CodeFeatures based on basic block and function CAPA matches @@ -353,25 +379,24 @@ def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor): # Grab the vivisect workspace object try: file_vw = extractor.vw - except: + except AttributeError: print("No extractor workspace") file_vw = None raise - # Get the filemd5 + # Get the filemd5 filemd5 = doc.meta.sample.md5 - cb_matches = collections.defaultdict(set) func_matches = collections.defaultdict(set) for rule in rutils.capability_rules(doc): if rule.meta.scope == capa.rules.FUNCTION_SCOPE: - for addr, _ in rule.matches: - func_matches[addr.value].add(rule.meta.name) + for addr_object, _ in rule.matches: + func_matches[addr_object.value].add(rule.meta.name) elif rule.meta.scope == capa.rules.BASIC_BLOCK_SCOPE: - for addr, _ in rule.matches: - cb_matches[addr.value].add(rule.meta.name) + for addr_object, _ in rule.matches: + 
cb_matches[addr_object.value].add(rule.meta.name) else: # file scope pass @@ -386,7 +411,7 @@ def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor): bytez = get_cb_bytes(file_vw, addr) sig = genSigAndMask(addr, bytez, doc.meta.analysis.arch) - code_features.append(CodeFeature(sig,comment,bytez,filemd5)) + code_features.append(CodeFeature(sig, comment, bytez, filemd5)) for addr, rules in func_matches.items(): comment = f"function at 0x{addr:08x}@{filemd5} with {len(rules)} features:\n" @@ -396,16 +421,16 @@ def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor): bytez = get_function_bytes(file_vw, addr) sig = genSigAndMask(addr, bytez, doc.meta.analysis.arch) - code_features.append(CodeFeature(sig,comment,bytez,filemd5)) - + code_features.append(CodeFeature(sig, comment, bytez, filemd5)) if len(code_features) == 0: logger.warning("No code features found for %s", filemd5) return {filemd5: code_features} + def get_code_features_for_dotnet_doc(doc: rd.ResultDocument, extractor): - """Returns a dictionary mapping a filemd5 to a list of CodeFeatures - + """Returns a dictionary mapping a filemd5 to a list of CodeFeatures + This function operates on .NET PE files and creates CodeFeatures based on .NET method CAPA matches @@ -418,7 +443,7 @@ def get_code_features_for_dotnet_doc(doc: rd.ResultDocument, extractor): # Grab the vivisect workspace object try: dnpe = extractor.pe - except: + except AttributeError: print("No dnpe file found") raise @@ -428,20 +453,20 @@ def get_code_features_for_dotnet_doc(doc: rd.ResultDocument, extractor): for rule in rutils.capability_rules(doc): if rule.meta.scope == capa.rules.FUNCTION_SCOPE: - for addr, _ in rule.matches: - func_matches[addr.value].add(rule.meta.name) + for addr_object, _ in rule.matches: + func_matches[addr_object.value].add(rule.meta.name) else: # file scope pass - # Funcs is the cache of functions we need to reference to get + # Funcs is the cache of functions we need to reference to get # the underlying dnfile object funcs = list(extractor.get_functions()) # Return list of CodeFeature objects code_features = [] - logger.debug(f"Building CodeFeatures for {len(func_matches.keys())} functions in {filemd5}") + logger.debug("Building CodeFeatures for %s functions in %s", len(func_matches.keys()), filemd5) for addr, rules in func_matches.items(): func_name = extractor.token_cache.get_method(addr) comment = f"function {func_name} 0x{addr:08x}@{filemd5} with {len(rules)} features:\n" @@ -454,23 +479,24 @@ def get_code_features_for_dotnet_doc(doc: rd.ResultDocument, extractor): func_comment, sig, bytez = get_sig_and_mask_for_dotnet_func(dnpe, f.inner) comment += func_comment - code_features.append(CodeFeature(sig,comment,bytez,filemd5)) - + code_features.append(CodeFeature(sig, comment, bytez, filemd5)) if len(code_features) == 0: logger.warning("No code features found for %s", filemd5) return {filemd5: code_features} -######## CAPA Entrypoints ######## + +# CAPA Entrypoints + def run_capa_and_get_features(args): """Main CAPA analysis entrypoint - - This function kicks off CAPA analysis and builds CodeFeatures that + + This function kicks off CAPA analysis and builds CodeFeatures that will be used to build yara rules in the main thread. 
Args: - args: Tuple containing the following + args: Tuple containing the following - rules: CAPA rules loaded from a repo - sig_paths: Path to signatures used for library identification - format: Format for processing (dotnet or auto are the expected values) @@ -516,7 +542,7 @@ def run_capa_and_get_features(args): } meta = capa.main.collect_metadata([], path, format, os_, [], extractor) - logger.info(f"Collecting capabilities for {path}") + logger.info("Collecting capabilities for %s", path) capabilities, counts = capa.main.find_capabilities(rules, extractor, disable_progress=True) meta.analysis.feature_counts = counts["feature_counts"] @@ -526,15 +552,15 @@ def run_capa_and_get_features(args): if capa.main.has_file_limitation(rules, capabilities): # bail if capa encountered file limitation e.g. a packed binary # do show the output in verbose mode, though. - return { - "path": path, - "status": "error", - "error": f"Encountered file limitation", - } + return { + "path": path, + "status": "error", + "error": "Encountered file limitation", + } try: doc = rd.ResultDocument.from_capa(meta, rules, capabilities) - logger.info(f"Building code features for {path}") + logger.info("Building code features for %s", path) if type(extractor) == DnfileFeatureExtractor: # Handle .NET files features = get_code_features_for_dotnet_doc(doc, extractor) @@ -552,13 +578,13 @@ def run_capa_and_get_features(args): def multi_process_capa(argv=None): """CAPA argument handler and multiprocessing manager - + This function processes CLI arguments and kicks of capa analysis and extacts CodeFeatures into a dictionary that maps filemd5s to a list of CodeFeatures that will be used to build yara rules Args: - argv: + argv: Returns: dict: dictionary mapping filemd5's processed to a list of CodeFeatures """ @@ -568,9 +594,7 @@ def multi_process_capa(argv=None): parser = argparse.ArgumentParser(description="Build YARA rules for CAPA matches") capa.main.install_common_args(parser, wanted={"rules", "signatures", "format", "os"}) parser.add_argument("input", type=str, nargs="+", help="Path to directory or files to analyze") - parser.add_argument( - "-n", "--parallelism", type=int, default=multiprocessing.cpu_count(), help="parallelism factor" - ) + parser.add_argument("-n", "--parallelism", type=int, default=multiprocessing.cpu_count(), help="parallelism factor") parser.add_argument("--no-mp", action="store_true", help="disable subprocesses") args = parser.parse_args(args=argv) capa.main.handle_common_args(args) @@ -594,11 +618,11 @@ def multi_process_capa(argv=None): if not path.exists(): raise ValueError(f"Invalid path {p}") if path.is_dir(): - samples.extend([x for x in path.rglob("*")]) + for subpath in path.rglob("*"): + samples.append(subpath) elif path.is_file(): samples.append(path) logger.info("Starting to process %s files", len(samples)) - cpu_count = multiprocessing.cpu_count() @@ -633,23 +657,25 @@ def map(f, args, parallelism=None): parallelism=args.parallelism, ): if result["status"] == "error": - logger.warning(f'{result["path"]}: {result["error"]}') + logger.warning("%s: %s", result["path"], result["error"]) elif result["status"] == "ok": results.update(result["ok"]) else: - raise ValueError(f"unexpected status: {result['status']}") + raise ValueError("unexpected status: %s", result["status"]) - logger.info(f"Done processing {len(samples)} samples") + logger.info("Done processing %s samples", len(samples)) return results -######## YARA related functions ######## -CODE_FEATURES_REFERENCED = [] +# YARA related 
functions + +CODE_FEATURES_REFERENCED: List[CodeFeature] = [] + def build_rule_from_combo(combo_dict: dict, **kwargs): """Build a yaramod yara rule using a combination dictionary - + Args: combo_dict: Dictionary of features that all matched on a group of files Returns: @@ -659,15 +685,14 @@ def build_rule_from_combo(combo_dict: dict, **kwargs): # we're going to use this to create unique code features to insert the comment strings global CODE_FEATURES_REFERENCED - # Build metadata for the rule rule_name = "super_rule_" + "_".join([x[:5] for x in sorted(combo_dict["files"])]) - metadict = dict( - author=kwargs.get("author", "CAPA Matches"), - date_created=kwargs.get("date_created", date.today().isoformat()), - date_modified=kwargs.get("date_modified", date.today().isoformat()), - description=kwargs.get("description", ""), - ) + metadict = { + "author": kwargs.get("author", "CAPA Matches"), + "date_created": kwargs.get("date_created", date.today().isoformat()), + "date_modified": kwargs.get("date_modified", date.today().isoformat()), + "description": kwargs.get("description", ""), + } rule = yaramod.YaraRuleBuilder().with_name(rule_name) for metakey, metavalue in metadict.items(): @@ -680,30 +705,29 @@ def build_rule_from_combo(combo_dict: dict, **kwargs): rule = rule.with_string_meta("md5", hsh) conditions = [yaramod.of(yaramod.all(), yaramod.them())] - for codefeature in combo_dict['features']: + for codefeature in combo_dict["features"]: idx = len(CODE_FEATURES_REFERENCED) hexstr = yaramod.YaraHexStringBuilder() for byte in codefeature.sig.split(" "): if byte == "??": hexstr = hexstr.add(yaramod.wildcard()) - elif byte == '': + elif byte == "": continue else: hexstr = hexstr.add(yaramod.YaraHexStringBuilder(int(byte, 16))) rule = rule.with_hex_string(f"$c{idx}", hexstr.get()) CODE_FEATURES_REFERENCED.append(codefeature) - if len(conditions) == 1: # No fancy expression needed rule = rule.with_condition(conditions[0].get()) else: - rule = rule.with_condition( - yaramod.conjunction(conditions, linebreaks=True).get() - ) + rule = rule.with_condition(yaramod.conjunction(conditions, linebreaks=True).get()) return rule.get() -TAB_CHAR = " "*4 + +TAB_CHAR = " " * 4 + def replace_tabs_with_spaces(yara_text): """Replacing tabs with spaces in yara rule @@ -715,6 +739,7 @@ def replace_tabs_with_spaces(yara_text): """ return yara_text.replace("\t", TAB_CHAR) + def add_comments_to_yara_file(yara_text): """Add comments to yara file text @@ -729,11 +754,23 @@ def add_comments_to_yara_file(yara_text): # replace it with the comment search_str = f"$c{idx} =" comment_str = "/*\n" - comment_str += ("\n"+2*TAB_CHAR).join(feature.comment.split("\n")) - comment_str += "*/\n" + 2*TAB_CHAR + search_str + comment_str += ("\n" + 2 * TAB_CHAR).join(feature.comment.split("\n")) + comment_str += "*/\n" + 2 * TAB_CHAR + search_str yara_text = yara_text.replace(search_str, comment_str) return yara_text + +class SimilarityDictEntry: + """Simple object to hold information about a feature in a similarity dictionary""" + + values: List[CodeFeature] + files: Set[str] + + def __init__(self) -> None: + self.values = [] + self.files = set() + + def build_yara_ruleset(files_dict, **kwargs): """Build a YARA ruleset string based on CodeFeatures @@ -749,35 +786,30 @@ def build_yara_ruleset(files_dict, **kwargs): for filemd5, features in files_dict.items(): for value in features: if value.sig not in similarity_dict: - similarity_dict[value.sig] = { - "values":[value], - "files":set([value.filemd5]) - } - else: - 
similarity_dict[value.sig]['values'].append(value) - similarity_dict[value.sig]['files'].add(value.filemd5) + similarity_dict[value.sig] = SimilarityDictEntry() + similarity_dict[value.sig].values.append(value) + similarity_dict[value.sig].files.add(filemd5) # Next we build out a combodict and track which files have which combos of features - file_combinations = {} + file_combinations: Dict[str, dict] = {} for feature, result_dict in similarity_dict.items(): - sample_combo_key = ":".join(list(sorted(result_dict["files"]))) + logger.debug("Processing feature: %s", feature) + sample_combo_key = ":".join(sorted(result_dict.files)) + # logger.debug("Combo Key: %s", sample_combo_key) if sample_combo_key not in file_combinations: - file_combinations[sample_combo_key] = dict() - file_combinations[sample_combo_key]["files"] = sorted( - result_dict["files"] - ) + file_combinations[sample_combo_key] = {} + file_combinations[sample_combo_key]["files"] = sorted(result_dict.files) file_combinations[sample_combo_key]["feature_count"] = 0 file_combinations[sample_combo_key]["features"] = [] - # Use the full code feature from the alphabetical match - chosen_code_version = sorted(result_dict['values'], key=lambda x: x.filemd5)[0] - file_combinations[sample_combo_key]["features"].append( - chosen_code_version - ) + chosen_code_version = sorted(result_dict.values, key=lambda x: x.filemd5)[0] + file_combinations[sample_combo_key]["features"].append(chosen_code_version) + logger.warning("Adding %s to %s", chosen_code_version.sig, sample_combo_key) file_combinations[sample_combo_key]["feature_count"] += 1 # Create a list of combo keys and sort them so we get deterministic output combo_keys = sorted(file_combinations.keys(), key=lambda x: (len(x), x)) + logger.warning("Combo Keys: %s", combo_keys) # Build the YARA rule set based on the grouping yara_file = yaramod.YaraFileBuilder() @@ -785,9 +817,7 @@ def build_yara_ruleset(files_dict, **kwargs): for key in combo_keys: combo_dict = file_combinations[key] - rule = build_rule_from_combo( - combo_dict, **kwargs - ) + rule = build_rule_from_combo(combo_dict, **kwargs) if rule is not None: observed_files.extend(combo_dict["files"]) yara_file = yara_file.with_rule(rule) @@ -803,10 +833,10 @@ def build_yara_ruleset(files_dict, **kwargs): return yara_text - def main(argv=None): all_features = multi_process_capa(argv) print(build_yara_ruleset(all_features)) + if __name__ == "__main__": sys.exit(main()) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 91a1ae340..6b4e392d4 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -10,8 +10,8 @@ import logging import textwrap import subprocess -from datetime import date from pathlib import Path +from datetime import date import pytest @@ -27,8 +27,10 @@ def get_script_path(s: str): def get_file_path(): return str(CD / "data" / "9324d1a8ae37a36ae560c37448c9705a.exe_") + def get_data_path(p: str): - return str(CD / "data" / p ) + return str(CD / "data" / p) + def get_rules_path(): return str(CD / ".." 
/ "rules") @@ -71,87 +73,80 @@ def test_bulk_process(tmp_path): p = run_program(get_script_path("bulk-process.py"), [str(t.parent)]) assert p.returncode == 0 + @pytest.mark.parametrize( "script,args,expected_output_path", [ # Test match-2-yar x86 EXE pytest.param( - "match-2-yar.py", - [ - get_data_path("9324d1a8ae37a36ae560c37448c9705a.exe_") - ], - "yara/expected_9324d1a8ae37a36ae560c37448c9705a.exe_.yar" + "match-2-yar.py", + [get_data_path("9324d1a8ae37a36ae560c37448c9705a.exe_")], + "yara/expected_9324d1a8ae37a36ae560c37448c9705a.exe_.yar", ), # Test match-2-yar x64 EXE pytest.param( - "match-2-yar.py", - [ - get_data_path("c2bb17c12975ea61ff43a71afd9c3ff111d018af161859abae0bdb0b3dae98f9.exe_") - ], - "yara/expected_c2bb17c12975e.yar" + "match-2-yar.py", + [get_data_path("c2bb17c12975ea61ff43a71afd9c3ff111d018af161859abae0bdb0b3dae98f9.exe_")], + "yara/expected_c2bb17c12975e.yar", ), # Test match-2-yar x86 .NET EXE pytest.param( - "match-2-yar.py", + "match-2-yar.py", [ - "-f", - "dotnet", + "-f", + "dotnet", get_data_path("dotnet/1c444ebeba24dcba8628b7dfe5fec7c6.exe_"), - - ], - "yara/expected_1c444ebeba24dcba8628b7dfe5fec7c6.exe_.yar" + ], + "yara/expected_1c444ebeba24dcba8628b7dfe5fec7c6.exe_.yar", ), # Test match-2-yar files with multiple X86 PEs pytest.param( - "match-2-yar.py", - [ + "match-2-yar.py", + [ get_data_path("Practical Malware Analysis Lab 03-04.exe_"), get_data_path("Practical Malware Analysis Lab 11-03.exe_"), - get_data_path("Practical Malware Analysis Lab 16-01.exe_") - ], - "yara/expected_pma_03-04.exe_11-03.exe_16-01.exe" + get_data_path("Practical Malware Analysis Lab 16-01.exe_"), + ], + "yara/expected_pma_03-04.exe_11-03.exe_16-01.exe", ), # Test match-2-yar files with CAPA file limitations are filtered out of multi sample pytest.param( - "match-2-yar.py", - [ + "match-2-yar.py", + [ get_data_path("Practical Malware Analysis Lab 01-01.exe_"), - get_data_path("Practical Malware Analysis Lab 01-02.exe_") - ], - "yara/expected_pma_01-01.exe_01-02.exe" + get_data_path("Practical Malware Analysis Lab 01-02.exe_"), + ], + "yara/expected_pma_01-01.exe_01-02.exe", ), - # Test match-2-yar multiple x86 .NET PE pytest.param( - "match-2-yar.py", + "match-2-yar.py", [ - "-f", - "dotnet", + "-f", + "dotnet", get_data_path("dotnet/1c444ebeba24dcba8628b7dfe5fec7c6.exe_"), get_data_path("dotnet/692f7fd6d198e804d6af98eb9e390d61.exe_"), - - ], - "yara/expected_1c444ebe_692f7fd6.yar" + ], + "yara/expected_1c444ebe_692f7fd6.yar", ), ], ) def test_script_expected_output(script, args, expected_output_path): script_path = get_script_path(script) - with open(get_data_path(expected_output_path), 'rb') as f: - expected_output = f.read() - + + expected_output = Path(get_data_path(expected_output_path)).read_bytes() # Update dates in expected output to be todays date dates_to_replace = [ b"2023-08-10", ] for dt in dates_to_replace: - expected_output = expected_output.replace(dt, date.today().isoformat().encode('utf8')) + expected_output = expected_output.replace(dt, date.today().isoformat().encode("utf8")) p = run_program(script_path, args) assert p.returncode == 0 - assert p.stdout.decode('utf8') == expected_output.decode('utf8') - + assert p.stdout.decode("utf8") == expected_output.decode("utf8") + def run_program(script_path, args): args = [sys.executable] + [script_path] + args @@ -283,3 +278,9 @@ def test_detect_duplicate_features(tmpdir): args = [rule_dir.strpath, rule_path] overlaps_found = run_program(script_path, args) assert overlaps_found.returncode == expected_overlaps + + +# 
Rough outline for function extract bytes, function length, function masking +# Use importlib to import the script +# Use fixtures vivisect to get a vivisect workspace for a given path +# We can use known functions from the yara matches to extract out length, bytes, and masked sig From f04359c6a6ce87f2d2a5d708ec43c825b7cde7eb Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Tue, 22 Aug 2023 17:24:58 -0400 Subject: [PATCH 06/21] Add test to validate match-2-yar feature extraction --- scripts/match-2-yar.py | 48 ++++++++++++++++++----- tests/test_scripts.py | 88 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 9 deletions(-) diff --git a/scripts/match-2-yar.py b/scripts/match-2-yar.py index 08a095e8f..17d02f646 100644 --- a/scripts/match-2-yar.py +++ b/scripts/match-2-yar.py @@ -30,12 +30,14 @@ """ import os import sys +import json import logging import argparse +import binascii import collections import multiprocessing import multiprocessing.pool -from typing import Set, Dict, List +from typing import Set, Dict, List, Union from pathlib import Path from datetime import date @@ -323,7 +325,7 @@ def get_sig_and_mask_for_dotnet_func(dnpe, body): comment = "" sig = "" - func_bytes = "" + func_bytes = b"" for insn in body.instructions: comment += ( "{:04X}".format(insn.offset) @@ -335,11 +337,11 @@ def get_sig_and_mask_for_dotnet_func(dnpe, body): ) sig += insn.get_opcode_bytes().hex() - func_bytes += insn.get_opcode_bytes().hex() + func_bytes += insn.get_opcode_bytes() if insn.operand: sig += "??" * len(insn.get_operand_bytes()) - func_bytes += insn.get_operand_bytes().hex() + func_bytes += insn.get_operand_bytes() # Format the sig to be in the same style as the vivi portion (bytes seperated by spaces) formatted_sig = "" @@ -357,11 +359,25 @@ def get_sig_and_mask_for_dotnet_func(dnpe, body): class CodeFeature: """Basic object that that will be used to create yara rules""" - def __init__(self, sig: str, comment: str, bytez: bytes, filemd5: str): - self.sig = sig + def __init__( + self, sig: str, comment: str, bytez: bytes, filemd5: str, addr: Union[int, tuple[int, int], None], scope: str + ): + self.sig = sig.strip().upper() self.comment = comment self.bytez = bytez + self.addr = addr self.filemd5 = filemd5 + self.scope = scope + + def json(self): + return { + "sig": self.sig, + "comment": self.comment, + "bytez": binascii.hexlify(self.bytez, " ", bytes_per_sep=1).decode("utf8").upper(), + "addr": self.addr, + "filemd5": self.filemd5, + "scope": self.scope, + } def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor): @@ -411,7 +427,7 @@ def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor): bytez = get_cb_bytes(file_vw, addr) sig = genSigAndMask(addr, bytez, doc.meta.analysis.arch) - code_features.append(CodeFeature(sig, comment, bytez, filemd5)) + code_features.append(CodeFeature(sig, comment, bytez, filemd5, addr, capa.rules.BASIC_BLOCK_SCOPE)) for addr, rules in func_matches.items(): comment = f"function at 0x{addr:08x}@{filemd5} with {len(rules)} features:\n" @@ -421,7 +437,7 @@ def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor): bytez = get_function_bytes(file_vw, addr) sig = genSigAndMask(addr, bytez, doc.meta.analysis.arch) - code_features.append(CodeFeature(sig, comment, bytez, filemd5)) + code_features.append(CodeFeature(sig, comment, bytez, filemd5, addr, capa.rules.FUNCTION_SCOPE)) if len(code_features) == 0: logger.warning("No code features found for %s", filemd5) @@ -479,7 +495,7 @@ def 
get_code_features_for_dotnet_doc(doc: rd.ResultDocument, extractor): func_comment, sig, bytez = get_sig_and_mask_for_dotnet_func(dnpe, f.inner) comment += func_comment - code_features.append(CodeFeature(sig, comment, bytez, filemd5)) + code_features.append(CodeFeature(sig, comment, bytez, filemd5, addr, capa.rules.FUNCTION_SCOPE)) if len(code_features) == 0: logger.warning("No code features found for %s", filemd5) @@ -596,6 +612,7 @@ def multi_process_capa(argv=None): parser.add_argument("input", type=str, nargs="+", help="Path to directory or files to analyze") parser.add_argument("-n", "--parallelism", type=int, default=multiprocessing.cpu_count(), help="parallelism factor") parser.add_argument("--no-mp", action="store_true", help="disable subprocesses") + parser.add_argument("--dump-features", action="store_true", help="output feature dictionary as json") args = parser.parse_args(args=argv) capa.main.handle_common_args(args) @@ -665,9 +682,22 @@ def map(f, args, parallelism=None): logger.info("Done processing %s samples", len(samples)) + if args.dump_features: + dump_file_features(results) + sys.exit(0) + return results +# Output related functions + + +def dump_file_features(result_dict: dict): + """Print out bytes for the code features extracted""" + output_dict = {filemd5: [x.json() for x in features] for filemd5, features in result_dict.items()} + print(json.dumps(output_dict, indent=4)) + + # YARA related functions CODE_FEATURES_REFERENCED: List[CodeFeature] = [] diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 6b4e392d4..ccbba7f41 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -7,14 +7,18 @@ # See the License for the specific language governing permissions and limitations under the License. import sys +import json import logging import textwrap import subprocess from pathlib import Path from datetime import date +from functools import lru_cache import pytest +import capa.rules + logger = logging.getLogger(__name__) CD = Path(__file__).resolve().parent @@ -154,6 +158,16 @@ def run_program(script_path, args): return subprocess.run(args, stdout=subprocess.PIPE) +@lru_cache(maxsize=1) +def get_match_2_yar_features(path, is_dotnet): + script_path = get_script_path("match-2-yar.py") + args = ["--dump-features", path] + if is_dotnet: + args.extend(["-f", "dotnet"]) + p = run_program(script_path, args) + return p.stdout + + def test_proto_conversion(tmp_path): t = tmp_path / "proto-test" t.mkdir() @@ -284,3 +298,77 @@ def test_detect_duplicate_features(tmpdir): # Use importlib to import the script # Use fixtures vivisect to get a vivisect workspace for a given path # We can use known functions from the yara matches to extract out length, bytes, and masked sig +@pytest.mark.parametrize( + "path,is_dotnet,filemd5,addr,scope,expected_bytestring,expected_sig", + [ + # Test match-2-yar x86 EXE - Basic Block Extraction + pytest.param( + get_data_path("9324d1a8ae37a36ae560c37448c9705a.exe_"), + False, + "9324d1a8ae37a36ae560c37448c9705a", + 0x004031A0, + capa.rules.BASIC_BLOCK_SCOPE, + "83 EC 10 B0 6C 8B 15 24 A0 40 00 88 44 24 01 88 44 24 02 B0 6F 8D 4C 24 00 88 44 24 04 88 44 24 0B 8B 44 24 14 C6 44 24 00 44 50 51 52 6A 00 C6 44 24 13 53 C6 44 24 15 72 C6 44 24 16 74 C6 44 24 17 57 C6 44 24 18 69 C6 44 24 19 6E C6 44 24 1A 64 C6 44 24 1C 77 C6 44 24 1D 00 E8 EF F7 FF FF A3 C4 A9 40 00 33 C0 83 C4 20 C2 04 00", + "83 EC 10 B0 6C 8B 15 ?? ?? ?? ?? 88 44 24 ?? 88 44 24 ?? B0 6F 8D 4C 24 ?? 88 44 24 ?? 88 44 24 ?? 8B 44 24 ?? C6 44 24 ?? 
44 50 51 52 6A 00 C6 44 24 ?? 53 C6 44 24 ?? 72 C6 44 24 ?? 74 C6 44 24 ?? 57 C6 44 24 ?? 69 C6 44 24 ?? 6E C6 44 24 ?? 64 C6 44 24 ?? 77 C6 44 24 ?? 00 E8 ?? ?? ?? ?? A3 ?? ?? ?? ?? 33 C0 83 C4 20 C2 04 00", + ), + # Test match-2-yar x86 EXE - Function Extraction + pytest.param( + get_data_path("9324d1a8ae37a36ae560c37448c9705a.exe_"), + False, + "9324d1a8ae37a36ae560c37448c9705a", + 0x004019C0, + capa.rules.FUNCTION_SCOPE, + "81 EC 7C 04 00 00 53 55 8B 2D 14 92 40 00 56 8B F1 57 6A 00 8D 44 24 14 8B 8E A8 00 00 00 6A 04 B3 02 50 51 C7 44 24 28 03 00 00 00 C7 44 24 2C 00 00 00 00 C6 44 24 20 05 88 5C 24 21 C6 44 24 22 00 88 5C 24 23 FF D5 B9 96 00 00 00 33 C0 8D BC 24 34 02 00 00 8B 96 A8 00 00 00 F3 AB 8D 44 24 18 8D 4C 24 2C 50 6A 00 6A 00 51 6A 00 89 54 24 44 C7 44 24 40 01 00 00 00 FF 15 10 92 40 00 85 C0 7F 0C 8B 96 A8 00 00 00 52 E9 5D 02 00 00 8B 8E A8 00 00 00 6A 00 8D 84 24 38 02 00 00 68 58 02 00 00 50 51 FF 15 0C 92 40 00 80 BC 24 34 02 00 00 05 0F 85 2C 02 00 00 8A 84 24 35 02 00 00 84 C0 74 0A 3A C3 0F 85 19 02 00 00 EB 08 3A C3 0F 85 30 01 00 00 8B 0D D4 AA 40 00 68 A0 A5 40 00 E8 49 2C 00 00 85 C0 0F 86 18 01 00 00 8B 0D D4 AA 40 00 68 A0 A5 40 00 E8 31 2C 00 00 8B 0D D4 AA 40 00 68 A0 A6 40 00 8B D8 E8 1F 2C 00 00 89 44 24 14 B9 40 00 00 00 33 C0 8D BC 24 30 01 00 00 F3 AB 66 AB 8B 3D 94 90 40 00 8D 94 24 32 01 00 00 68 A0 A5 40 00 52 C6 84 24 38 01 00 00 05 88 9C 24 39 01 00 00 FF D7 8D 44 24 14 6A 04 8D 8C 1C 36 01 00 00 50 51 8B 0D D4 AA 40 00 E8 3B 2A 00 00 8D 94 1C 33 01 00 00 68 A0 A6 40 00 52 FF D7 8B 44 24 14 6A 00 8D 94 24 34 01 00 00 8D 4C 18 03 8B 86 A8 00 00 00 51 52 50 FF D5 8D 54 24 18 33 C0 B9 96 00 00 00 8D BC 24 34 02 00 00 52 50 F3 AB 8B 8E A8 00 00 00 50 8D 44 24 38 89 4C 24 3C 50 6A 00 C7 44 24 40 01 00 00 00 FF 15 10 92 40 00 85 C0 0F 8E 18 01 00 00 8B 86 A8 00 00 00 6A 00 8D 94 24 38 02 00 00 68 58 02 00 00 52 50 FF 15 0C 92 40 00 80 BC 24 34 02 00 00 05 0F 85 EE 00 00 00 8A 84 24 35 02 00 00 84 C0 0F 85 DF 00 00 00 8B 94 24 90 04 00 00 52 FF 15 FC 91 40 00 85 C0 0F 84 D6 00 00 00 C6 44 24 20 05 C6 44 24 21 01 C6 44 24 22 00 C6 44 24 23 01 8B 40 0C 8B 08 8B 84 24 94 04 00 00 50 8B 11 89 54 24 28 FF 15 08 92 40 00 8B 96 A8 00 00 00 6A 00 8D 4C 24 24 6A 0A 51 52 66 89 44 24 38 FF D5 B9 96 00 00 00 33 C0 8D BC 24 34 02 00 00 8D 54 24 2C F3 AB 8B 86 A8 00 00 00 8D 4C 24 18 51 6A 00 6A 00 52 6A 00 89 44 24 44 C7 44 24 40 01 00 00 00 FF 15 10 92 40 00 85 C0 7F 09 8B 86 A8 00 00 00 50 EB 47 8B 96 A8 00 00 00 6A 00 8D 8C 24 38 02 00 00 68 58 02 00 00 51 52 FF 15 0C 92 40 00 80 BC 24 34 02 00 00 05 75 D1 8A 84 24 35 02 00 00 84 C0 75 C6 5F 5E 5D B0 01 5B 81 C4 7C 04 00 00 C2 08 00 8B 8E A8 00 00 00 51 FF 15 04 92 40 00 5F 5E 5D 32 C0 5B 81 C4 7C 04 00 00 C2 08 00", + "81 EC 7C 04 00 00 53 55 8B 2D ?? ?? ?? ?? 56 8B F1 57 6A 00 8D 44 24 ?? 8B 8E ?? ?? ?? ?? 6A 04 B3 02 50 51 C7 44 24 ?? 03 00 00 00 C7 44 24 ?? 00 00 00 00 C6 44 24 ?? 05 88 5C 24 ?? C6 44 24 ?? 00 88 5C 24 ?? FF D5 B9 96 00 00 00 33 C0 8D BC 24 ?? ?? ?? ?? 8B 96 ?? ?? ?? ?? F3 AB 8D 44 24 ?? 8D 4C 24 ?? 50 6A 00 6A 00 51 6A 00 89 54 24 ?? C7 44 24 ?? 01 00 00 00 FF 15 ?? ?? ?? ?? 85 C0 7F ?? 8B 96 ?? ?? ?? ?? 52 E9 ?? ?? ?? ?? 8B 8E ?? ?? ?? ?? 6A 00 8D 84 24 ?? ?? ?? ?? 68 58 02 00 00 50 51 FF 15 ?? ?? ?? ?? 80 BC 24 ?? ?? ?? ?? 05 0F 85 ?? ?? ?? ?? 8A 84 24 ?? ?? ?? ?? 84 C0 74 ?? 3A C3 0F 85 ?? ?? ?? ?? EB ?? 3A C3 0F 85 ?? ?? ?? ?? 8B 0D ?? ?? ?? ?? 68 A0 A5 40 00 E8 ?? ?? ?? ?? 85 C0 0F 86 ?? ?? ?? ?? 8B 0D ?? ?? ?? ?? 68 A0 A5 40 00 E8 ?? ?? ?? ?? 8B 0D ?? ?? ?? ?? 
68 A0 A6 40 00 8B D8 E8 ?? ?? ?? ?? 89 44 24 ?? B9 40 00 00 00 33 C0 8D BC 24 ?? ?? ?? ?? F3 AB 66 AB 8B 3D ?? ?? ?? ?? 8D 94 24 ?? ?? ?? ?? 68 A0 A5 40 00 52 C6 84 24 ?? ?? ?? ?? 05 88 9C 24 ?? ?? ?? ?? FF D7 8D 44 24 ?? 6A 04 8D 8C 1C ?? ?? ?? ?? 50 51 8B 0D ?? ?? ?? ?? E8 ?? ?? ?? ?? 8D 94 1C ?? ?? ?? ?? 68 A0 A6 40 00 52 FF D7 8B 44 24 ?? 6A 00 8D 94 24 ?? ?? ?? ?? 8D 4C 18 ?? 8B 86 ?? ?? ?? ?? 51 52 50 FF D5 8D 54 24 ?? 33 C0 B9 96 00 00 00 8D BC 24 ?? ?? ?? ?? 52 50 F3 AB 8B 8E ?? ?? ?? ?? 50 8D 44 24 ?? 89 4C 24 ?? 50 6A 00 C7 44 24 ?? 01 00 00 00 FF 15 ?? ?? ?? ?? 85 C0 0F 8E ?? ?? ?? ?? 8B 86 ?? ?? ?? ?? 6A 00 8D 94 24 ?? ?? ?? ?? 68 58 02 00 00 52 50 FF 15 ?? ?? ?? ?? 80 BC 24 ?? ?? ?? ?? 05 0F 85 ?? ?? ?? ?? 8A 84 24 ?? ?? ?? ?? 84 C0 0F 85 ?? ?? ?? ?? 8B 94 24 ?? ?? ?? ?? 52 FF 15 ?? ?? ?? ?? 85 C0 0F 84 ?? ?? ?? ?? C6 44 24 ?? 05 C6 44 24 ?? 01 C6 44 24 ?? 00 C6 44 24 ?? 01 8B 40 ?? 8B 08 8B 84 24 ?? ?? ?? ?? 50 8B 11 89 54 24 ?? FF 15 ?? ?? ?? ?? 8B 96 ?? ?? ?? ?? 6A 00 8D 4C 24 ?? 6A 0A 51 52 66 89 44 24 ?? FF D5 B9 96 00 00 00 33 C0 8D BC 24 ?? ?? ?? ?? 8D 54 24 ?? F3 AB 8B 86 ?? ?? ?? ?? 8D 4C 24 ?? 51 6A 00 6A 00 52 6A 00 89 44 24 ?? C7 44 24 ?? 01 00 00 00 FF 15 ?? ?? ?? ?? 85 C0 7F ?? 8B 86 ?? ?? ?? ?? 50 EB ?? 8B 96 ?? ?? ?? ?? 6A 00 8D 8C 24 ?? ?? ?? ?? 68 58 02 00 00 51 52 FF 15 ?? ?? ?? ?? 80 BC 24 ?? ?? ?? ?? 05 75 ?? 8A 84 24 ?? ?? ?? ?? 84 C0 75 ?? 5F 5E 5D B0 01 5B 81 C4 7C 04 00 00 C2 08 00 8B 8E ?? ?? ?? ?? 51 FF 15 ?? ?? ?? ?? 5F 5E 5D 32 C0 5B 81 C4 7C 04 00 00 C2 08 00", + ), + # Test match-2-yar x64 EXE - Basic Block Extraction + pytest.param( + get_data_path("c2bb17c12975ea61ff43a71afd9c3ff111d018af161859abae0bdb0b3dae98f9.exe_"), + False, + "50580ef0b882905316c4569162ea07d9", + 0x14000109F, + capa.rules.BASIC_BLOCK_SCOPE, + "33 C9 BA 1F 03 00 00 41 B8 00 10 00 00 44 8D 49 40 FF 15 4A 0F 00 00 41 B8 1F 03 00 00 48 8B D7 48 8B C8 48 8B D8 E8 65 0D 00 00 48 8D 0D 7F 11 00 00 C7 44 24 20 20 00 00 00 C7 44 24 24 01 00 00 00 48 C7 44 24 28 00 00 00 00 48 89 5C 24 30 48 C7 44 24 38 00 00 00 00 FF 15 0A 0F 00 00 4C 8D 44 24 20 48 8D 15 46 11 00 00 48 8D 0D 77 11 00 00 FF 15 F9 0E 00 00 33 C0 48 8B 4C 24 40 48 33 CC E8 2A 00 00 00 48 8B 5C 24 60 48 83 C4 50 5F C3", + "33 C9 BA 1F 03 00 00 41 B8 00 10 00 00 44 8D 49 ?? FF 15 ?? ?? ?? ?? 41 B8 1F 03 00 00 48 8B D7 48 8B C8 48 8B D8 E8 ?? ?? ?? ?? 48 8D 0D ?? ?? ?? ?? C7 44 24 ?? 20 00 00 00 C7 44 24 ?? 01 00 00 00 48 C7 44 24 ?? 00 00 00 00 48 89 5C 24 ?? 48 C7 44 24 ?? 00 00 00 00 FF 15 ?? ?? ?? ?? 4C 8D 44 24 ?? 48 8D 15 ?? ?? ?? ?? 48 8D 0D ?? ?? ?? ?? FF 15 ?? ?? ?? ?? 33 C0 48 8B 4C 24 ?? 48 33 CC E8 ?? ?? ?? ?? 48 8B 5C 24 ?? 
48 83 C4 50 5F C3", + ), + # Test match-2-yar x64 EXE - Function Extraction + pytest.param( + get_data_path("c2bb17c12975ea61ff43a71afd9c3ff111d018af161859abae0bdb0b3dae98f9.exe_"), + False, + "50580ef0b882905316c4569162ea07d9", + 0x140001010, + capa.rules.FUNCTION_SCOPE, + "48 89 5C 24 08 57 48 83 EC 50 48 8B 05 DF 1F 00 00 48 33 C4 48 89 44 24 40 66 0F 6F 15 8F 12 00 00 48 8D 3D 08 20 00 00 33 C9 B8 00 03 00 00 90 F3 0F 6F 04 39 66 0F EF C2 F3 0F 7F 04 39 F3 0F 6F 4C 39 10 66 0F EF CA F3 0F 7F 4C 39 10 F3 0F 6F 44 39 20 66 0F EF C2 F3 0F 7F 44 39 20 F3 0F 6F 44 39 30 66 0F EF C2 F3 0F 7F 44 39 30 48 83 C1 40 48 3B C8 7C B9 66 0F 1F 84 00 00 00 00 00 80 34 38 62 48 FF C0 48 3D 1F 03 00 00 7C F1 33 C9 BA 1F 03 00 00 41 B8 00 10 00 00 44 8D 49 40 FF 15 4A 0F 00 00 41 B8 1F 03 00 00 48 8B D7 48 8B C8 48 8B D8 E8 65 0D 00 00 48 8D 0D 7F 11 00 00 C7 44 24 20 20 00 00 00 C7 44 24 24 01 00 00 00 48 C7 44 24 28 00 00 00 00 48 89 5C 24 30 48 C7 44 24 38 00 00 00 00 FF 15 0A 0F 00 00 4C 8D 44 24 20 48 8D 15 46 11 00 00 48 8D 0D 77 11 00 00 FF 15 F9 0E 00 00 33 C0 48 8B 4C 24 40 48 33 CC E8 2A 00 00 00 48 8B 5C 24 60 48 83 C4 50 5F C3", + "48 89 5C 24 ?? 57 48 83 EC 50 48 8B 05 ?? ?? ?? ?? 48 33 C4 48 89 44 24 ?? 66 0F 6F 15 ?? ?? 00 00 48 8D 3D ?? ?? ?? ?? 33 C9 B8 00 03 00 00 90 F3 0F 6F 04 39 66 0F EF C2 F3 0F 7F 04 39 F3 0F 6F 4C 39 ?? 66 0F EF CA F3 0F 7F 4C 39 ?? F3 0F 6F 44 39 ?? 66 0F EF C2 F3 0F 7F 44 39 ?? F3 0F 6F 44 39 ?? 66 0F EF C2 F3 0F 7F 44 39 ?? 48 83 C1 40 48 3B C8 7C ?? 66 0F 1F 84 00 ?? ?? 00 00 80 34 38 62 48 FF C0 48 3D 1F 03 00 00 7C ?? 33 C9 BA 1F 03 00 00 41 B8 00 10 00 00 44 8D 49 ?? FF 15 ?? ?? ?? ?? 41 B8 1F 03 00 00 48 8B D7 48 8B C8 48 8B D8 E8 ?? ?? ?? ?? 48 8D 0D ?? ?? ?? ?? C7 44 24 ?? 20 00 00 00 C7 44 24 ?? 01 00 00 00 48 C7 44 24 ?? 00 00 00 00 48 89 5C 24 ?? 48 C7 44 24 ?? 00 00 00 00 FF 15 ?? ?? ?? ?? 4C 8D 44 24 ?? 48 8D 15 ?? ?? ?? ?? 48 8D 0D ?? ?? ?? ?? FF 15 ?? ?? ?? ?? 33 C0 48 8B 4C 24 ?? 48 33 CC E8 ?? ?? ?? ?? 48 8B 5C 24 ?? 48 83 C4 50 5F C3", + ), + # Test match-2-yar .NET EXE - Function Extraction + pytest.param( + get_data_path("dotnet/1c444ebeba24dcba8628b7dfe5fec7c6.exe_"), + True, + "1c444ebeba24dcba8628b7dfe5fec7c6", + 0x06000073, + capa.rules.FUNCTION_SCOPE, + "03 28 7D 00 00 06 0A 12 01 FE 15 0A 00 00 02 03 12 01 28 7F 00 00 06 26 12 01 7B 7B 00 00 04 12 01 7B 79 00 00 04 59 0C 12 01 7B 7C 00 00 04 12 01 7B 7A 00 00 04 59 0D 06 28 77 00 00 06 13 04 06 08 09 28 76 00 00 06 13 05 11 04 11 05 28 7A 00 00 06 13 06 11 04 16 16 08 09 06 16 16 20 20 00 CC 00 28 75 00 00 06 26 11 04 11 06 28 7A 00 00 06 26 11 04 28 78 00 00 06 26 03 06 28 7E 00 00 06 26 11 05 28 65 00 00 0A 13 07 11 05 28 79 00 00 06 26 11 07 2A", + "03 28 ?? ?? ?? ?? 0A 12 ?? FE 15 ?? ?? ?? ?? 03 12 ?? 28 ?? ?? ?? ?? 26 12 ?? 7B ?? ?? ?? ?? 12 ?? 7B ?? ?? ?? ?? 59 0C 12 ?? 7B ?? ?? ?? ?? 12 ?? 7B ?? ?? ?? ?? 59 0D 06 28 ?? ?? ?? ?? 13 ?? 06 08 09 28 ?? ?? ?? ?? 13 ?? 11 ?? 11 ?? 28 ?? ?? ?? ?? 13 ?? 11 ?? 16 16 08 09 06 16 16 20 ?? ?? ?? ?? 28 ?? ?? ?? ?? 26 11 ?? 11 ?? 28 ?? ?? ?? ?? 26 11 ?? 28 ?? ?? ?? ?? 26 03 06 28 ?? ?? ?? ?? 26 11 ?? 28 ?? ?? ?? ?? 13 ?? 11 ?? 28 ?? ?? ?? ?? 26 11 ?? 
2A", + ), + ], +) +def test_match2yar_feature_extraction(path, is_dotnet, filemd5, addr, scope, expected_bytestring, expected_sig): + """Test extracting a function byte string using vivisect workspaces""" + output = get_match_2_yar_features(path, is_dotnet) + + output = output.decode("utf8") + output_data = json.loads(output) + + # Get data for filemd5: + file_features = output_data[filemd5] + + # Filter for addr with correct scope + addr_features = [x for x in file_features if x["addr"] == addr and x["scope"] == scope] + + # This should be unique + assert len(addr_features) == 1 + + # Check extraction and masking + assert addr_features[0]["bytez"] == expected_bytestring + assert addr_features[0]["sig"] == expected_sig From db9b2b4c380c958ed509915ec15039b7357f3da4 Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Tue, 22 Aug 2023 17:39:12 -0400 Subject: [PATCH 07/21] Address code style issues --- scripts/match-2-yar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/match-2-yar.py b/scripts/match-2-yar.py index 17d02f646..b41d00d39 100644 --- a/scripts/match-2-yar.py +++ b/scripts/match-2-yar.py @@ -577,7 +577,7 @@ def run_capa_and_get_features(args): try: doc = rd.ResultDocument.from_capa(meta, rules, capabilities) logger.info("Building code features for %s", path) - if type(extractor) == DnfileFeatureExtractor: + if extractor is DnfileFeatureExtractor: # Handle .NET files features = get_code_features_for_dotnet_doc(doc, extractor) else: From a33194c6d04c500116d558719a8f337b0c672043 Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Tue, 22 Aug 2023 17:44:26 -0400 Subject: [PATCH 08/21] Syncs data directory with current master --- tests/data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data b/tests/data index 6fc8f093b..65d3a99c8 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 6fc8f093b86f7213cbffd510ed029dbf28e439d0 +Subproject commit 65d3a99c865177dfc6370b7f3e48163c233d26dc From dd5ff321d49627a898c49997681ef1e2c84d5af5 Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Wed, 23 Aug 2023 11:30:50 -0400 Subject: [PATCH 09/21] Adds match-2-yar test dependencies to github action --- .github/workflows/tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bd68f26c6..c09edc2cc 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -126,6 +126,9 @@ jobs: - name: Install capa if: ${{ env.BN_SERIAL != 0 }} run: pip install -e .[dev] + - name: Install Match-2-Yar Dependencies + if: ${{ env.BN_SERIAL != 0 }} + run: pip install mkyara yaramod - name: install Binary Ninja if: ${{ env.BN_SERIAL != 0 }} run: | From 72c1cc362cbc0138843b2537bf4427a70325f3fd Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Wed, 23 Aug 2023 11:35:27 -0400 Subject: [PATCH 10/21] Revert "Adds match-2-yar test dependencies to github action" This reverts commit dd5ff321d49627a898c49997681ef1e2c84d5af5. 
--- .github/workflows/tests.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c09edc2cc..bd68f26c6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -126,9 +126,6 @@ jobs: - name: Install capa if: ${{ env.BN_SERIAL != 0 }} run: pip install -e .[dev] - - name: Install Match-2-Yar Dependencies - if: ${{ env.BN_SERIAL != 0 }} - run: pip install mkyara yaramod - name: install Binary Ninja if: ${{ env.BN_SERIAL != 0 }} run: | From beaf8b256da5e784cb604fca6d03243fd004ad69 Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Wed, 23 Aug 2023 11:38:22 -0400 Subject: [PATCH 11/21] Add dev dependency to support running match-2-yar testing --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 35b5554c2..540111574 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,6 +94,8 @@ dev = [ "types-psutil==5.8.23", "types_requests==2.31.0.2", "types-protobuf==4.23.0.3", + "yaramod==3.20.1", + "mkYARA==1.0.0", ] build = [ "pyinstaller==5.10.1", From 036fccb4df692f81756779d5a367fa6e16c66dcb Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Wed, 23 Aug 2023 15:04:04 -0400 Subject: [PATCH 12/21] Address type issue --- scripts/match-2-yar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/match-2-yar.py b/scripts/match-2-yar.py index b41d00d39..71380b4ec 100644 --- a/scripts/match-2-yar.py +++ b/scripts/match-2-yar.py @@ -577,7 +577,7 @@ def run_capa_and_get_features(args): try: doc = rd.ResultDocument.from_capa(meta, rules, capabilities) logger.info("Building code features for %s", path) - if extractor is DnfileFeatureExtractor: + if isinstance(extractor, DnfileFeatureExtractor): # Handle .NET files features = get_code_features_for_dotnet_doc(doc, extractor) else: From 27049b7d6b066d36dfb2f82c242d0495a9960686 Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Thu, 24 Aug 2023 11:43:46 -0400 Subject: [PATCH 13/21] Remove unnecessary debug logging and updated one expected yara file --- scripts/match-2-yar.py | 2 -- tests/data | 2 +- tests/test_scripts.py | 6 +----- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/scripts/match-2-yar.py b/scripts/match-2-yar.py index 71380b4ec..81d93cd83 100644 --- a/scripts/match-2-yar.py +++ b/scripts/match-2-yar.py @@ -834,12 +834,10 @@ def build_yara_ruleset(files_dict, **kwargs): chosen_code_version = sorted(result_dict.values, key=lambda x: x.filemd5)[0] file_combinations[sample_combo_key]["features"].append(chosen_code_version) - logger.warning("Adding %s to %s", chosen_code_version.sig, sample_combo_key) file_combinations[sample_combo_key]["feature_count"] += 1 # Create a list of combo keys and sort them so we get deterministic output combo_keys = sorted(file_combinations.keys(), key=lambda x: (len(x), x)) - logger.warning("Combo Keys: %s", combo_keys) # Build the YARA rule set based on the grouping yara_file = yaramod.YaraFileBuilder() diff --git a/tests/data b/tests/data index 65d3a99c8..b5c583cf8 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 65d3a99c865177dfc6370b7f3e48163c233d26dc +Subproject commit b5c583cf867a52431c543b74d59f5a8a0a65d498 diff --git a/tests/test_scripts.py b/tests/test_scripts.py index ccbba7f41..8af2a7b10 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -140,11 +140,7 @@ def test_script_expected_output(script, args, expected_output_path): expected_output = Path(get_data_path(expected_output_path)).read_bytes() # Update dates 
in expected output to be todays date - dates_to_replace = [ - b"2023-08-10", - ] - for dt in dates_to_replace: - expected_output = expected_output.replace(dt, date.today().isoformat().encode("utf8")) + expected_output = expected_output.replace(b"EXPECTED_DATE", date.today().isoformat().encode("utf8")) p = run_program(script_path, args) From f4d0c2f6c50efdb8431e6cfc2a410ed3c8245ec7 Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Thu, 24 Aug 2023 14:04:14 -0400 Subject: [PATCH 14/21] Remove type hint incompatible with python 3.8 and remove unnecessary comments --- scripts/match-2-yar.py | 6 ++---- tests/test_scripts.py | 6 +----- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/scripts/match-2-yar.py b/scripts/match-2-yar.py index 81d93cd83..6b516f4e5 100644 --- a/scripts/match-2-yar.py +++ b/scripts/match-2-yar.py @@ -37,7 +37,7 @@ import collections import multiprocessing import multiprocessing.pool -from typing import Set, Dict, List, Union +from typing import Set, Dict, List from pathlib import Path from datetime import date @@ -359,9 +359,7 @@ def get_sig_and_mask_for_dotnet_func(dnpe, body): class CodeFeature: """Basic object that that will be used to create yara rules""" - def __init__( - self, sig: str, comment: str, bytez: bytes, filemd5: str, addr: Union[int, tuple[int, int], None], scope: str - ): + def __init__(self, sig: str, comment: str, bytez: bytes, filemd5: str, addr, scope: str): self.sig = sig.strip().upper() self.comment = comment self.bytez = bytez diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 8af2a7b10..226e041c9 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -290,10 +290,6 @@ def test_detect_duplicate_features(tmpdir): assert overlaps_found.returncode == expected_overlaps -# Rough outline for function extract bytes, function length, function masking -# Use importlib to import the script -# Use fixtures vivisect to get a vivisect workspace for a given path -# We can use known functions from the yara matches to extract out length, bytes, and masked sig @pytest.mark.parametrize( "path,is_dotnet,filemd5,addr,scope,expected_bytestring,expected_sig", [ @@ -350,7 +346,7 @@ def test_detect_duplicate_features(tmpdir): ], ) def test_match2yar_feature_extraction(path, is_dotnet, filemd5, addr, scope, expected_bytestring, expected_sig): - """Test extracting a function byte string using vivisect workspaces""" + """Test extracting and masking bytes based on matches using match-2-yar script""" output = get_match_2_yar_features(path, is_dotnet) output = output.decode("utf8") From 21e067b2717dec8a62033c66cf014ca28b24c2a0 Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Tue, 29 Aug 2023 09:43:48 -0400 Subject: [PATCH 15/21] Update spelling and name suggestions --- scripts/match-2-yar.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/match-2-yar.py b/scripts/match-2-yar.py index 6b516f4e5..81254477d 100644 --- a/scripts/match-2-yar.py +++ b/scripts/match-2-yar.py @@ -65,7 +65,7 @@ from capstone import CS_MODE_32, CS_MODE_64, CS_ARCH_X86, CS_OPT_SYNTAX_INTEL, Cs except ImportError: print( - """\nFailed to import a module try installing required Python libraries with the following: + """\nFailed to import a module try installing required Python libraries with the following command: pip install mkyara yaramod """ ) @@ -91,7 +91,7 @@ def addText(self, text, tag=None): def get_disassembly_output(vw, va, size): - """Get Vivisect's disassembly view for a given virtual addresss and size + """Get 
Vivisect's disassembly view for a given virtual address and size Args: vw: Vivisect Workspace @@ -107,8 +107,8 @@ def get_disassembly_output(vw, va, size): return mcav.output -def get_comment_for_func(vw, funcva): - """Get a CodeFeature comment for a function +def get_disassembly_for_func(vw, funcva): + """Get vivisect disassembly for a function This function gets the size of a function and uses that to get a dump of the function disassembly @@ -125,8 +125,8 @@ def get_comment_for_func(vw, funcva): return get_disassembly_output(vw, funcva, funcsize) -def get_comment_for_cb(vw, va): - """Get a CodeFeature comment for a Code Block +def get_disassembly_for_cb(vw, va): + """Get vivisect disassembly for a for a Code Block This function gets the size of a code block and uses that to get a dump of the code block disassembly @@ -421,7 +421,7 @@ def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor): comment = f"Basic Block at 0x{addr:08x}@{filemd5} with {len(rules)} features:\n" for rule_name in sorted(rules): comment += f" - {rule_name}\n" - comment += get_comment_for_cb(file_vw, addr) + comment += get_disassembly_for_cb(file_vw, addr) bytez = get_cb_bytes(file_vw, addr) sig = genSigAndMask(addr, bytez, doc.meta.analysis.arch) @@ -431,7 +431,7 @@ def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor): comment = f"function at 0x{addr:08x}@{filemd5} with {len(rules)} features:\n" for rule_name in sorted(rules): comment += f" - {rule_name}\n" - comment += get_comment_for_func(file_vw, addr) + comment += get_disassembly_for_func(file_vw, addr) bytez = get_function_bytes(file_vw, addr) sig = genSigAndMask(addr, bytez, doc.meta.analysis.arch) From b544ea3348e870398f476cbecbd36fe79d041422 Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Tue, 29 Aug 2023 10:12:50 -0400 Subject: [PATCH 16/21] Simplify the match-2-yar function size logic --- scripts/match-2-yar.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/scripts/match-2-yar.py b/scripts/match-2-yar.py index 81254477d..2c14e0b24 100644 --- a/scripts/match-2-yar.py +++ b/scripts/match-2-yar.py @@ -155,19 +155,30 @@ def get_function_size(vw, funcva): int: size of the function """ fsize = 0 - if funcva not in vw.getFunctions(): - funcva = vw.getFunction(funcva) - if funcva is None: - raise Exception("Given funcva not a function or within a known function") - func_blocks = [cbva for cbva, _, _ in vw.getFunctionBlocks(funcva)] + # Get the effective function given a virtual address + effective_funcva = vw.getFunction(funcva) + if effective_funcva is None: + raise Exception("Given funcva not a function or within a known function") + # These should only disagree if the funcva provided + # wasn't the start of a function + if effective_funcva != funcva: + logger.debug("Requested function addr %s was contained in function %s", hex(funcva), hex(effective_funcva)) + + # Get the blocks of the effective function + func_blocks = [cbva for cbva, _, _ in vw.getFunctionBlocks(effective_funcva)] + # Figure out the size of the first linear chunk - # in this function... + # in this function. 
+ # Note: if funcva isn't the start of the function (funcva != effective_funcva) + # Then we'll get everything from funcva and after cb = vw.getCodeBlock(funcva) if cb[0] not in func_blocks: - raise Exception("funcva not in given func") + raise Exception( + "Provided funcva not in effective func [funcva=%s, effective_funcva=%s]", hex(funcva), hex(effective_funcva) + ) while cb is not None: cbva, cbsize, cbfunc = cb - if cbfunc != funcva: + if cbfunc != effective_funcva: break fsize += cbsize cb = vw.getCodeBlock(cbva + cbsize) From 6919c5b7a286cb9368f68b8e85b7ea1e359fcc8b Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Tue, 29 Aug 2023 10:21:35 -0400 Subject: [PATCH 17/21] Move match-2-yar dependecies to another optional set of dependencies --- .github/workflows/tests.yml | 2 +- pyproject.toml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bd68f26c6..0f81167eb 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -125,7 +125,7 @@ jobs: run: sudo apt-get install -y libyaml-dev - name: Install capa if: ${{ env.BN_SERIAL != 0 }} - run: pip install -e .[dev] + run: pip install -e .[dev,scripts] - name: install Binary Ninja if: ${{ env.BN_SERIAL != 0 }} run: | diff --git a/pyproject.toml b/pyproject.toml index 540111574..82ed09308 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,6 +94,8 @@ dev = [ "types-psutil==5.8.23", "types_requests==2.31.0.2", "types-protobuf==4.23.0.3", +] +scripts = [ "yaramod==3.20.1", "mkYARA==1.0.0", ] From 86a4a3ecf5ae22161cc066f4ea5f9e25614b5a79 Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Tue, 29 Aug 2023 10:28:42 -0400 Subject: [PATCH 18/21] Remove artifact of merge commit --- CHANGELOG.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ba208c166..d9c49edca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,14 +3,12 @@ ## master (unreleased) ### New Features -<<<<<<< HEAD - ELF: implement file import and export name extractor #1607 #1608 @Aayush-Goel-04 - bump pydantic from 1.10.9 to 2.1.1 #1582 @Aayush-Goel-04 - develop script to highlight the features that are not used during matching #331 @Aayush-Goel-04 - add script to create code-based YARA based on CAPA match details called match-2-yar @jconnor0426 ======= ->>>>>>> master ### Breaking Changes From a888c15cb87cff4e0ae5984ede5bccf927c9580b Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Tue, 29 Aug 2023 11:16:16 -0400 Subject: [PATCH 19/21] Add in dependency installation to tests github action --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0f81167eb..d8232f120 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -94,7 +94,7 @@ jobs: if: matrix.os == 'ubuntu-20.04' run: sudo apt-get install -y libyaml-dev - name: Install capa - run: pip install -e .[dev] + run: pip install -e .[dev,scripts] - name: Run tests run: pytest -v tests/ From 400bc89ad531c7ae63fe068759a0fc32adad043b Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Wed, 30 Aug 2023 09:04:42 -0400 Subject: [PATCH 20/21] Update CHANGELOG.md based on suggestion Co-authored-by: Moritz --- CHANGELOG.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d9c49edca..19e85f525 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,12 +3,8 @@ ## master (unreleased) ### New Features -- ELF: implement file import and export name extractor 
#1607 #1608 @Aayush-Goel-04 -- bump pydantic from 1.10.9 to 2.1.1 #1582 @Aayush-Goel-04 -- develop script to highlight the features that are not used during matching #331 @Aayush-Goel-04 - add script to create code-based YARA based on CAPA match details called match-2-yar @jconnor0426 -======= ### Breaking Changes From 8a0fa9d68dcebb0ab7b9f970ee28913847dc2c53 Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Thu, 31 Aug 2023 18:29:14 +0000 Subject: [PATCH 21/21] Updating test yara file names to avoid name issues in test files repo --- tests/data | 2 +- tests/test_scripts.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/data b/tests/data index b5c583cf8..5bfe79607 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit b5c583cf867a52431c543b74d59f5a8a0a65d498 +Subproject commit 5bfe7960707b94e1b4f3066e66cc0f1b8136ae9e diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 226e041c9..7aac70a07 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -85,7 +85,7 @@ def test_bulk_process(tmp_path): pytest.param( "match-2-yar.py", [get_data_path("9324d1a8ae37a36ae560c37448c9705a.exe_")], - "yara/expected_9324d1a8ae37a36ae560c37448c9705a.exe_.yar", + "yara/expected_9324d1a8ae37a36ae560c37448c9705a.yar", ), # Test match-2-yar x64 EXE pytest.param( @@ -101,7 +101,7 @@ def test_bulk_process(tmp_path): "dotnet", get_data_path("dotnet/1c444ebeba24dcba8628b7dfe5fec7c6.exe_"), ], - "yara/expected_1c444ebeba24dcba8628b7dfe5fec7c6.exe_.yar", + "yara/expected_1c444ebeba24dcba8628b7dfe5fec7c6.yar", ), # Test match-2-yar files with multiple X86 PEs pytest.param( @@ -111,7 +111,7 @@ def test_bulk_process(tmp_path): get_data_path("Practical Malware Analysis Lab 11-03.exe_"), get_data_path("Practical Malware Analysis Lab 16-01.exe_"), ], - "yara/expected_pma_03-04.exe_11-03.exe_16-01.exe", + "yara/expected_pma_03-04_11-03_16-01.yar", ), # Test match-2-yar files with CAPA file limitations are filtered out of multi sample pytest.param( @@ -120,7 +120,7 @@ def test_bulk_process(tmp_path): get_data_path("Practical Malware Analysis Lab 01-01.exe_"), get_data_path("Practical Malware Analysis Lab 01-02.exe_"), ], - "yara/expected_pma_01-01.exe_01-02.exe", + "yara/expected_pma_01-01_01-02.yar", ), # Test match-2-yar multiple x86 .NET PE pytest.param(