From 1b6bdaad85240069a4e780aa1afe898ae8dff81c Mon Sep 17 00:00:00 2001 From: aaronatp Date: Tue, 19 Mar 2024 10:51:55 -0700 Subject: [PATCH 01/21] This pull request extracts web domains and IP addresses from files and sandbox traces and presents them to the user. In (-v) and (-vv) modes, this pull request also tries to identify how web domains and IP addresses are used. It checks whether a WinAPI networking function is called soon after the web domain or IP address appears. --- capa/capabilities/domain_ip_helpers.py | 84 +++++ capa/capabilities/extract_domain_and_ip.py | 413 +++++++++++++++++++++ capa/render/default.py | 34 ++ capa/render/verbose.py | 56 +++ capa/render/vverbose.py | 56 +++ tests/test_domain_ip_extractor.py | 173 +++++++++ 6 files changed, 816 insertions(+) create mode 100644 capa/capabilities/domain_ip_helpers.py create mode 100644 capa/capabilities/extract_domain_and_ip.py create mode 100644 tests/test_domain_ip_extractor.py diff --git a/capa/capabilities/domain_ip_helpers.py b/capa/capabilities/domain_ip_helpers.py new file mode 100644 index 000000000..d07cc160c --- /dev/null +++ b/capa/capabilities/domain_ip_helpers.py @@ -0,0 +1,84 @@ +import logging +from pathlib import Path + +from capa.helpers import get_auto_format +from capa.features.common import FORMAT_CAPE +from capa.render.result_document import ResultDocument +from capa.features.extractors.base_extractor import FeatureExtractor +from capa.features.extractors.cape.extractor import CapeExtractor + +logger = logging.getLogger(__name__) + +BACKEND_VIV = "vivisect" +BACKEND_DOTNET = "dotnet" +BACKEND_BINJA = "binja" +BACKEND_PEFILE = "pefile" + + +def get_file_path(doc: ResultDocument) -> Path: + return Path(doc.meta.sample.path) + + +def get_sigpaths_from_doc(doc: ResultDocument): + import capa.loader + + if doc.meta.argv: + try: + if "-s" in list(doc.meta.argv): + idx = doc.meta.argv.index("-s") + sigpath = Path(doc.meta.argv[idx + 1]) + if "./" in str(sigpath): + fixed_str = str(sigpath).split("./")[1] + sigpath = Path(fixed_str) + + elif "--signatures" in list(doc.meta.argv): + idx = doc.meta.argv.index("--signatures") + sigpath = Path(doc.meta.argv[idx + 1]) + if "./" in str(sigpath): + fixed_str = str(sigpath).split("./")[1] + sigpath = Path(fixed_str) + + else: + sigpath = "(embedded)" # type: ignore + + return capa.loader.get_signatures(sigpath) + + except AttributeError: + raise NotImplementedError("Confirm that argv is an attribute of doc.meta") + + else: + print("in 'get_sigpaths_from_doc', run in debug (-d) mode") + logger.debug("'doc.meta' has not attribute 'argv', this is probably a bad sign...") + + +def get_extractor_from_doc(doc: ResultDocument) -> FeatureExtractor: + import capa.loader + + path = get_file_path(doc) + format = doc.meta.analysis.format + os = doc.meta.analysis.os + + _ = get_auto_format(get_file_path(doc)) + if format == FORMAT_CAPE: + report = capa.helpers.load_json_from_path(path) + return CapeExtractor.from_report(report) + elif _ == BACKEND_VIV: + backend = BACKEND_VIV + elif _ == BACKEND_PEFILE: + backend = BACKEND_PEFILE + elif _ == BACKEND_BINJA: + backend = BACKEND_BINJA + elif _ == BACKEND_DOTNET: + backend = BACKEND_DOTNET + else: + backend = BACKEND_VIV # according to main.py this is the default + + sigpath = get_sigpaths_from_doc(doc) + + return capa.loader.get_extractor( + input_path=path, + input_format=format, + os_=os, + backend=backend, + sigpaths=sigpath, + ) diff --git a/capa/capabilities/extract_domain_and_ip.py b/capa/capabilities/extract_domain_and_ip.py new file mode 100644 index 000000000..dab59fb8d --- /dev/null +++ b/capa/capabilities/extract_domain_and_ip.py @@ -0,0 +1,413 @@ +import re +import socket +import logging +import ipaddress +from typing import Dict, List, Tuple, Generator + +from capa.features.insn import API, Feature +from capa.features.common import Address +from capa.render.result_document import ResultDocument +from capa.capabilities.domain_ip_helpers import get_extractor_from_doc +from capa.features.extractors.base_extractor import StaticFeatureExtractor, DynamicFeatureExtractor + +logger = logging.getLogger(__name__) + + +def is_valid_domain(string: str) -> bool: + """ + uses a regex to check whether a string could be a valid web domain + + ignores domain-like strings that have invalid top-level domains (e.g., ".exe", ".dll", etc.) + """ + ############## + # ideally 'DOMAIN_PATTERN' should probably be moved out of this function's scope but + # then it would have to be passed as a variable to this function and that would make + # rendering in the main function a lot more messy + + # See this Stackoverflow post that discusses the parts of this regex (http://stackoverflow.com/a/7933253/433790) + # The following regex is based on the linked-to regex but significantly modified/updated + DOMAIN_PATTERN = ( + r"^(?!.{256})(?:[a-z](?:[a-z0-9-']{0,61})?(? bool: + """checks if a string is a valid IP address""" + try: + ipaddress.ip_address(string) + return True + except ValueError: + return False + + +def generate_insns_from_doc(doc: ResultDocument) -> Generator[Tuple[Feature, Address], None, None]: + """ + checks whether extractor's type is StaticFeatureExtractor or DynamicFeatureExtractor + + if the type is StaticFeatureExtractor, this function yields assembly instruction's and addresses + + StaticFeatureExtractor example: + mnemonic(xor), absolute(0x401015) + mnemonic(lea), absolute(0x401017) + mnemonic(mov), absolute(0x40101d) + mnemonic(push), absolute(0x401023) + number(0xF), absolute(0x401023) + ... + string(70.62.232.98), absolute(0x4010b6) + mnemonic(call), absolute(0x4010bb) + ... + api(strncpy), absolute(0x4010f3) + + if the type is DynamicFeatureExtractor, this function yields "call features" which are analogous + to assembly instructions but extracted from sandbox traces as opposed to files directly + + args: + doc (ResultDocument): a ResultDocument object + + yields: + feature, addr (Tuple[Feature, Address]): + 'feature' is either an assembly instruction or a call feature; and, + 'addr' is a memory address. + """ + extractor = get_extractor_from_doc(doc) + if isinstance(extractor, StaticFeatureExtractor): + for func in extractor.get_functions(): + for block in extractor.get_basic_blocks(func): + for insn in extractor.get_instructions(func, block): + for feature, addr in extractor.extract_insn_features(func, block, insn): + yield feature, addr + + elif isinstance(extractor, DynamicFeatureExtractor): + for proc in extractor.get_processes(): + for thread in extractor.get_threads(proc): + for call in extractor.get_calls(proc, thread): + for feature, addr in extractor.extract_call_features(proc, thread, call): + yield feature, addr + + +def default_extract_domain_names(doc: ResultDocument) -> Generator[str, None, None]: + """ + loops through assembly instructions retrieved from a ResultDocument object + + this 'default' function is meant to merely tell users what domains/IPs are in a file, + not to show users how many time each occur, so we consciously do not yield duplicates + + yields: + potential web domain names and IP addresses + """ + duplicates = set() + for feature, _ in generate_insns_from_doc(doc): + string = str(feature.value) + if string in duplicates: + continue + + if is_valid_domain(string): + duplicates.add(string) + yield string + + elif is_ip_addr(string): + duplicates.add(string) + yield string + + +def verbose_extract_domain_and_ip(doc: ResultDocument) -> Generator[str, None, None]: + """calls verbose statement formatter for IP addresses and web domains""" + for string, count in get_domain_ip_dict(doc).items(): + if is_ip_addr(string): + yield formatted_ip_verbose(doc, string, count) + else: + yield formatted_domain_verbose(doc, string, count) + + +def get_domain_ip_dict(doc: ResultDocument): + """ + returns dict of domains/IPs in a file and number of times each occur + + example: + {'malicious-website.com/next/asxp.jpg': 3, 'other-website.net': 2} + + args: + doc (ResultDocument): ResultDocument object which contains FeatureExtractor information, including file strings + + returns: + domain_and_ip_counts (Dict[str, int]): dict of domain names and IP addresses and occurrances of each + - Note: each full-path URL gets its own dict key + """ + domain_and_ip_counts: Dict[str, int] = {} + + for feature, _ in generate_insns_from_doc(doc): + extended_string = feature.value + + if not isinstance(extended_string, str): + continue + + # this for loop cleans up any "http(s)://" strings + for string in extended_string.split(" "): + if string.startswith("http://"): + string = string.split("http://")[-1] + break + + elif string.startswith("https://"): + string = string.split("https://")[-1] + break + + else: + # makes sure there are no weird "http(s)://" strings + # if the assert statement runs, there's probably an issue + assert not (any(prefix in string for prefix in ["http://", "https://"])) + + # for example, if string == "malware.com/next/virus.jpg", + # the following "if-else" statements split at "/" + # and checks whether "malware.com" is a web domain or IP address + if is_valid_domain(string.split("/")[0]): + try: + domain_and_ip_counts[string] += 1 + except KeyError: + domain_and_ip_counts[string] = 1 + + elif is_ip_addr(string.split("/")[0]): + try: + domain_and_ip_counts[string] += 1 + except KeyError: + domain_and_ip_counts[string] = 1 + + return domain_and_ip_counts + + +def formatted_domain_verbose(doc: ResultDocument, domain: str, total_occurrances: int) -> str: + """ + example output: + + capa -v suspicious.exe + ----------------------- + malware.com + |---- IP address: + | |----192.0.0.1 + |----Functions used to communicate with malware.com: + | |----InternetConnectA + | |----HttpOpenRequestA + | |----FtpGetFileA + |----3 occurrances + """ + return ( + f"{domain}\n" + + f" |---- {ip_address_statement(domain)}\n" + + f" |---- {networking_functions_statement(doc, domain)}\n" + + f" |---- {total_occurrances} occurrances" + ) + + +def formatted_ip_verbose(doc: ResultDocument, ip_addr: str, total_occurrances: int) -> str: + """same as 'formatted_domain_verbose' but without 'ip_address_statement'""" + return ( + f"{ip_addr}\n" + + f" |---- {networking_functions_statement(doc, ip_addr)}" + + f" |---- {total_occurrances} occurrances" + ) + + +def ip_address_statement(domain: str) -> str: + """ + tries to identify a web domain's IP address + + this function's output is used by 'formatted_domain_verbose' + + return: + (str): either the formatted IP address, or an error message + """ + try: + ip_address = socket.gethostbyname(domain) + return "IP address:\n" + f" | |----{ip_address}\n" + except socket.gaierror: + return f"Could not get IP address for {domain.split('/')[0]}\n" + + +def networking_functions_statement(doc: ResultDocument, domain_or_ip: str): + """prints the functions used to communicate with domain/ip""" + api_functions = get_domain_or_ip_caller_functions(doc, domain_or_ip) + + if len(api_functions) == 0: + statement = ( + f"{domain_or_ip} occurs but no functions found that use it.\n" + " If you think this is a mistake, please open an issue on\n" + " the capa GitHub page (https://github.com/mandiant/capa)\n" + ) + return statement + + elif len(api_functions) == 1: + statement = f"Function used to communicate with {domain_or_ip}:\n" + for func in api_functions: + return statement + f" | |----{func}\n" + + elif len(api_functions) > 1: + statement = f"Functions used to communicate with {domain_or_ip}:\n" + for function in api_functions: + statement += f" | |----{function}\n" + + return statement + + else: + raise LengthError("'api_functions' contains unexpected data!") + + +class LengthError(BaseException): + pass + + +def get_domain_or_ip_caller_functions(doc: ResultDocument, domain_or_ip: str) -> List[str]: + """ + for every occurrance of 'domain_or_ip' in the ResultDocument, we see which functions operate on it + + returns: + List[str]: list of functions that operate on the 'domain_or_ip' string + """ + api_functions = [] + for caller_func in yield_caller_funcs(doc, domain_or_ip): + api_functions.append(caller_func) + + return api_functions + + +def yield_caller_funcs(doc: ResultDocument, domain_or_ip: str) -> Generator[str, None, None]: + """ + We loop through asembly instructions and look for features whose values equal 'domain_or_ip'. + When we find a feature, we look for a WinAPI instruction. WinAPI instructions are features: + 1) whose type is API; and, + 2) whose values are, heuristically, WinAPI networking functions. + + yields: + (str): either a potential WinAPI function, or an error message + """ + signal = 0 + for feature, _ in generate_insns_from_doc(doc): + if isinstance(feature.value, str) and feature.value == domain_or_ip: + signal = 1 + continue + + # we only run this block if we have found a 'target_string' + if signal == 1: + # skip instructions until we get to an API instruction + if not isinstance(feature, API): + continue + + signal = 0 + + func = str(feature.value) # redundant but helps pass mypy tests + if "." in func: + func = func.split(".")[-1] + + # at this point, we have found an API instruction + # and see whether it could be a networking function + if potential_winapi_function(func): + yield func + + else: + yield "Not able to identify the calling function" + + +def potential_winapi_function(string: str) -> bool: + """ + some simple heuristics for checking whether a string is NOT a WinAPI function + + returns: + True if string could be a WinAPI function + False if string is not a WinAPI function + """ + if string in excluded_functions(): + return False + + if any(x in string.lower() for x in quick_true()): + return True + + if all(sep.isupper() for sep in string.split("_")) or all( + sep.islower() for sep in string.split("_") + ): # WinAPI functions are usually mixed upper and lower case + return False + + if not all(sep.isalpha() for sep in string.split("_")): # if contains non-letters + return False + + if too_many_consecutive_uppercase_letters(string, 7): # maximum of 7 consecutive uppercase letters + return False + + return True + + +def quick_true(): + """matched against lowercase strings""" + return [ + "http", + "ftp", + "internet", + "url", + "connection", + "connected", + "online", + "inet", + "addr", + "send", + "recv", + "sock", + "select", + "shutdown", + "ntoh", + "listen", + "serv", + "getpeer", + ] + + +def excluded_functions(): + """ + add excluded functions here, e.g., those that can't accept an IP address/web domain as an argument + """ + return ["Sleep"] + + +def too_many_consecutive_uppercase_letters(string, limit): + """ + 'HOSTENT' (probably) has the most consecutive uppercase letters + + returns: + True: too many consecutive uppercase letters, caller function disregards + False: not too many consecutive uppercase, indicates this is a potential WinAPI function + """ + counter = 0 + for i in string: + if i.isupper(): + counter += 1 + else: # basically reset counter if we reach a non-uppercase letter + counter = 0 + + if counter > limit: + return True + + return False diff --git a/capa/render/default.py b/capa/render/default.py index cf387a5dc..2806fd832 100644 --- a/capa/render/default.py +++ b/capa/render/default.py @@ -16,6 +16,7 @@ from capa.rules import RuleSet from capa.engine import MatchResults from capa.render.utils import StringIO +from capa.capabilities.extract_domain_and_ip import default_extract_domain_names tabulate.PRESERVE_WHITESPACE = True @@ -197,6 +198,36 @@ def render_mbc(doc: rd.ResultDocument, ostream: StringIO): ostream.write("\n") +def render_domain_and_ip(doc: rd.ResultDocument, ostream: StringIO): + """ + example:: + +------------------------------+ + | IP addresses and web domains | + |------------------------------+ + | google.com | + | 192.123.232.08 | + | my-website.net | + | maliciooous.webs1t3-site.uhoh| + | malware.net | + +------------------------------+ + """ + rows = [] + for domain_or_ip in default_extract_domain_names(doc): + rows.append(domain_or_ip) + + if rows: + ostream.write( + tabulate.tabulate( + {"IP addresses and web domains": rows}, + headers=["IP addresses and web domains"], + tablefmt="mixed_outline", + ) + ) + ostream.write("\n") + else: + ostream.writeln(rutils.bold("No web domains or IP addresses found")) + + def render_default(doc: rd.ResultDocument): ostream = rutils.StringIO() @@ -207,6 +238,9 @@ def render_default(doc: rd.ResultDocument): render_mbc(doc, ostream) ostream.write("\n") render_capabilities(doc, ostream) + ostream.write("\n") + render_domain_and_ip(doc, ostream) + ostream.write("\n") return ostream.getvalue() diff --git a/capa/render/verbose.py b/capa/render/verbose.py index 44024acf4..daa8b048c 100644 --- a/capa/render/verbose.py +++ b/capa/render/verbose.py @@ -34,6 +34,7 @@ import capa.render.result_document as rd from capa.rules import RuleSet from capa.engine import MatchResults +from capa.capabilities.extract_domain_and_ip import verbose_extract_domain_and_ip def format_address(address: frz.Address) -> str: @@ -317,6 +318,58 @@ def render_rules(ostream, doc: rd.ResultDocument): ostream.writeln(rutils.bold("no capabilities found")) +def render_domain_and_ip(ostream: rutils.StringIO, doc: rd.ResultDocument): + """ + example:: + +-----------------------------------------------------------+ + | IP addresses and web domains | + |-----------------------------------------------------------+ + | google.com | + | |----IP address: | + | |----192.0.0.1 | + | |----Functions used to communicate with google.com: | + | |----InternetConnectA | + | |----HttpOpenRequestA | + | |----FtpGetFileA | + | |----3 occurrances | + | | | + | 192.123.232.08 | + | |----Functions used to communicate with 192.123.232.08:| + | |----... | + | | + +-----------------------------------------------------------+ + """ + rows = [] + for domain_or_ip in verbose_extract_domain_and_ip(doc): + for i in domain_or_ip.split("\n"): + rows.append(i) + + max_line = 0 + for item in rows: + for new_line in item.split("\n"): + if len(new_line) > max_line: + max_line = len(new_line) + + white_spaces = " " * ceil(1 / 3 * max_line) + + if rows: + ostream.write( + tabulate.tabulate( + {white_spaces + "IP addresses and web domains" + white_spaces: rows}, + headers=[white_spaces + "IP addresses and web domains" + white_spaces], + tablefmt="mixed_outline", + ) + ) + ostream.write("\n") + else: + ostream.writeln(rutils.bold("No web domains or IP addresses found")) + + +def ceil(num): + if isinstance(num, float): + return int(num - 0.5) + 1 + + def render_verbose(doc: rd.ResultDocument): ostream = rutils.StringIO() @@ -326,6 +379,9 @@ def render_verbose(doc: rd.ResultDocument): render_rules(ostream, doc) ostream.write("\n") + render_domain_and_ip(ostream, doc) + ostream.write("\n") + return ostream.getvalue() diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py index 3498d24b8..dbea521f7 100644 --- a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -22,6 +22,7 @@ import capa.features.freeze.features as frzf from capa.rules import RuleSet from capa.engine import MatchResults +from capa.capabilities.extract_domain_and_ip import verbose_extract_domain_and_ip logger = logging.getLogger(__name__) @@ -458,6 +459,58 @@ def render_rules(ostream, doc: rd.ResultDocument): ostream.writeln(rutils.bold("no capabilities found")) +def render_domain_and_ip(ostream: rutils.StringIO, doc: rd.ResultDocument): + """ + example:: + +-----------------------------------------------------------+ + | IP addresses and web domains | + |-----------------------------------------------------------+ + | google.com | + | |----IP address: | + | |----192.0.0.1 | + | |----Functions used to communicate with google.com: | + | |----InternetConnectA | + | |----HttpOpenRequestA | + | |----FtpGetFileA | + | |----3 occurrances | + | | | + | 192.123.232.08 | + | |----Functions used to communicate with 192.123.232.08:| + | |----... | + | | + +-----------------------------------------------------------+ + """ + rows = [] + for domain_or_ip in verbose_extract_domain_and_ip(doc): + for i in domain_or_ip.split("\n"): + rows.append(i) + + max_line = 0 + for item in rows: + for new_line in item.split("\n"): + if len(new_line) > max_line: + max_line = len(new_line) + + white_spaces = " " * ceil(1 / 3 * max_line) + + if rows: + ostream.write( + tabulate.tabulate( + {white_spaces + "IP addresses and web domains" + white_spaces: rows}, + headers=[white_spaces + "IP addresses and web domains" + white_spaces], + tablefmt="mixed_outline", + ) + ) + ostream.write("\n") + else: + ostream.writeln(rutils.bold("No web domains or IP addresses found")) + + +def ceil(num): + if isinstance(num, float): + return int(num - 0.5) + 1 + + def render_vverbose(doc: rd.ResultDocument): ostream = rutils.StringIO() @@ -467,6 +520,9 @@ def render_vverbose(doc: rd.ResultDocument): render_rules(ostream, doc) ostream.write("\n") + render_domain_and_ip(ostream, doc) + ostream.write("\n") + return ostream.getvalue() diff --git a/tests/test_domain_ip_extractor.py b/tests/test_domain_ip_extractor.py new file mode 100644 index 000000000..a55841e40 --- /dev/null +++ b/tests/test_domain_ip_extractor.py @@ -0,0 +1,173 @@ +import pytest + +from capa.capabilities.extract_domain_and_ip import is_ip_addr, is_valid_domain, potential_winapi_function + + +@pytest.mark.parametrize( + "string", + [ + # Valid IPv4 addresses + ("8.8.8.8"), + ("128.0.0.1"), + ("123.4.56.78"), + ("0.0.0.0"), + ("255.255.255.255"), + # Valid IPv6 addresses + ("2001:0db8:85a3:0000:0000:8a2e:0370:7334"), + ("fe80:0000:0000:0000:0202:b3ff:fe1e:8329"), + ("2002::1234:5678:9abc:def0"), + ("::1"), + ("2001:0db8:0001:0000:0000:0ab9:C0A8:0102"), + ("2001:db8:1::ab9:C0A8:102"), + ("::1234:5678"), + ("::"), + ("2001:db8::"), + ("2001:db8:3333:4444:CCCC:DDDD:EEEE:FFFF"), + ("2001:db8:3333:4444:5555:6666:7777:8888"), + ("3ffe:ffff:ffff:ffff:ffff:ffff:ffff:ffff"), + ("2001:db8:3333:4444:5555:6666:1.2.3.4"), + ("::11.22.33.44"), + ("2001:db8::123.123.123.123"), + ("::1234:5678:91.123.4.56"), + ("::1234:5678:1.2.3.4"), + ("2001:db8::1234:5678:5.6.7.8"), + ], +) +def test_is_ip_addr(string: str): + # Valid IPv4 addresses + assert is_ip_addr(string) + + +@pytest.mark.parametrize( + "string", + [ + # Invalid IPv4 addresses + ("255.255.255.256"), + ("255.255.255.-1"), + ("2555.255.255.255"), + # Invalid IPv6 addresses + ("2001:0db8:85a3:0000:0000:8a2e:0370:G334"), + ("2001:db8:a0b:12f0:0000:0000:0000::0001"), + ("2001:db8:a0b:12f0::1:2:3:4:5"), + ("2001:db8::::1"), + ("fe80:2030:31:24"), + ("::1:2:3:4:5:6:7:8"), + ("2001:db8:a0b:12f0:g:h:i:j"), + ("1234567890:1234:5678:90ab:cdef:1234:5678:90ab"), + ], +) +def test_is_not_ip_addr(string: str): + assert not is_ip_addr(string) + + +@pytest.mark.parametrize( + "string", + [ + ( + "google.com" + ), # the following talks about some domain matching considerations - (http://stackoverflow.com/a/7933253/433790) + ("favorite.website"), + ("dont.like.spiders"), + ("lots.of.subnets.com.org.net"), + ("walk-your-dog.net"), # can have dashes in domain names + ( + "whos--a---goood---boy.com" + ), # can have multiple dashes (https://stackoverflow.com/questions/16468309/can-domain-name-have-two-continuous-hyphens) + ("fileshare.biz"), + ( + "g00gle.c0m" + ), # can have numbers in top-level domain as long as the top-level domain doesn't start or end with a number + ( + "coooooooooooool.we.b.s.t.e" + ), # single-character top-level-domains technically legal (https://stackoverflow.com/questions/7411255/is-it-possible-to-have-one-single-character-top-level-domain-name) + ("really.long.jhgfjhgfjhgfkjh76547kjhgkjhgl234567gfdshgfkklkjh"), + ("oiuyu78658765hgjj-i765jhgftuytruytr.jhgfhgfjhgf654365436576908-088098jhgjff.gdffdghdgfd"), + ("xn--bcher-kva.tld"), + ( + "xn--q1a.xn--b1aube0e.xn--c1acygb.xn--p1ai" + ), # https://superuser.com/questions/860121/what-does-it-mean-when-a-dns-name-starts-with-xn + ("xn--diseolatinoamericano-66b.com"), # https://stackoverflow.com/questions/9724379/xn-on-domain-what-it-means + ( + "don't.like.sp1d3rs" + ), # apostropes in URLs technically legal (https://stackoverflow.com/questions/13442421/apostrophes-in-the-url-good-idea-or-bad-idea-and-why) + ], +) +def test_valid_domain(string: str): + assert is_valid_domain(string) + + +@pytest.mark.parametrize( + "string", + [ + ("yup"), + ("no way this passes the test"), # can't have spaces + ("really.long-domainname"), # can only have "-" in top-level domains if "xn--..." + ("really.long-domain-name"), + ( + "dog..cat" + ), # consecutive periods are invalid in a subdomain (https://stackoverflow.com/questions/41821416/are-urls-with-multiple-periods-in-the-url-path-valid) + ("dog.34.cat"), # subdomain has only numbers + ("34.dog.cat"), + ( + "dog.cat.34" + ), # top-level domains can not consist only of numbers (https://stackoverflow.com/questions/7411255/is-it-possible-to-have-one-single-character-top-level-domain-name) + ("d0nt.lik3.sp1d3rs"), # number at end of second subdomain + ("definite.1nvalid"), # number at start of the top-level domain + ], +) +def test_invalid_domain(string: str): + assert not is_valid_domain(string) + + +@pytest.mark.parametrize( + "string", + [ + ("InternetConnectA"), + ("HttpQueryInfo"), + ("HttpSendRequestW"), + ("InternetCanonicalizeUrlA"), + ("InternetCrackUrlA"), + ("InternetCloseHandle"), + ("InternetCombineUrlW"), + ("InternetCheckConnectionA"), + ("INTERNET_STATUS_CALLBACK"), + ("INTERNET_CACHE_ENTRY_INFOA"), + ("INTERNET_ASYNC_RESULT"), + ("GetUrlCacheEntryInfoExA"), + ("FindNextUrlCacheEntryW"), + ("DeleteUrlCacheEntry"), + ("DetectAutoProxyUrl"), + ("FindFirstUrlCacheEntryExA"), + ("InternetConfirmZoneCrossing"), + ("InternetGoOnlineW"), + ("InternetHangUp"), + ("InternetSetOptionExW"), + ("UnlockUrlCacheEntryFile"), + ("URL_COMPONENTSA"), + ("Internet"), + ("recv"), + ("send"), + ], +) +def test_potential_winapi_function(string: str): + assert potential_winapi_function(string) + + +@pytest.mark.parametrize( + "string", + [ + ("asdfadsfasdfasf"), + ("plkj"), + ("DSFLKJKLJKLDJFKJ"), + ("LKJD LKJ ALKSDJFH"), + ("dog cat mouse snake"), + ("Dog CAT mOuse Snake"), + (""), + (" "), + ("2345"), + ("SDFGHJ_SDFGHJKLKJHG"), + ("Sleep"), + ], +) +def test_not_potential_winapi_function(string: str): + assert not potential_winapi_function(string) From 999f656c670d5cae51e41f29742e98f00e3e2e06 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Tue, 19 Mar 2024 13:34:20 -0500 Subject: [PATCH 02/21] Update domain_ip_helpers.py --- capa/capabilities/domain_ip_helpers.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/capa/capabilities/domain_ip_helpers.py b/capa/capabilities/domain_ip_helpers.py index d07cc160c..4f0af1e9b 100644 --- a/capa/capabilities/domain_ip_helpers.py +++ b/capa/capabilities/domain_ip_helpers.py @@ -1,3 +1,11 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + import logging from pathlib import Path From 80fc4d4a6fe87432ca17f5472197cdff499f1801 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Tue, 19 Mar 2024 13:34:36 -0500 Subject: [PATCH 03/21] Update extract_domain_and_ip.py --- capa/capabilities/extract_domain_and_ip.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/capa/capabilities/extract_domain_and_ip.py b/capa/capabilities/extract_domain_and_ip.py index dab59fb8d..456ef0587 100644 --- a/capa/capabilities/extract_domain_and_ip.py +++ b/capa/capabilities/extract_domain_and_ip.py @@ -1,3 +1,11 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + import re import socket import logging From 2bb09f8fedd5bd42703d44189ebcd8e0a26d1353 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Tue, 19 Mar 2024 13:34:59 -0500 Subject: [PATCH 04/21] Update test_domain_ip_extractor.py --- tests/test_domain_ip_extractor.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_domain_ip_extractor.py b/tests/test_domain_ip_extractor.py index a55841e40..54906a92b 100644 --- a/tests/test_domain_ip_extractor.py +++ b/tests/test_domain_ip_extractor.py @@ -1,3 +1,11 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + import pytest from capa.capabilities.extract_domain_and_ip import is_ip_addr, is_valid_domain, potential_winapi_function From 61c4ad5b58901a62544e1f01afaa86eeecdbabd0 Mon Sep 17 00:00:00 2001 From: aaronatp Date: Tue, 19 Mar 2024 11:46:11 -0700 Subject: [PATCH 05/21] Adds to changelog and fixes string concatenation style error --- CHANGELOG.md | 1 + capa/capabilities/extract_domain_and_ip.py | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 57e1a60e6..9ba15a939 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### New Features +- extracts and prints web domains/IP addresses and potential WinAPI networking functions @aaronatp ### Breaking Changes diff --git a/capa/capabilities/extract_domain_and_ip.py b/capa/capabilities/extract_domain_and_ip.py index 456ef0587..f81772c6a 100644 --- a/capa/capabilities/extract_domain_and_ip.py +++ b/capa/capabilities/extract_domain_and_ip.py @@ -263,11 +263,9 @@ def networking_functions_statement(doc: ResultDocument, domain_or_ip: str): api_functions = get_domain_or_ip_caller_functions(doc, domain_or_ip) if len(api_functions) == 0: - statement = ( - f"{domain_or_ip} occurs but no functions found that use it.\n" + statement = (f"{domain_or_ip} occurs but no functions found that use it.\n" " If you think this is a mistake, please open an issue on\n" - " the capa GitHub page (https://github.com/mandiant/capa)\n" - ) + " the capa GitHub page (https://github.com/mandiant/capa)\n") return statement elif len(api_functions) == 1: From 9b41074ad4826dae0e8d6bea052f9250139ac112 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Tue, 19 Mar 2024 14:07:10 -0500 Subject: [PATCH 06/21] Help debug why it won't build with PyInstaller 3.11 --- capa/capabilities/domain_ip_helpers.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/capa/capabilities/domain_ip_helpers.py b/capa/capabilities/domain_ip_helpers.py index 4f0af1e9b..7c890438c 100644 --- a/capa/capabilities/domain_ip_helpers.py +++ b/capa/capabilities/domain_ip_helpers.py @@ -30,25 +30,45 @@ def get_file_path(doc: ResultDocument) -> Path: def get_sigpaths_from_doc(doc: ResultDocument): import capa.loader + logger.debug("enter get_sigpaths_from_doc") + if doc.meta.argv: + logger.debug("enter if doc.meta.argv") try: + logger.debug("enter try block") + logger.debug(f"doc.meta.argv == {list(doc.meta.argv)}") if "-s" in list(doc.meta.argv): + logger.debug("enter -s") idx = doc.meta.argv.index("-s") + logger.debug("got -s idx") sigpath = Path(doc.meta.argv[idx + 1]) + logger.debug("got -s sigpath1") if "./" in str(sigpath): + logger.debug("in -s ./") fixed_str = str(sigpath).split("./")[1] + logger.debug("got -s fixed_str") sigpath = Path(fixed_str) + logger.debug("got -s sigpath2") elif "--signatures" in list(doc.meta.argv): + logger.debug("enter --signatures") idx = doc.meta.argv.index("--signatures") + logger.debug("got --signatures idx") sigpath = Path(doc.meta.argv[idx + 1]) + logger.debug("got --signatures sigpath1") if "./" in str(sigpath): + logger.debug("in --signatures ./ block") fixed_str = str(sigpath).split("./")[1] + logger.debug("got --signatures fixed_str") sigpath = Path(fixed_str) + logger.debug("got --signatures sigpath2") else: + logger.debug("enter else block") sigpath = "(embedded)" # type: ignore + logger.debug("got else sigpath") + logger.debug("attempt capa.loader.get_signatures(sigpath)") return capa.loader.get_signatures(sigpath) except AttributeError: From 6236b8f1f5d602cfd8857ab94c549007da67780e Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Tue, 19 Mar 2024 14:08:37 -0500 Subject: [PATCH 07/21] Update capa/capabilities/extract_domain_and_ip.py Co-authored-by: Vasco Schiavo <115561717+VascoSch92@users.noreply.github.com> --- capa/capabilities/extract_domain_and_ip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/capabilities/extract_domain_and_ip.py b/capa/capabilities/extract_domain_and_ip.py index f81772c6a..e666bcb9c 100644 --- a/capa/capabilities/extract_domain_and_ip.py +++ b/capa/capabilities/extract_domain_and_ip.py @@ -55,7 +55,7 @@ def is_valid_domain(string: str) -> bool: "bin", "scr", "exf", - ] # add more to this list + ] # TODO: add more to this list top_level_domain = string.split(".")[-1] for invalid in invalid_list: From f9f6bb44a503cc5f36250e361f49f8d890b28cd8 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Tue, 19 Mar 2024 14:15:43 -0500 Subject: [PATCH 08/21] Update capa/capabilities/extract_domain_and_ip.py Co-authored-by: Vasco Schiavo <115561717+VascoSch92@users.noreply.github.com> --- capa/capabilities/extract_domain_and_ip.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/capa/capabilities/extract_domain_and_ip.py b/capa/capabilities/extract_domain_and_ip.py index e666bcb9c..73d9f5fab 100644 --- a/capa/capabilities/extract_domain_and_ip.py +++ b/capa/capabilities/extract_domain_and_ip.py @@ -295,11 +295,7 @@ def get_domain_or_ip_caller_functions(doc: ResultDocument, domain_or_ip: str) -> returns: List[str]: list of functions that operate on the 'domain_or_ip' string """ - api_functions = [] - for caller_func in yield_caller_funcs(doc, domain_or_ip): - api_functions.append(caller_func) - - return api_functions + return [ caller_func for caller_func in yield_caller_funcs(doc, domain_or_ip)] def yield_caller_funcs(doc: ResultDocument, domain_or_ip: str) -> Generator[str, None, None]: From aa5f542688b867f429930357b18cdf4582b25c22 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Tue, 19 Mar 2024 14:27:09 -0500 Subject: [PATCH 09/21] Update verbose.py --- capa/render/verbose.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/capa/render/verbose.py b/capa/render/verbose.py index 5d6b69e82..838a751d3 100644 --- a/capa/render/verbose.py +++ b/capa/render/verbose.py @@ -350,7 +350,8 @@ def render_domain_and_ip(ostream: rutils.StringIO, doc: rd.ResultDocument): if len(new_line) > max_line: max_line = len(new_line) - white_spaces = " " * ceil(1 / 3 * max_line) + if max_line > 0: + white_spaces = " " * ceil(1 / 3 * max_line) if rows: ostream.write( @@ -365,9 +366,8 @@ def render_domain_and_ip(ostream: rutils.StringIO, doc: rd.ResultDocument): ostream.writeln(rutils.bold("No web domains or IP addresses found")) -def ceil(num): - if isinstance(num, float): - return int(num - 0.5) + 1 +def ceil(num: float) -> int: + return int(num - 0.5) + 1 def render_verbose(doc: rd.ResultDocument): From 2e9c1a981cee4e4d24b52503d09b08eb1a04c3e3 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Tue, 19 Mar 2024 14:27:53 -0500 Subject: [PATCH 10/21] Update vverbose.py --- capa/render/vverbose.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py index 34884edcc..967b8cccb 100644 --- a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -491,7 +491,8 @@ def render_domain_and_ip(ostream: rutils.StringIO, doc: rd.ResultDocument): if len(new_line) > max_line: max_line = len(new_line) - white_spaces = " " * ceil(1 / 3 * max_line) + if max_line > 0: + white_spaces = " " * ceil(1 / 3 * max_line) if rows: ostream.write( @@ -506,9 +507,8 @@ def render_domain_and_ip(ostream: rutils.StringIO, doc: rd.ResultDocument): ostream.writeln(rutils.bold("No web domains or IP addresses found")) -def ceil(num): - if isinstance(num, float): - return int(num - 0.5) + 1 +def ceil(num: float) -> int: + return int(num - 0.5) + 1 def render_vverbose(doc: rd.ResultDocument): From 11585c348836e639ed4f7605f07241360af019a4 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Tue, 19 Mar 2024 14:33:00 -0500 Subject: [PATCH 11/21] Update extract_domain_and_ip.py --- capa/capabilities/extract_domain_and_ip.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/capa/capabilities/extract_domain_and_ip.py b/capa/capabilities/extract_domain_and_ip.py index 73d9f5fab..b1ab0f982 100644 --- a/capa/capabilities/extract_domain_and_ip.py +++ b/capa/capabilities/extract_domain_and_ip.py @@ -280,13 +280,6 @@ def networking_functions_statement(doc: ResultDocument, domain_or_ip: str): return statement - else: - raise LengthError("'api_functions' contains unexpected data!") - - -class LengthError(BaseException): - pass - def get_domain_or_ip_caller_functions(doc: ResultDocument, domain_or_ip: str) -> List[str]: """ @@ -295,7 +288,7 @@ def get_domain_or_ip_caller_functions(doc: ResultDocument, domain_or_ip: str) -> returns: List[str]: list of functions that operate on the 'domain_or_ip' string """ - return [ caller_func for caller_func in yield_caller_funcs(doc, domain_or_ip)] + return [caller_func for caller_func in yield_caller_funcs(doc, domain_or_ip)] def yield_caller_funcs(doc: ResultDocument, domain_or_ip: str) -> Generator[str, None, None]: From 58b0336b5ef191a5bdc96f17d660b1af9215b3c2 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Tue, 19 Mar 2024 14:59:51 -0500 Subject: [PATCH 12/21] Fix PyInstaller 3.11 installation error with get_signatures --- capa/capabilities/domain_ip_helpers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/capa/capabilities/domain_ip_helpers.py b/capa/capabilities/domain_ip_helpers.py index 7c890438c..7527cc8f3 100644 --- a/capa/capabilities/domain_ip_helpers.py +++ b/capa/capabilities/domain_ip_helpers.py @@ -29,6 +29,7 @@ def get_file_path(doc: ResultDocument) -> Path: def get_sigpaths_from_doc(doc: ResultDocument): import capa.loader + from capa.main import get_default_root logger.debug("enter get_sigpaths_from_doc") @@ -65,7 +66,7 @@ def get_sigpaths_from_doc(doc: ResultDocument): else: logger.debug("enter else block") - sigpath = "(embedded)" # type: ignore + sigpath = get_default_root() / "sigs" logger.debug("got else sigpath") logger.debug("attempt capa.loader.get_signatures(sigpath)") From c55169bd07cec1178b9fd5679461963008d36b22 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Tue, 19 Mar 2024 17:33:52 -0500 Subject: [PATCH 13/21] Fix get_extractor_from_doc issue for CAPE --- capa/capabilities/domain_ip_helpers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/capa/capabilities/domain_ip_helpers.py b/capa/capabilities/domain_ip_helpers.py index 7527cc8f3..197ebbae9 100644 --- a/capa/capabilities/domain_ip_helpers.py +++ b/capa/capabilities/domain_ip_helpers.py @@ -21,6 +21,7 @@ BACKEND_DOTNET = "dotnet" BACKEND_BINJA = "binja" BACKEND_PEFILE = "pefile" +BACKEND_CAPE = "cape" def get_file_path(doc: ResultDocument) -> Path: @@ -88,7 +89,7 @@ def get_extractor_from_doc(doc: ResultDocument) -> FeatureExtractor: os = doc.meta.analysis.os _ = get_auto_format(get_file_path(doc)) - if format == FORMAT_CAPE: + if format == BACKEND_CAPE: report = capa.helpers.load_json_from_path(path) return CapeExtractor.from_report(report) elif _ == BACKEND_VIV: From 85dfa5576b351f8467dd0a90bee23e7d77a8d3c5 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Tue, 19 Mar 2024 19:28:53 -0500 Subject: [PATCH 14/21] Fix 'backend' assignment in get_extractor_from_doc --- capa/capabilities/domain_ip_helpers.py | 49 ++++++++++++++++---------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/capa/capabilities/domain_ip_helpers.py b/capa/capabilities/domain_ip_helpers.py index 197ebbae9..0d24d1855 100644 --- a/capa/capabilities/domain_ip_helpers.py +++ b/capa/capabilities/domain_ip_helpers.py @@ -9,11 +9,9 @@ import logging from pathlib import Path -from capa.helpers import get_auto_format -from capa.features.common import FORMAT_CAPE +from capa.features.common import FORMAT_AUTO, FORMAT_CAPE, FORMAT_DOTNET, FORMAT_FREEZE from capa.render.result_document import ResultDocument from capa.features.extractors.base_extractor import FeatureExtractor -from capa.features.extractors.cape.extractor import CapeExtractor logger = logging.getLogger(__name__) @@ -82,30 +80,43 @@ def get_sigpaths_from_doc(doc: ResultDocument): def get_extractor_from_doc(doc: ResultDocument) -> FeatureExtractor: - import capa.loader + from capa.loader import ( + BACKEND_VIV, + BACKEND_CAPE, + BACKEND_DOTNET, + BACKEND_FREEZE, + get_extractor, + ) path = get_file_path(doc) - format = doc.meta.analysis.format os = doc.meta.analysis.os - _ = get_auto_format(get_file_path(doc)) - if format == BACKEND_CAPE: - report = capa.helpers.load_json_from_path(path) - return CapeExtractor.from_report(report) - elif _ == BACKEND_VIV: + args = doc.meta.argv + for i in range(len(args)): + if args[i] == any(['-f', '--format']): + format = args[i + 1] + else: + format = FORMAT_AUTO + + for i in range(len(args)): + if args[i] == any(['-b', '--backend']): + backend = args[i + 1] + break + elif format == FORMAT_CAPE: + backend = BACKEND_CAPE + elif format == FORMAT_DOTNET: + backend = BACKEND_DOTNET + elif format == FORMAT_FREEZE: + backend = BACKEND_FREEZE + else: + backend = '' + + if backend == '': backend = BACKEND_VIV - elif _ == BACKEND_PEFILE: - backend = BACKEND_PEFILE - elif _ == BACKEND_BINJA: - backend = BACKEND_BINJA - elif _ == BACKEND_DOTNET: - backend = BACKEND_DOTNET - else: - backend = BACKEND_VIV # according to main.py this is the default sigpath = get_sigpaths_from_doc(doc) - return capa.loader.get_extractor( + return get_extractor( input_path=path, input_format=format, os_=os, From 5afd175e893d46859e9e07be713bf7ee4f334013 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Wed, 20 Mar 2024 13:42:56 -0500 Subject: [PATCH 15/21] Fix 'format' in get_extractor_from_doc --- capa/capabilities/domain_ip_helpers.py | 78 ++++++++++++-------------- 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/capa/capabilities/domain_ip_helpers.py b/capa/capabilities/domain_ip_helpers.py index 0d24d1855..18245d121 100644 --- a/capa/capabilities/domain_ip_helpers.py +++ b/capa/capabilities/domain_ip_helpers.py @@ -9,18 +9,19 @@ import logging from pathlib import Path -from capa.features.common import FORMAT_AUTO, FORMAT_CAPE, FORMAT_DOTNET, FORMAT_FREEZE +from capa.helpers import get_auto_format +from capa.exceptions import UnsupportedFormatError +from capa.features.common import ( + FORMAT_CAPE, + FORMAT_DOTNET, + FORMAT_FREEZE, + FORMAT_UNKNOWN, +) from capa.render.result_document import ResultDocument from capa.features.extractors.base_extractor import FeatureExtractor logger = logging.getLogger(__name__) -BACKEND_VIV = "vivisect" -BACKEND_DOTNET = "dotnet" -BACKEND_BINJA = "binja" -BACKEND_PEFILE = "pefile" -BACKEND_CAPE = "cape" - def get_file_path(doc: ResultDocument) -> Path: return Path(doc.meta.sample.path) @@ -30,88 +31,75 @@ def get_sigpaths_from_doc(doc: ResultDocument): import capa.loader from capa.main import get_default_root - logger.debug("enter get_sigpaths_from_doc") - if doc.meta.argv: - logger.debug("enter if doc.meta.argv") try: - logger.debug("enter try block") - logger.debug(f"doc.meta.argv == {list(doc.meta.argv)}") if "-s" in list(doc.meta.argv): - logger.debug("enter -s") idx = doc.meta.argv.index("-s") - logger.debug("got -s idx") sigpath = Path(doc.meta.argv[idx + 1]) - logger.debug("got -s sigpath1") if "./" in str(sigpath): - logger.debug("in -s ./") fixed_str = str(sigpath).split("./")[1] - logger.debug("got -s fixed_str") sigpath = Path(fixed_str) - logger.debug("got -s sigpath2") elif "--signatures" in list(doc.meta.argv): - logger.debug("enter --signatures") idx = doc.meta.argv.index("--signatures") - logger.debug("got --signatures idx") sigpath = Path(doc.meta.argv[idx + 1]) - logger.debug("got --signatures sigpath1") if "./" in str(sigpath): - logger.debug("in --signatures ./ block") fixed_str = str(sigpath).split("./")[1] - logger.debug("got --signatures fixed_str") sigpath = Path(fixed_str) - logger.debug("got --signatures sigpath2") else: - logger.debug("enter else block") sigpath = get_default_root() / "sigs" - logger.debug("got else sigpath") - logger.debug("attempt capa.loader.get_signatures(sigpath)") return capa.loader.get_signatures(sigpath) except AttributeError: raise NotImplementedError("Confirm that argv is an attribute of doc.meta") else: - print("in 'get_sigpaths_from_doc', run in debug (-d) mode") - logger.debug("'doc.meta' has not attribute 'argv', this is probably a bad sign...") + logger.debug("'doc.meta' has not attribute 'argv'") def get_extractor_from_doc(doc: ResultDocument) -> FeatureExtractor: - from capa.loader import ( - BACKEND_VIV, - BACKEND_CAPE, - BACKEND_DOTNET, - BACKEND_FREEZE, - get_extractor, - ) + # import here to avoid circular import + from capa.loader import BACKEND_VIV, BACKEND_CAPE, BACKEND_DOTNET, BACKEND_FREEZE, get_extractor path = get_file_path(doc) os = doc.meta.analysis.os - args = doc.meta.argv + if doc.meta.argv: + args = tuple(doc.meta.argv) + else: + CommandLineArgumentsError("Couldn't find command line arguments!") + for i in range(len(args)): - if args[i] == any(['-f', '--format']): + if args[i] == any(["-f", "--format"]): format = args[i + 1] + break else: - format = FORMAT_AUTO + format = "" + + if format == "": + format = get_auto_format(path) + if format == FORMAT_UNKNOWN: + raise UnsupportedFormatError(f"Couldn't get format for {path.name}") for i in range(len(args)): - if args[i] == any(['-b', '--backend']): + if args[i] == any(["-b", "--backend"]): backend = args[i + 1] break elif format == FORMAT_CAPE: backend = BACKEND_CAPE + break elif format == FORMAT_DOTNET: backend = BACKEND_DOTNET + break elif format == FORMAT_FREEZE: backend = BACKEND_FREEZE + break else: - backend = '' - - if backend == '': + backend = "" + + if backend == "": backend = BACKEND_VIV sigpath = get_sigpaths_from_doc(doc) @@ -123,3 +111,7 @@ def get_extractor_from_doc(doc: ResultDocument) -> FeatureExtractor: backend=backend, sigpaths=sigpath, ) + + +class CommandLineArgumentsError(BaseException): + pass From 681e6c3fdc1b3f991af48143c88a2658f1e87b0e Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Wed, 20 Mar 2024 13:50:08 -0500 Subject: [PATCH 16/21] Reformat imports --- capa/capabilities/domain_ip_helpers.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/capa/capabilities/domain_ip_helpers.py b/capa/capabilities/domain_ip_helpers.py index 18245d121..c1cd12701 100644 --- a/capa/capabilities/domain_ip_helpers.py +++ b/capa/capabilities/domain_ip_helpers.py @@ -11,12 +11,7 @@ from capa.helpers import get_auto_format from capa.exceptions import UnsupportedFormatError -from capa.features.common import ( - FORMAT_CAPE, - FORMAT_DOTNET, - FORMAT_FREEZE, - FORMAT_UNKNOWN, -) +from capa.features.common import FORMAT_CAPE, FORMAT_DOTNET, FORMAT_FREEZE, FORMAT_UNKNOWN from capa.render.result_document import ResultDocument from capa.features.extractors.base_extractor import FeatureExtractor From fb3ed8a41c9918b37014048683cea5e4f53c6e41 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Wed, 20 Mar 2024 13:54:25 -0500 Subject: [PATCH 17/21] Reformat multi-line string --- capa/capabilities/extract_domain_and_ip.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/capa/capabilities/extract_domain_and_ip.py b/capa/capabilities/extract_domain_and_ip.py index b1ab0f982..6771e9e38 100644 --- a/capa/capabilities/extract_domain_and_ip.py +++ b/capa/capabilities/extract_domain_and_ip.py @@ -263,9 +263,11 @@ def networking_functions_statement(doc: ResultDocument, domain_or_ip: str): api_functions = get_domain_or_ip_caller_functions(doc, domain_or_ip) if len(api_functions) == 0: - statement = (f"{domain_or_ip} occurs but no functions found that use it.\n" + statement = ( + f"{domain_or_ip} occurs but no functions found that use it.\n" " If you think this is a mistake, please open an issue on\n" - " the capa GitHub page (https://github.com/mandiant/capa)\n") + " the capa GitHub page (https://github.com/mandiant/capa)\n" + ) return statement elif len(api_functions) == 1: From 0443caf31cb87ee3f155d47b8f65dd71ae22af4b Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Wed, 20 Mar 2024 14:07:57 -0500 Subject: [PATCH 18/21] Correct Flake8 errors --- capa/capabilities/extract_domain_and_ip.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/capa/capabilities/extract_domain_and_ip.py b/capa/capabilities/extract_domain_and_ip.py index 6771e9e38..cc17add68 100644 --- a/capa/capabilities/extract_domain_and_ip.py +++ b/capa/capabilities/extract_domain_and_ip.py @@ -55,10 +55,10 @@ def is_valid_domain(string: str) -> bool: "bin", "scr", "exf", - ] # TODO: add more to this list + ] # TODO (aaronatp): add more to this list # noqa: T003 top_level_domain = string.split(".")[-1] - for invalid in invalid_list: + for invalid in invalid_list: # noqa: SIM111 if top_level_domain == invalid: return False @@ -264,8 +264,8 @@ def networking_functions_statement(doc: ResultDocument, domain_or_ip: str): if len(api_functions) == 0: statement = ( - f"{domain_or_ip} occurs but no functions found that use it.\n" - " If you think this is a mistake, please open an issue on\n" + f"{domain_or_ip} occurs but no functions found that use it.\n" # noqa: NIC002 + " If you think this is a mistake, please open an issue on\n" # noqa: NIC002 " the capa GitHub page (https://github.com/mandiant/capa)\n" ) return statement @@ -282,6 +282,13 @@ def networking_functions_statement(doc: ResultDocument, domain_or_ip: str): return statement + else: + raise LengthError("'api_functions' contains unexpected data!") + + +class LengthError(BaseException): + pass + def get_domain_or_ip_caller_functions(doc: ResultDocument, domain_or_ip: str) -> List[str]: """ @@ -290,7 +297,7 @@ def get_domain_or_ip_caller_functions(doc: ResultDocument, domain_or_ip: str) -> returns: List[str]: list of functions that operate on the 'domain_or_ip' string """ - return [caller_func for caller_func in yield_caller_funcs(doc, domain_or_ip)] + return list(yield_caller_funcs(doc, domain_or_ip)) def yield_caller_funcs(doc: ResultDocument, domain_or_ip: str) -> Generator[str, None, None]: From a000461daad5bd94e5c65029e13f4a8a38ed1c0e Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Fri, 22 Mar 2024 03:01:40 -0500 Subject: [PATCH 19/21] Update domain_ip_helpers.py --- capa/capabilities/domain_ip_helpers.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/capa/capabilities/domain_ip_helpers.py b/capa/capabilities/domain_ip_helpers.py index c1cd12701..d4d17abba 100644 --- a/capa/capabilities/domain_ip_helpers.py +++ b/capa/capabilities/domain_ip_helpers.py @@ -99,6 +99,12 @@ def get_extractor_from_doc(doc: ResultDocument) -> FeatureExtractor: sigpath = get_sigpaths_from_doc(doc) + import capa.helpers + + logger.debug(f"running standable == {capa.helpers.is_running_standalone}") + + raise QuickExitError() + return get_extractor( input_path=path, input_format=format, @@ -110,3 +116,6 @@ def get_extractor_from_doc(doc: ResultDocument) -> FeatureExtractor: class CommandLineArgumentsError(BaseException): pass + +class QuickExitError(BaseException): + pass From 93943e54dedcd60a94093c234cfa5434617b93c2 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Fri, 22 Mar 2024 03:06:48 -0500 Subject: [PATCH 20/21] Update domain_ip_helpers.py --- capa/capabilities/domain_ip_helpers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/capa/capabilities/domain_ip_helpers.py b/capa/capabilities/domain_ip_helpers.py index d4d17abba..05dadae8b 100644 --- a/capa/capabilities/domain_ip_helpers.py +++ b/capa/capabilities/domain_ip_helpers.py @@ -101,7 +101,7 @@ def get_extractor_from_doc(doc: ResultDocument) -> FeatureExtractor: import capa.helpers - logger.debug(f"running standable == {capa.helpers.is_running_standalone}") + logger.debug(f"running standable == {capa.helpers.is_running_standalone()}") raise QuickExitError() @@ -117,5 +117,6 @@ def get_extractor_from_doc(doc: ResultDocument) -> FeatureExtractor: class CommandLineArgumentsError(BaseException): pass + class QuickExitError(BaseException): pass From 32b47787e877985140526d066ceb19431992d6a9 Mon Sep 17 00:00:00 2001 From: aaronatp <58194911+aaronatp@users.noreply.github.com> Date: Fri, 22 Mar 2024 12:32:58 -0500 Subject: [PATCH 21/21] Update capa/render/verbose.py Co-authored-by: Moritz --- capa/render/verbose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/render/verbose.py b/capa/render/verbose.py index 838a751d3..12fd4d00f 100644 --- a/capa/render/verbose.py +++ b/capa/render/verbose.py @@ -332,7 +332,7 @@ def render_domain_and_ip(ostream: rutils.StringIO, doc: rd.ResultDocument): | |----HttpOpenRequestA | | |----FtpGetFileA | | |----3 occurrances | - | | | + | | | 192.123.232.08 | | |----Functions used to communicate with 192.123.232.08:| | |----... |