diff --git a/clamscan/clamscan.c b/clamscan/clamscan.c index 2b10a1023b..3c35d6aa0c 100644 --- a/clamscan/clamscan.c +++ b/clamscan/clamscan.c @@ -295,6 +295,7 @@ void help(void) mprintf(LOGG_INFO, " --phishing-sigs[=yes(*)/no] Enable email signature-based phishing detection\n"); mprintf(LOGG_INFO, " --phishing-scan-urls[=yes(*)/no] Enable URL signature-based phishing detection\n"); mprintf(LOGG_INFO, " --heuristic-alerts[=yes(*)/no] Heuristic alerts\n"); + mprintf(LOGG_INFO, " --store-html-urls[=yes(*)/no] Store html URLs in metadata\n"); mprintf(LOGG_INFO, " --heuristic-scan-precedence[=yes/no(*)] Stop scanning as soon as a heuristic match is found\n"); mprintf(LOGG_INFO, " --normalize[=yes(*)/no] Normalize html, script, and text files. Use normalize=no for yara compatibility\n"); mprintf(LOGG_INFO, " --scan-pe[=yes(*)/no] Scan PE files\n"); diff --git a/clamscan/manager.c b/clamscan/manager.c index db3a8f46b6..8c75e75010 100644 --- a/clamscan/manager.c +++ b/clamscan/manager.c @@ -1557,6 +1557,10 @@ int scanmanager(const struct optstruct *opts) options.general |= CL_SCAN_GENERAL_HEURISTICS; } + if (optget(opts, "store-html-urls")->enabled) { + options.general |= CL_SCAN_STORE_HTML_URLS; + } + /* TODO: Remove deprecated option in a future feature release */ if ((optget(opts, "block-max")->enabled) || (optget(opts, "alert-exceeds-max")->enabled)) { diff --git a/common/optparser.c b/common/optparser.c index dd99f43eb2..8caf869c99 100644 --- a/common/optparser.c +++ b/common/optparser.c @@ -389,6 +389,7 @@ const struct clam_option __clam_options[] = { {"PhishingScanURLs", "phishing-scan-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Scan URLs found in mails for phishing attempts using heuristics.", "yes"}, {"HeuristicAlerts", "heuristic-alerts", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "In some cases (eg. 
complex malware, exploits in graphic files, and others),\nClamAV uses special algorithms to provide accurate detection. This option\ncontrols the algorithmic detection.", "yes"}, + {"StoreHTMLUrls", "store-html-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML
count + 1; + char ** tmp = NULL; + + /* + * Do NOT use cli_max_realloc_or_free because all the previously malloc'd tag + * values will be leaked when tag is free'd in the case where realloc fails. + */ + tmp = cli_max_realloc(tags->urls, cnt * sizeof(unsigned char *)); + if (!tmp) { + goto done; + } + tags->urls = tmp; + + tags->urls[tags->count] = cli_safer_strdup(value); + if (tags->urls[tags->count]) { + tags->count = cnt; + } + + bRet = true; +done: + if (!bRet){ + memset(tags, 0, sizeof(*tags)); + } + + return bRet; +}
+
+/**
+ * @brief Release every URL string held by a form_data_t, and the array itself.
+ *
+ * The structure is left empty (urls == NULL, count == 0), so calling this
+ * function again on the same structure is harmless.
+ *
+ * @param tags  The form data to release. May be NULL.
+ */
+void html_form_data_tag_free(form_data_t *tags)
+{
+    size_t i;
+
+    if (NULL == tags) {
+        return;
+    }
+
+    if (NULL != tags->urls) {
+        for (i = 0; i < tags->count; i++) {
+            CLI_FREE_AND_SET_NULL(tags->urls[i]);
+        }
+        CLI_FREE_AND_SET_NULL(tags->urls);
+    }
+
+    /* Keep count in step with the (now NULL) array. */
+    tags->count = 0;
+}
+
+static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t * form_data) { int fd_tmp, tag_length = 0, tag_arg_length = 0; bool binary, retval = false, escape = false, hex = false; @@ -659,7 +696,7 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha FILE *stream_in = NULL; html_state state = HTML_NORM, next_state = HTML_BAD_STATE, saved_next_state = HTML_BAD_STATE; char filename[1024], tag[HTML_STR_LENGTH + 1], tag_arg[HTML_STR_LENGTH + 1]; - char tag_val[HTML_STR_LENGTH + 1], *tmp_file, *arg_value; + char tag_val[HTML_STR_LENGTH + 1], *tmp_file = NULL, *arg_value = NULL; unsigned char *line = NULL, *ptr, *ptr_screnc = NULL; tag_arguments_t tag_args; quoted_state quoted = NOT_QUOTED; @@ -1224,8 +1261,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha href_contents_begin = ptr; } if (strcmp(tag, "/form") == 0) { - if (in_form_action) + if (in_form_action) { free(in_form_action); + } in_form_action = NULL; } } else if (strcmp(tag, "script") == 0) { @@ -1310,9 +1348,13 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha } else if (strcmp(tag, "form") == 0 && 
hrefs->scanContents) { const char *arg_action_value = html_tag_arg_value(&tag_args, "action"); if (arg_action_value) { - if (in_form_action) + if (in_form_action) { free(in_form_action); + } in_form_action = (unsigned char *)cli_safer_strdup(arg_action_value); + if (form_data){ + html_insert_form_data((const char * const) in_form_action, form_data); + } } } else if (strcmp(tag, "img") == 0) { arg_value = html_tag_arg_value(&tag_args, "src"); @@ -1917,8 +1959,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha done: if (line) /* only needed for done case */ free(line); - if (in_form_action) + if (in_form_action) { free(in_form_action); + } if (in_ahref) /* tag not closed, force closing */ html_tag_contents_done(hrefs, in_ahref, &contents); @@ -1960,6 +2003,11 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha } bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf) +{ + return html_normalise_mem_form_data(ctx, in_buff, in_size, dirname, hrefs, dconf, NULL); +} + +bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t * form_data) { m_area_t m_area; @@ -1968,10 +2016,15 @@ bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, con m_area.offset = 0; m_area.map = NULL; - return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf); + return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data); } bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf) +{ + return html_normalise_map_form_data(ctx, map, dirname, hrefs, dconf, NULL); +} + +bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, 
form_data_t * form_data) { bool retval = false; m_area_t m_area; @@ -1979,7 +2032,7 @@ bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_argu m_area.length = map->len; m_area.offset = 0; m_area.map = map; - retval = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf); + retval = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data); return retval; } diff --git a/libclamav/htmlnorm.h b/libclamav/htmlnorm.h index 72524165a6..abd8fb1004 100644 --- a/libclamav/htmlnorm.h +++ b/libclamav/htmlnorm.h @@ -45,10 +45,19 @@ typedef struct m_area_tag { fmap_t *map; } m_area_t; +typedef struct form_data_tag { + char ** urls; + size_t count; +} form_data_t; + bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf); +bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t * form_data); bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf); +bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t * form_data); void html_tag_arg_free(tag_arguments_t *tags); bool html_screnc_decode(fmap_t *map, const char *dirname); void html_tag_arg_add(tag_arguments_t *tags, const char *tag, char *value); +void html_form_data_tag_free(form_data_t *tags); + #endif diff --git a/libclamav/others.h b/libclamav/others.h index 8cebf78d35..fdff4283ca 100644 --- a/libclamav/others.h +++ b/libclamav/others.h @@ -552,6 +552,7 @@ extern LIBCLAMAV_EXPORT int have_rar; #define SCAN_HEURISTICS (ctx->options->general & CL_SCAN_GENERAL_HEURISTICS) #define SCAN_HEURISTIC_PRECEDENCE (ctx->options->general & CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE) #define SCAN_UNPRIVILEGED 
(ctx->options->general & CL_SCAN_GENERAL_UNPRIVILEGED) +#define STORE_HTML_URLS (ctx->options->general & CL_SCAN_STORE_HTML_URLS) #define SCAN_PARSE_ARCHIVE (ctx->options->parse & CL_SCAN_PARSE_ARCHIVE) #define SCAN_PARSE_ELF (ctx->options->parse & CL_SCAN_PARSE_ELF) diff --git a/libclamav/scanners.c b/libclamav/scanners.c index 8cc19297af..150b313ff5 100644 --- a/libclamav/scanners.c +++ b/libclamav/scanners.c @@ -2082,6 +2082,481 @@ static cl_error_t cli_ole2_tempdir_scan_for_xlm_and_images(const char *dir, cli_ return ret; } +const char *const HTML_URLS_JSON_KEY = "HTMLUrls"; +/* https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml */ +/* clang-format off */ +const char * URI_LIST[] = { + "aaa://" + , "aaas://" + , "about://" + , "acap://" + , "acct://" + , "acd://" + , "acr://" + , "adiumxtra://" + , "adt://" + , "afp://" + , "afs://" + , "aim://" + , "amss://" + , "android://" + , "appdata://" + , "apt://" + , "ar://" + , "ark://" + , "at://" + , "attachment://" + , "aw://" + , "barion://" + , "bb://" + , "beshare://" + , "bitcoin://" + , "bitcoincash://" + , "blob://" + , "bolo://" + , "brid://" + , "browserext://" + , "cabal://" + , "calculator://" + , "callto://" + , "cap://" + , "cast://" + , "casts://" + , "chrome://" + , "chrome-extension://" + , "cid://" + , "coap://" + , "coap+tcp://" + , "coap+ws://" + , "coaps://" + , "coaps+tcp://" + , "coaps+ws://" + , "com-eventbrite-attendee://" + , "content://" + , "content-type://" + , "crid://" + , "cstr://" + , "cvs://" + , "dab://" + , "dat://" + , "data://" + , "dav://" + , "dhttp://" + , "diaspora://" + , "dict://" + , "did://" + , "dis://" + , "dlna-playcontainer://" + , "dlna-playsingle://" + , "dns://" + , "dntp://" + , "doi://" + , "dpp://" + , "drm://" + , "drop://" + , "dtmi://" + , "dtn://" + , "dvb://" + , "dvx://" + , "dweb://" + , "ed2k://" + , "eid://" + , "elsi://" + , "embedded://" + , "ens://" + , "ethereum://" + , "example://" + , "facetime://" + , "fax://" + , "feed://" + , 
"feedready://" + , "fido://" + , "file://" + , "filesystem://" + , "finger://" + , "first-run-pen-experience://" + , "fish://" + , "fm://" + , "ftp://" + , "fuchsia-pkg://" + , "geo://" + , "gg://" + , "git://" + , "gitoid://" + , "gizmoproject://" + , "go://" + , "gopher://" + , "graph://" + , "grd://" + , "gtalk://" + , "h323://" + , "ham://" + , "hcap://" + , "hcp://" + , "hs20://" + , "http://" + , "https://" + , "hxxp://" + , "hxxps://" + , "hydrazone://" + , "hyper://" + , "iax://" + , "icap://" + , "icon://" + , "im://" + , "imap://" + , "info://" + , "iotdisco://" + , "ipfs://" + , "ipn://" + , "ipns://" + , "ipp://" + , "ipps://" + , "irc://" + , "irc6://" + , "ircs://" + , "iris://" + , "iris.beep://" + , "iris.lwz://" + , "iris.xpc://" + , "iris.xpcs://" + , "isostore://" + , "itms://" + , "jabber://" + , "jar://" + , "jms://" + , "keyparc://" + , "lastfm://" + , "lbry://" + , "ldap://" + , "ldaps://" + , "leaptofrogans://" + , "lid://" + , "lorawan://" + , "lpa://" + , "lvlt://" + , "machineProvisioningProgressReporter://" + , "magnet://" + , "mailserver://" + , "mailto://" + , "maps://" + , "market://" + , "matrix://" + , "message://" + , "microsoft.windows.camera://" + , "microsoft.windows.camera.multipicker://" + , "microsoft.windows.camera.picker://" + , "mid://" + , "mms://" + , "modem://" + , "mongodb://" + , "moz://" + , "ms-access://" + , "ms-appinstaller://" + , "ms-browser-extension://" + , "ms-calculator://" + , "ms-drive-to://" + , "ms-enrollment://" + , "ms-excel://" + , "ms-eyecontrolspeech://" + , "ms-gamebarservices://" + , "ms-gamingoverlay://" + , "ms-getoffice://" + , "ms-help://" + , "ms-infopath://" + , "ms-inputapp://" + , "ms-launchremotedesktop://" + , "ms-lockscreencomponent-config://" + , "ms-media-stream-id://" + , "ms-meetnow://" + , "ms-mixedrealitycapture://" + , "ms-mobileplans://" + , "ms-newsandinterests://" + , "ms-officeapp://" + , "ms-people://" + , "ms-project://" + , "ms-powerpoint://" + , "ms-publisher://" + , 
"ms-recall://" + , "ms-remotedesktop://" + , "ms-remotedesktop-launch://" + , "ms-restoretabcompanion://" + , "ms-screenclip://" + , "ms-screensketch://" + , "ms-search://" + , "ms-search-repair://" + , "ms-secondary-screen-controller://" + , "ms-secondary-screen-setup://" + , "ms-settings://" + , "ms-settings-airplanemode://" + , "ms-settings-bluetooth://" + , "ms-settings-camera://" + , "ms-settings-cellular://" + , "ms-settings-cloudstorage://" + , "ms-settings-connectabledevices://" + , "ms-settings-displays-topology://" + , "ms-settings-emailandaccounts://" + , "ms-settings-language://" + , "ms-settings-location://" + , "ms-settings-lock://" + , "ms-settings-nfctransactions://" + , "ms-settings-notifications://" + , "ms-settings-power://" + , "ms-settings-privacy://" + , "ms-settings-proximity://" + , "ms-settings-screenrotation://" + , "ms-settings-wifi://" + , "ms-settings-workplace://" + , "ms-spd://" + , "ms-stickers://" + , "ms-sttoverlay://" + , "ms-transit-to://" + , "ms-useractivityset://" + , "ms-virtualtouchpad://" + , "ms-visio://" + , "ms-walk-to://" + , "ms-whiteboard://" + , "ms-whiteboard-cmd://" + , "ms-word://" + , "msnim://" + , "msrp://" + , "msrps://" + , "mss://" + , "mt://" + , "mtqp://" + , "mumble://" + , "mupdate://" + , "mvn://" + , "mvrp://" + , "mvrps://" + , "news://" + , "nfs://" + , "ni://" + , "nih://" + , "nntp://" + , "notes://" + , "num://" + , "ocf://" + , "oid://" + , "onenote://" + , "onenote-cmd://" + , "opaquelocktoken://" + , "openid://" + , "openpgp4fpr://" + , "otpauth://" + , "p1://" + , "pack://" + , "palm://" + , "paparazzi://" + , "payment://" + , "payto://" + , "pkcs11://" + , "platform://" + , "pop://" + , "pres://" + , "prospero://" + , "proxy://" + , "pwid://" + , "psyc://" + , "pttp://" + , "qb://" + , "query://" + , "quic-transport://" + , "redis://" + , "rediss://" + , "reload://" + , "res://" + , "resource://" + , "rmi://" + , "rsync://" + , "rtmfp://" + , "rtmp://" + , "rtsp://" + , "rtsps://" + , 
"rtspu://" + , "sarif://" + , "secondlife://" + , "secret-token://" + , "service://" + , "session://" + , "sftp://" + , "sgn://" + , "shc://" + , "shttp://" + , "sieve://" + , "simpleledger://" + , "simplex://" + , "sip://" + , "sips://" + , "skype://" + , "smb://" + , "smp://" + , "sms://" + , "smtp://" + , "snews://" + , "snmp://" + , "soap.beep://" + , "soap.beeps://" + , "soldat://" + , "spiffe://" + , "spotify://" + , "ssb://" + , "ssh://" + , "starknet://" + , "steam://" + , "stun://" + , "stuns://" + , "submit://" + , "svn://" + , "swh://" + , "swid://" + , "swidpath://" + , "tag://" + , "taler://" + , "teamspeak://" + , "tel://" + , "teliaeid://" + , "telnet://" + , "tftp://" + , "things://" + , "thismessage://" + , "tip://" + , "tn3270://" + , "tool://" + , "turn://" + , "turns://" + , "tv://" + , "udp://" + , "unreal://" + , "upt://" + , "urn://" + , "ut2004://" + , "uuid-in-package://" + , "v-event://" + , "vemmi://" + , "ventrilo://" + , "ves://" + , "videotex://" + , "vnc://" + , "view-source://" + , "vscode://" + , "vscode-insiders://" + , "vsls://" + , "w3://" + , "wais://" + , "web3://" + , "wcr://" + , "webcal://" + , "web+ap://" + , "wifi://" + , "wpid://" + , "ws://" + , "wss://" + , "wtai://" + , "wyciwyg://" + , "xcon://" + , "xcon-userid://" + , "xfire://" + , "xmlrpc.beep://" + , "xmlrpc.beeps://" + , "xmpp://" + , "xftp://" + , "xrcp://" + , "xri://" + , "ymsgr://" + , "z39.50://" + , "z39.50r://" + , "z39.50s://" +}; +/* clang-format on */ + +static bool is_url(const char *const str) +{ + +#define MATCH(str, prefix) \ + do { \ + if (str && (strlen(str) > strlen(prefix)) && (0 == strncasecmp(str, prefix, strlen(prefix)))) { \ + bRet = true; \ + goto done; \ + } \ + } while (0); + + bool bRet = false; + size_t i; + + for (i = 0; i < sizeof(URI_LIST) / sizeof(URI_LIST[0]); i++) { + MATCH(str, URI_LIST[i]); + } +done: + return bRet; +#undef MATCH +} +static void save_urls(cli_ctx *ctx, tag_arguments_t *hrefs, form_data_t * form_data) +{ + int i 
= 0; + json_object *ary = NULL; + + if (NULL == hrefs) { + return; + } + + if (ctx->wrkproperty != ctx->properties) { + return; + } + + if (!(STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL))) { + return; + } + + /*Add hrefs*/ + for (i = 0; i < hrefs->count; i++) { + if (is_url((const char *)hrefs->value[i])) { + if (NULL == ary){ + ary = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY); + if (!ary){ + cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY); + return; + } + } + cli_jsonstr(ary, NULL, (const char *)hrefs->value[i]); + } + } + + /*Add form_data*/ + for (i = 0; i < (int) form_data->count; i++) { + if (is_url((const char *)form_data->urls[i])) { + if (NULL == ary){ + ary = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY); + if (!ary){ + cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY); + return; + } + } + cli_jsonstr(ary, NULL, (const char *)form_data->urls[i]); + } + } + + +#if 0 + if (!bAdded) { + return; + } + + json_object *ary = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY); + if (ary) { + for (i = 0; i < hrefs->count; i++) { + if (is_url((const char *)hrefs->value[i])) { + cli_jsonstr(ary, NULL, (const char *)hrefs->value[i]); + } + } + } else { + cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY); + } +#endif + + +} + static cl_error_t cli_scanhtml(cli_ctx *ctx) { cl_error_t status = CL_SUCCESS; @@ -2113,7 +2588,18 @@ static cl_error_t cli_scanhtml(cli_ctx *ctx) cli_dbgmsg("cli_scanhtml: using tempdir %s\n", tempname); - (void)html_normalise_map(ctx, map, tempname, NULL, ctx->dconf); + /* Output JSON Summary Information */ + if (STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL)) { + tag_arguments_t hrefs = {0}; + hrefs.scanContents = 1; + form_data_t form_data = {0}; + (void)html_normalise_map_form_data(ctx, map, tempname, &hrefs, ctx->dconf, &form_data); + save_urls(ctx, &hrefs, 
&form_data); + html_tag_arg_free(&hrefs); + html_form_data_tag_free(&form_data); + } else { + (void)html_normalise_map(ctx, map, tempname, NULL, ctx->dconf); + } snprintf(fullname, 1024, "%s" PATHSEP "nocomment.html", tempname); fd = open(fullname, O_RDONLY | O_BINARY); @@ -4212,9 +4698,9 @@ static inline bool result_should_goto_done(cli_ctx *ctx, cl_error_t result_in, c cl_error_t cli_magic_scan(cli_ctx *ctx, cli_file_t type) { - cl_error_t ret = CL_CLEAN; + cl_error_t ret = CL_CLEAN; cl_error_t cache_check_result = CL_VIRUS; - bool cache_enabled = true; + bool cache_enabled = true; cl_error_t verdict_at_this_level; cli_file_t dettype = 0; uint8_t typercg = 1;
diff --git a/unit_tests/clamscan/save_html_urls_test.py b/unit_tests/clamscan/save_html_urls_test.py
new file mode 100644
index 0000000000..ed29936072
--- /dev/null
+++ b/unit_tests/clamscan/save_html_urls_test.py
@@ -0,0 +1,84 @@
+# Copyright (C) 2020-2024 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
+
+"""
+Run clamscan tests.
+"""
+
+import sys
+import os
+import re
+import shutil
+
+sys.path.append('../unit_tests')
+import testcase
+
+
+class TC(testcase.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        super(TC, cls).setUpClass()
+
+    @classmethod
+    def tearDownClass(cls):
+        super(TC, cls).tearDownClass()
+
+    def setUp(self):
+        super(TC, self).setUp()
+
+    def tearDown(self):
+        super(TC, self).tearDown()
+
+        # Remove scan temps directory between tests
+        if (self.path_tmp / "TD").exists():
+            shutil.rmtree(self.path_tmp / "TD")
+
+        self.verify_valgrind_log()
+
+    # Find the metadata.json file and verify its contents.
+    #TODO: REMOVE THIS WHEN https://github.com/Cisco-Talos/clamav/pull/1295 is merged
+    def verify_metadata_json_TEMPORARY(self, tempdir, expected=None, unexpected=None):
+        """Verify the first metadata.json found under tempdir.
+
+        expected / unexpected are lists of strings handed to verify_output().
+        Fails if no metadata.json exists, so a scan that wrote no metadata
+        cannot pass unnoticed.
+        """
+        for parent, dirs, files in os.walk(tempdir):
+            if "metadata.json" not in files:
+                continue
+
+            with open(os.path.join(parent, "metadata.json")) as handle:
+                metadata_json = handle.read()
+            self.verify_output(metadata_json, expected=expected or [], unexpected=unexpected or [])
+
+            # There is only one metadata.json per scan.
+            # We found it, so stop walking the directory tree.
+            return
+
+        assert False, 'metadata.json not found under {}'.format(tempdir)
+
+    def test_save_links(self):
+        """Scan index.html with --gen-json and check its URLs land in metadata.json."""
+        self.step_name('Extract Links')
+
+        tempdir = self.path_tmp / "TD"
+        if not os.path.isdir(tempdir):
+            os.makedirs(tempdir)
+
+        testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'html' / 'index.html'
+        command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'.format(
+            valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan,
+            path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb',
+            tempdir=tempdir,
+            testfile=testfile,
+        )
+        output = self.execute_command(command)
+
+        assert output.ec == 0  # clean
+
+        expected_strings = [
+            'HTMLUrls',
+            '"https://www.clamav.net/reports/malware"',
+            '"http://www.google.com"',
+        ]
+        self.verify_metadata_json_TEMPORARY(tempdir, expected_strings)
diff --git a/unit_tests/input/other_scanfiles/html/index.html b/unit_tests/input/other_scanfiles/html/index.html new file mode 100644 index 0000000000..1ca1956380 --- /dev/null +++ b/unit_tests/input/other_scanfiles/html/index.html @@ -0,0 +1,16 @@ + + + + +

Save Links Unittest

+

Paragraph

+Report Malware + + + +
+ + + + +