From 45c0e98106e2473606fbdc6c3d7644c5578f37ea Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Thu, 19 Sep 2024 17:57:37 +0200 Subject: [PATCH] Adds detection for Oh Dear and improves detection for vuhuvBot and Bravebot (#7837) * Change category for Bravebot * Improves detection for vuhuvBot * Adds detection for Oh Dear ref #7836 --- Tests/fixtures/bots.yml | 41 +++++++++++++++++++++++++++++++++++++---- regexes/bots.yml | 18 +++++++++++++----- 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index f4cbb482e4..5200b9e578 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -5812,9 +5812,15 @@ - user_agent: Mozilla/5.0 (compatible; vuhuvBot/1.0; +http://vuhuv.com/bot.html) bot: - name: Vuhuv Bot - category: Crawler - url: http://vuhuv.com/bot.html + name: vuhuvBot + category: Search bot + url: https://vuhuv.com/bot.html +- + user_agent: Mozilla/5.0 (compatible; vuhuvRBT/2.0; +https://vuhuv.com/rbt.html) + bot: + name: vuhuvBot + category: Search bot + url: https://vuhuv.com/bot.html - user_agent: Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) SiteCheck-sitecrawl by Siteimprove.com bot: @@ -8238,7 +8244,7 @@ user_agent: Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Bravebot/1.0; +https://search.brave.com/help/brave-search-crawler) Chrome/W.X.Y.Z Safari/537.36 bot: name: Bravebot - category: Crawler + category: Search bot url: https://search.brave.com/help/brave-search-crawler producer: name: Brave Software, Inc. @@ -8267,3 +8273,30 @@ producer: name: Valve Corporation url: https://www.valvesoftware.com/ +- + user_agent: Oh Dear Sitemap check https://ohdear.app + bot: + name: Oh Dear + category: Site Monitor + url: https://ohdear.app/docs/faq/what-is-the-oh-dear-crawler-doing-in-my-logs + producer: + name: Immutable, SNC + url: https://ohdear.app/ +- + user_agent: Mozilla/5.0 (compatible; OhDear/1.1; +https://ohdear.app/checker; brokenLinks) + bot: + name: Oh Dear + category: Site Monitor + url: https://ohdear.app/docs/faq/what-is-the-oh-dear-crawler-doing-in-my-logs + producer: + name: Immutable, SNC + url: https://ohdear.app/ +- + user_agent: Mozilla/5.0 (compatible; OhDear/1.1; +https://ohdear.app/checker; mixedContent) + bot: + name: Oh Dear + category: Site Monitor + url: https://ohdear.app/docs/faq/what-is-the-oh-dear-crawler-doing-in-my-logs + producer: + name: Immutable, SNC + url: https://ohdear.app/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 757f7ad777..e0a1efd41f 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -876,10 +876,10 @@ name: 'HubSpot Inc.' url: 'https://www.hubspot.com' -- regex: 'vuhuvBot' - name: 'Vuhuv Bot' - category: 'Crawler' - url: 'http://vuhuv.com/bot.html' +- regex: 'vuhuv(?:Bot|RBT)' + name: 'vuhuvBot' + category: 'Search bot' + url: 'https://vuhuv.com/bot.html' - regex: 'HTTPMon/[\d.]+' name: 'HTTPMon' @@ -4787,7 +4787,7 @@ - regex: 'Bravebot' name: 'Bravebot' - category: 'Crawler' + category: 'Search bot' url: 'https://search.brave.com/help/brave-search-crawler' producer: name: 'Brave Software, Inc.' @@ -4806,6 +4806,14 @@ name: 'Valve Corporation' url: 'https://www.valvesoftware.com/' +- regex: 'ohdear\.app' + name: 'Oh Dear' + category: 'Site Monitor' + url: 'https://ohdear.app/docs/faq/what-is-the-oh-dear-crawler-doing-in-my-logs' + producer: + name: 'Immutable, SNC' + url: 'https://ohdear.app/' + # Generic bots - regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|Wordup|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|Node.js|Report Runner|url|Zeus|ZmEu)$' name: 'Generic Bot'