Skip to content

Commit 3b6e35c

Browse files
Improves detection for generic bots (matomo-org#8200)
* Improves detection for generic bots
* Add more generic bots

---------

Co-authored-by: Stefan Giehl <stefan@matomo.org>
1 parent eaf6fb3 commit 3b6e35c

File tree

3 files changed

+34
-4
lines changed

3 files changed

+34
-4
lines changed

Parser/Bot.php

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,18 @@ public function parse(): ?array
7474
foreach ($this->getRegexes() as $regex) {
7575
$matches = $this->matchUserAgent($regex['regex']);
7676

77-
if ($matches) {
78-
unset($regex['regex']);
79-
$result = $regex;
77+
if (!$matches) {
78+
continue;
79+
}
80+
81+
unset($regex['regex']);
82+
$result = $regex;
8083

81-
break;
84+
if (\array_key_exists('name', $result)) {
85+
$result['name'] = $this->buildByMatch($result['name'], $matches);
8286
}
87+
88+
break;
8389
}
8490
}
8591

Tests/fixtures/bots.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8948,6 +8948,26 @@
89488948
Sec-CH-UA: 'Mozilliqa"<?=print(9347655345-4954366);?>"'
89498949
Sec-CH-UA-Platform: 'Lindows&lt;?=print(9347655345-4954366);?&gt;'
89508950
Sec-CH-UA-Mobile: '?0&lt;?=print(9347655345-4954366);?&gt;'
8951+
-
8952+
user_agent: Mozilla/5.0 (compatible; PrivacyPolicyBot/1.0)
8953+
bot:
8954+
name: PrivacyPolicyBot
8955+
category: Crawler
8956+
-
8957+
user_agent: Mozilla/5.0 (compatible; SeoCherryBot/1.0; +https://en.wikipedia.org/wiki/Web_crawler)
8958+
bot:
8959+
name: SeoCherryBot
8960+
category: Crawler
8961+
-
8962+
user_agent: HanaleiBot runid=beta-stage-integration-test
8963+
bot:
8964+
name: HanaleiBot
8965+
category: Crawler
8966+
-
8967+
user_agent: Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ABEvalBot/0.1) Version/11.1.2 Safari/605.1.15
8968+
bot:
8969+
name: ABEvalBot
8970+
category: Crawler
89518971
-
89528972
user_agent: Google-NotebookLM
89538973
bot:

regexes/bots.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5651,6 +5651,10 @@
56515651
url: 'https://www.semantic-visions.com/'
56525652

56535653
# Generic bots
5654+
- regex: '(ABEvalBot|HanaleiBot|PrivacyPolicyBot|SeoCherryBot)'
5655+
name: '$1'
5656+
category: 'Crawler'
5657+
56545658
- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|Wordup|Keydrop|\(compatible\)|John Recon|SPARK COMMIT|masjesu|Komaru_The_Cat|Jesus Christ of Nazareth is LORD|Kowai|Hakai|LoliSec|LMAO|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|Node.js|Report Runner|url|Zeus|ZmEu)$|OnlyScans|TheInternetSearchx|Laravel Reaver|bang2013|libredtail|Mozilliqa|Tiberius'
56555659
name: 'Generic Bot'
56565660

0 commit comments

Comments
 (0)