mirror of
https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2026-06-07 09:26:54 +02:00
Keep generated robot files in dark visitors update
This commit is contained in:
parent
aea7db9e34
commit
8d5a081a16
8 changed files with 30 additions and 11 deletions
9
.github/workflows/ai_robots_update.yml
vendored
9
.github/workflows/ai_robots_update.yml
vendored
|
|
@ -18,6 +18,8 @@ jobs:
|
|||
git config --global user.email "dark-visitors@users.noreply.github.com"
|
||||
echo "Updating robots.json with data from darkvisitor.com ..."
|
||||
python code/robots.py --update
|
||||
echo "Updating generated files from robots.json ..."
|
||||
python code/robots.py --convert
|
||||
echo "... done."
|
||||
git --no-pager diff
|
||||
git add -A
|
||||
|
|
@ -29,10 +31,3 @@ jobs:
|
|||
echo "No changes to commit."
|
||||
fi
|
||||
shell: bash
|
||||
convert:
|
||||
name: convert
|
||||
needs: dark-visitors
|
||||
uses: ./.github/workflows/main.yml
|
||||
secrets: inherit
|
||||
with:
|
||||
message: "Update from Dark Visitors"
|
||||
|
|
|
|||
|
|
@ -1,3 +1,3 @@
|
|||
RewriteEngine On
|
||||
RewriteCond %{HTTP_USER_AGENT} (AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Amzn\-SearchBot|Amzn\-User|Andibot|Anomura|anthropic\-ai|ApifyBot|ApifyWebsiteContentCrawler|Applebot|Applebot\-Extended|Aranet\-SearchBot|atlassian\-bot|Awario|AzureAI\-SearchBot|bedrockbot|bigsur\.ai|Bravebot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|ExaBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-Agent|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|kagi\-fetcher|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|NagetBot|netEstate\ Imprint\ Crawler|newsai|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TavilyBot|TerraCotta|Thinkbot|TikTokSpider|Timpibot|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot) [NC]
|
||||
RewriteCond %{HTTP_USER_AGENT} (AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Amzn\-SearchBot|Amzn\-User|Andibot|Anomura|anthropic\-ai|ApifyBot|ApifyWebsiteContentCrawler|Applebot|Applebot\-Extended|Aranet\-SearchBot|atlassian\-bot|Awario|AzureAI\-SearchBot|bedrockbot|bigsur\.ai|Bravebot|Brightbot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-Code|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|Code|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|ExaBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-Agent|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-Gemini\-CLI|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|HenkBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|kagi\-fetcher|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|NagetBot|netEstate\ Imprint\ Crawler|newsai|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|opencode|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TavilyBot|Terra\ 
Cotta|TerraCotta|Thinkbot|TikTokSpider|Timpibot|Trae|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot) [NC]
|
||||
RewriteRule !^/?robots\.txt$ - [F]
|
||||
|
|
|
|||
|
|
@ -1,3 +1,3 @@
|
|||
@aibots {
|
||||
header_regexp User-Agent "(AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Amzn\-SearchBot|Amzn\-User|Andibot|Anomura|anthropic\-ai|ApifyBot|ApifyWebsiteContentCrawler|Applebot|Applebot\-Extended|Aranet\-SearchBot|atlassian\-bot|Awario|AzureAI\-SearchBot|bedrockbot|bigsur\.ai|Bravebot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|ExaBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-Agent|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|kagi\-fetcher|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|NagetBot|netEstate\ Imprint\ Crawler|newsai|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TavilyBot|TerraCotta|Thinkbot|TikTokSpider|Timpibot|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot)"
|
||||
header_regexp User-Agent "(AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Amzn\-SearchBot|Amzn\-User|Andibot|Anomura|anthropic\-ai|ApifyBot|ApifyWebsiteContentCrawler|Applebot|Applebot\-Extended|Aranet\-SearchBot|atlassian\-bot|Awario|AzureAI\-SearchBot|bedrockbot|bigsur\.ai|Bravebot|Brightbot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-Code|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|Code|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|ExaBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-Agent|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-Gemini\-CLI|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|HenkBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|kagi\-fetcher|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|NagetBot|netEstate\ Imprint\ Crawler|newsai|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|opencode|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TavilyBot|Terra\ 
Cotta|TerraCotta|Thinkbot|TikTokSpider|Timpibot|Trae|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot)"
|
||||
}
|
||||
|
|
@ -22,6 +22,7 @@ AzureAI-SearchBot
|
|||
bedrockbot
|
||||
bigsur.ai
|
||||
Bravebot
|
||||
Brightbot
|
||||
Brightbot 1.0
|
||||
BuddyBot
|
||||
Bytespider
|
||||
|
|
@ -30,12 +31,14 @@ Channel3Bot
|
|||
ChatGLM-Spider
|
||||
ChatGPT Agent
|
||||
ChatGPT-User
|
||||
Claude-Code
|
||||
Claude-SearchBot
|
||||
Claude-User
|
||||
Claude-Web
|
||||
ClaudeBot
|
||||
Cloudflare-AutoRAG
|
||||
CloudVertexBot
|
||||
Code
|
||||
cohere-ai
|
||||
cohere-training-data-crawler
|
||||
Cotoyogi
|
||||
|
|
@ -59,12 +62,14 @@ Google-Agent
|
|||
Google-CloudVertexBot
|
||||
Google-Extended
|
||||
Google-Firebase
|
||||
Google-Gemini-CLI
|
||||
Google-NotebookLM
|
||||
GoogleAgent-Mariner
|
||||
GoogleOther
|
||||
GoogleOther-Image
|
||||
GoogleOther-Video
|
||||
GPTBot
|
||||
HenkBot
|
||||
iAskBot
|
||||
iaskspider
|
||||
iaskspider/2.0
|
||||
|
|
@ -102,6 +107,7 @@ OAI-SearchBot
|
|||
omgili
|
||||
omgilibot
|
||||
OpenAI
|
||||
opencode
|
||||
Operator
|
||||
PanguBot
|
||||
Panscient
|
||||
|
|
@ -123,10 +129,12 @@ ShapBot
|
|||
Sidetrade indexer bot
|
||||
Spider
|
||||
TavilyBot
|
||||
Terra Cotta
|
||||
TerraCotta
|
||||
Thinkbot
|
||||
TikTokSpider
|
||||
Timpibot
|
||||
Trae
|
||||
TwinAgent
|
||||
VelenPublicWebCrawler
|
||||
WARDBot
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
$HTTP["url"] != "/robots.txt" { $HTTP["user-agent"] =~ "(AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Amzn\-SearchBot|Amzn\-User|Andibot|Anomura|anthropic\-ai|ApifyBot|ApifyWebsiteContentCrawler|Applebot|Applebot\-Extended|Aranet\-SearchBot|atlassian\-bot|Awario|AzureAI\-SearchBot|bedrockbot|bigsur\.ai|Bravebot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|ExaBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-Agent|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|kagi\-fetcher|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|NagetBot|netEstate\ Imprint\ Crawler|newsai|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TavilyBot|TerraCotta|Thinkbot|TikTokSpider|Timpibot|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot)" { 
url.access-deny = ( "" ) } }
|
||||
$HTTP["url"] != "/robots.txt" { $HTTP["user-agent"] =~ "(AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Amzn\-SearchBot|Amzn\-User|Andibot|Anomura|anthropic\-ai|ApifyBot|ApifyWebsiteContentCrawler|Applebot|Applebot\-Extended|Aranet\-SearchBot|atlassian\-bot|Awario|AzureAI\-SearchBot|bedrockbot|bigsur\.ai|Bravebot|Brightbot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-Code|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|Code|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|ExaBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-Agent|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-Gemini\-CLI|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|HenkBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|kagi\-fetcher|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|NagetBot|netEstate\ Imprint\ Crawler|newsai|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|opencode|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TavilyBot|Terra\ 
Cotta|TerraCotta|Thinkbot|TikTokSpider|Timpibot|Trae|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot)" { url.access-deny = ( "" ) } }
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
set $block 0;
|
||||
|
||||
if ($http_user_agent ~* "(AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Amzn\-SearchBot|Amzn\-User|Andibot|Anomura|anthropic\-ai|ApifyBot|ApifyWebsiteContentCrawler|Applebot|Applebot\-Extended|Aranet\-SearchBot|atlassian\-bot|Awario|AzureAI\-SearchBot|bedrockbot|bigsur\.ai|Bravebot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|ExaBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-Agent|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|kagi\-fetcher|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|NagetBot|netEstate\ Imprint\ Crawler|newsai|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TavilyBot|TerraCotta|Thinkbot|TikTokSpider|Timpibot|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot)") {
|
||||
if ($http_user_agent ~* "(AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Amzn\-SearchBot|Amzn\-User|Andibot|Anomura|anthropic\-ai|ApifyBot|ApifyWebsiteContentCrawler|Applebot|Applebot\-Extended|Aranet\-SearchBot|atlassian\-bot|Awario|AzureAI\-SearchBot|bedrockbot|bigsur\.ai|Bravebot|Brightbot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-Code|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|Code|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|ExaBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-Agent|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-Gemini\-CLI|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|HenkBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|kagi\-fetcher|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|NagetBot|netEstate\ Imprint\ Crawler|newsai|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|opencode|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TavilyBot|Terra\ 
Cotta|TerraCotta|Thinkbot|TikTokSpider|Timpibot|Trae|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot)") {
|
||||
set $block 1;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ User-agent: AzureAI-SearchBot
|
|||
User-agent: bedrockbot
|
||||
User-agent: bigsur.ai
|
||||
User-agent: Bravebot
|
||||
User-agent: Brightbot
|
||||
User-agent: Brightbot 1.0
|
||||
User-agent: BuddyBot
|
||||
User-agent: Bytespider
|
||||
|
|
@ -30,12 +31,14 @@ User-agent: Channel3Bot
|
|||
User-agent: ChatGLM-Spider
|
||||
User-agent: ChatGPT Agent
|
||||
User-agent: ChatGPT-User
|
||||
User-agent: Claude-Code
|
||||
User-agent: Claude-SearchBot
|
||||
User-agent: Claude-User
|
||||
User-agent: Claude-Web
|
||||
User-agent: ClaudeBot
|
||||
User-agent: Cloudflare-AutoRAG
|
||||
User-agent: CloudVertexBot
|
||||
User-agent: Code
|
||||
User-agent: cohere-ai
|
||||
User-agent: cohere-training-data-crawler
|
||||
User-agent: Cotoyogi
|
||||
|
|
@ -59,12 +62,14 @@ User-agent: Google-Agent
|
|||
User-agent: Google-CloudVertexBot
|
||||
User-agent: Google-Extended
|
||||
User-agent: Google-Firebase
|
||||
User-agent: Google-Gemini-CLI
|
||||
User-agent: Google-NotebookLM
|
||||
User-agent: GoogleAgent-Mariner
|
||||
User-agent: GoogleOther
|
||||
User-agent: GoogleOther-Image
|
||||
User-agent: GoogleOther-Video
|
||||
User-agent: GPTBot
|
||||
User-agent: HenkBot
|
||||
User-agent: iAskBot
|
||||
User-agent: iaskspider
|
||||
User-agent: iaskspider/2.0
|
||||
|
|
@ -102,6 +107,7 @@ User-agent: OAI-SearchBot
|
|||
User-agent: omgili
|
||||
User-agent: omgilibot
|
||||
User-agent: OpenAI
|
||||
User-agent: opencode
|
||||
User-agent: Operator
|
||||
User-agent: PanguBot
|
||||
User-agent: Panscient
|
||||
|
|
@ -123,10 +129,12 @@ User-agent: ShapBot
|
|||
User-agent: Sidetrade indexer bot
|
||||
User-agent: Spider
|
||||
User-agent: TavilyBot
|
||||
User-agent: Terra Cotta
|
||||
User-agent: TerraCotta
|
||||
User-agent: Thinkbot
|
||||
User-agent: TikTokSpider
|
||||
User-agent: Timpibot
|
||||
User-agent: Trae
|
||||
User-agent: TwinAgent
|
||||
User-agent: VelenPublicWebCrawler
|
||||
User-agent: WARDBot
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@
|
|||
| bedrockbot | [Amazon](https://amazon.com) | [Yes](https://docs.aws.amazon.com/bedrock/latest/userguide/webcrawl-data-source-connector.html#configuration-webcrawl-connector) | Data scraping for custom AI applications. | Unclear at this time. | Connects to and crawls URLs that have been selected for use in a user's AWS bedrock application. |
|
||||
| bigsur\.ai | Big Sur AI that fetches website content to enable AI-powered web agents, sales assistants, and content marketing solutions for businesses | Unclear at this time. | AI Assistants | Unclear at this time. | bigsur.ai is a web crawler operated by Big Sur AI that fetches website content to enable AI-powered web agents, sales assistants, and content marketing solutions for businesses. More info can be found at https://darkvisitors.com/agents/agents/bigsur-ai |
|
||||
| Bravebot | https://safe.search.brave.com/help/brave-search-crawler | Yes | Collects data for AI search | Unclear at this time. | Brave search has a crawler to discover new pages and index their content. |
|
||||
| Brightbot | Unclear at this time. | Unclear at this time. | AI Data Providers | Unclear at this time. | Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/brightbot |
|
||||
| Brightbot 1\.0 | https://brightdata.com/brightbot | Unclear at this time. | LLM/AI training. | At least one per minute. | Scrapes data to train LLMs and AI products focused on website customer support, [uses residential IPs and legit-looking user-agents to disguise itself](https://ksol.io/en/blog/posts/brightbot-not-that-bright/). |
|
||||
| BuddyBot | [BuddyBotLearning](https://www.buddybotlearning.com) | Unclear at this time. | AI Learning Companion | Unclear at this time. | BuddyBot is a voice-controlled AI learning companion targeted at childhood STEM education. |
|
||||
| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMs, including ChatGPT competitors. |
|
||||
|
|
@ -32,12 +33,14 @@
|
|||
| ChatGLM\-Spider | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/chatglm-spider |
|
||||
| ChatGPT Agent | [OpenAI](https://openai.com) | Yes | AI Agents | Unclear at this time. | ChatGPT Agent is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/chatgpt-agent |
|
||||
| ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. |
|
||||
| Claude\-Code | Unclear at this time. | Unclear at this time. | AI Coding Agents | Unclear at this time. | Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/claude-code |
|
||||
| Claude\-SearchBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | No information provided. | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. |
|
||||
| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. |
|
||||
| Claude\-Web | Anthropic | Unclear at this time. | Undocumented AI Agents | Unclear at this time. | Claude-Web is an AI-related agent operated by Anthropic. It's currently unclear exactly what it's used for, since there's no official documentation. If you can provide more detail, please contact us. More info can be found at https://darkvisitors.com/agents/agents/claude-web |
|
||||
| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
|
||||
| Cloudflare\-AutoRAG | [Cloudflare](https://developers.cloudflare.com/autorag) | Yes | Collects data for AI search | Unclear at this time. | AutoRAG is an all-in-one AI search solution. |
|
||||
| CloudVertexBot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | CloudVertexBot is a Google-operated crawler available to site owners to request targeted crawls of their own sites for AI training purposes on the Vertex AI platform. More info can be found at https://darkvisitors.com/agents/agents/cloudvertexbot |
|
||||
| Code | Unclear at this time. | Unclear at this time. | AI Coding Agents | Unclear at this time. | Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/code |
|
||||
| cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. |
|
||||
| cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler |
|
||||
| Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. |
|
||||
|
|
@ -61,12 +64,14 @@
|
|||
| Google\-CloudVertexBot | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Build and manage AI models for businesses employing Vertex AI | No information. | Google-CloudVertexBot crawls sites on the site owners' request when building Vertex AI Agents. |
|
||||
| Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. |
|
||||
| Google\-Firebase | Google | Unclear at this time. | Used as part of AI apps developed by users of Google's Firebase AI products. | Unclear at this time. | Supports Google's Firebase AI products. |
|
||||
| Google\-Gemini\-CLI | Unclear at this time. | Unclear at this time. | AI Coding Agents | Unclear at this time. | Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/google-gemini-cli |
|
||||
| Google\-NotebookLM | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Google-NotebookLM is an AI-powered research and note-taking assistant that helps users synthesize information from their own uploaded sources, such as documents, transcripts, or web content. It can generate summaries, answer questions, and highlight key themes from the materials you provide, acting like a personalized research companion built on Google's Gemini model. Google-NotebookLM fetches source URLs when users add them to their notebooks, enabling the AI to access and analyze those pages for context and insights. More info can be found at https://darkvisitors.com/agents/agents/google-notebooklm |
|
||||
| GoogleAgent\-Mariner | Google | Unclear at this time. | AI Agents | Unclear at this time. | GoogleAgent-Mariner is an AI agent created by Google that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/googleagent-mariner |
|
||||
| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
|
||||
| GoogleOther\-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
|
||||
| GoogleOther\-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
|
||||
| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, after removing paywalled data, PII, and data that violates the company's policies. |
|
||||
| HenkBot | Unclear at this time. | Unclear at this time. | AI Data Providers | Unclear at this time. | Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/henkbot |
|
||||
| iAskBot | Unclear at this time. | Unclear at this time. | Undocumented AI Agents | Unclear at this time. | Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/iaskbot |
|
||||
| iaskspider | Unclear at this time. | Unclear at this time. | Undocumented AI Agents | Unclear at this time. | Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/iaskspider |
|
||||
| iaskspider/2\.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. |
|
||||
|
|
@ -104,6 +109,7 @@
|
|||
| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
|
||||
| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
|
||||
| OpenAI | [OpenAI](https://openai.com) | Yes | Unclear at this time. | Unclear at this time. | The purpose of this bot is unclear at this time but it is a member of OpenAI's suite of crawlers. |
|
||||
| opencode | Unclear at this time. | Unclear at this time. | AI Coding Agents | Unclear at this time. | Description unavailable from darkvisitors.com. More info can be found at https://darkvisitors.com/agents/agents/opencode |
|
||||
| Operator | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/operator |
|
||||
| PanguBot | Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot |
|
||||
| Panscient | [Panscient](https://panscient.com) | [Yes](https://panscient.com/faq.htm) | Data collection and analysis using machine learning and AI. | The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address. | Compiles data on businesses and business professionals that is structured using AI and machine learning. |
|
||||
|
|
@ -125,10 +131,12 @@
|
|||
| Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. |
|
||||
| Spider | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Description unavailable from darkvisitors.com. More info can be found at https://darkvisitors.com/agents/agents/spider |
|
||||
| TavilyBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Description unavailable from darkvisitors.com. More info can be found at https://darkvisitors.com/agents/agents/tavilybot |
|
||||
| Terra Cotta | Unclear at this time. | Unclear at this time. | AI Data Providers | Unclear at this time. | Description unavailable from darkvisitors.com. More info can be found at https://darkvisitors.com/agents/agents/terra-cotta |
|
||||
| TerraCotta | [Ceramic AI](https://ceramic.ai/) | [Yes](https://github.com/CeramicTeam/CeramicTerracotta) | AI Agents | Unclear at this time. | Downloads data to train LLMs. |
|
||||
| Thinkbot | [Thinkbot](https://www.thinkbot.agency) | No | Insights on AI integration and automation. | Unclear at this time. | Collects data for analysis on AI usage and automation. |
|
||||
| TikTokSpider | ByteDance | Unclear at this time. | LLM training. | Unclear at this time. | Downloads data to train LLMs, as per Bytespider. |
|
||||
| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. |
|
||||
| Trae | Unclear at this time. | Unclear at this time. | AI Coding Agents | Unclear at this time. | Description unavailable from darkvisitors.com. More info can be found at https://darkvisitors.com/agents/agents/trae |
|
||||
| TwinAgent | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Description unavailable from darkvisitors.com. More info can be found at https://darkvisitors.com/agents/agents/twinagent |
|
||||
| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." |
|
||||
| WARDBot | WEBSPARK | Unclear at this time. | AI Data Scrapers | Unclear at this time. | WARDBot is an AI data scraper operated by WEBSPARK. It's not currently known to be artificially intelligent or AI-related. If you think that's incorrect or can provide more detail about its purpose, please contact us. More info can be found at https://darkvisitors.com/agents/agents/wardbot |
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue