mirror of
https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-12-29 12:18:33 +01:00
Update from Dark Visitors
This commit is contained in:
parent
83485effdb
commit
aa8519ec10
6 changed files with 6 additions and 3 deletions
|
|
@ -1,3 +1,3 @@
|
|||
RewriteEngine On
|
||||
RewriteCond %{HTTP_USER_AGENT} (AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Andibot|Anomura|anthropic\-ai|Applebot|Applebot\-Extended|atlassian\-bot|Awario|bedrockbot|bigsur\.ai|Bravebot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|netEstate\ Imprint\ Crawler|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TerraCotta|Thinkbot|TikTokSpider|Timpibot|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot) [NC]
|
||||
RewriteCond %{HTTP_USER_AGENT} (AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Andibot|Anomura|anthropic\-ai|Applebot|Applebot\-Extended|atlassian\-bot|Awario|bedrockbot|bigsur\.ai|Bravebot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|netEstate\ Imprint\ Crawler|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TavilyBot|TerraCotta|Thinkbot|TikTokSpider|Timpibot|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot) [NC]
|
||||
RewriteRule !^/?robots\.txt$ - [F]
|
||||
|
|
|
|||
|
|
@ -1,3 +1,3 @@
|
|||
@aibots {
|
||||
header_regexp User-Agent "(AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Andibot|Anomura|anthropic\-ai|Applebot|Applebot\-Extended|atlassian\-bot|Awario|bedrockbot|bigsur\.ai|Bravebot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|netEstate\ Imprint\ Crawler|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TerraCotta|Thinkbot|TikTokSpider|Timpibot|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot)"
|
||||
header_regexp User-Agent "(AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Andibot|Anomura|anthropic\-ai|Applebot|Applebot\-Extended|atlassian\-bot|Awario|bedrockbot|bigsur\.ai|Bravebot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|netEstate\ Imprint\ Crawler|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TavilyBot|TerraCotta|Thinkbot|TikTokSpider|Timpibot|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot)"
|
||||
}
|
||||
|
|
@ -111,6 +111,7 @@ SemrushBot-SWA
|
|||
ShapBot
|
||||
Sidetrade indexer bot
|
||||
Spider
|
||||
TavilyBot
|
||||
TerraCotta
|
||||
Thinkbot
|
||||
TikTokSpider
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
set $block 0;
|
||||
|
||||
if ($http_user_agent ~* "(AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Andibot|Anomura|anthropic\-ai|Applebot|Applebot\-Extended|atlassian\-bot|Awario|bedrockbot|bigsur\.ai|Bravebot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|netEstate\ Imprint\ Crawler|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TerraCotta|Thinkbot|TikTokSpider|Timpibot|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot)") {
|
||||
if ($http_user_agent ~* "(AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Andibot|Anomura|anthropic\-ai|Applebot|Applebot\-Extended|atlassian\-bot|Awario|bedrockbot|bigsur\.ai|Bravebot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|Channel3Bot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|netEstate\ Imprint\ Crawler|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TavilyBot|TerraCotta|Thinkbot|TikTokSpider|Timpibot|TwinAgent|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot)") {
|
||||
set $block 1;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -111,6 +111,7 @@ User-agent: SemrushBot-SWA
|
|||
User-agent: ShapBot
|
||||
User-agent: Sidetrade indexer bot
|
||||
User-agent: Spider
|
||||
User-agent: TavilyBot
|
||||
User-agent: TerraCotta
|
||||
User-agent: Thinkbot
|
||||
User-agent: TikTokSpider
|
||||
|
|
|
|||
|
|
@ -113,6 +113,7 @@
|
|||
| ShapBot | [Parallel](https://parallel.ai) | [Yes](https://docs.parallel.ai/features/crawler) | Collects data for Parallel's web APIs. | Unclear at this time. | ShapBot helps discover and index websites for Parallel's web APIs. |
|
||||
| Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. |
|
||||
| Spider | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/spider |
|
||||
| TavilyBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/tavilybot |
|
||||
| TerraCotta | [Ceramic AI](https://ceramic.ai/) | [Yes](https://github.com/CeramicTeam/CeramicTerracotta) | AI Agents | Unclear at this time. | Downloads data to train LLMs. |
|
||||
| Thinkbot | [Thinkbot](https://www.thinkbot.agency) | No | Insights on AI integration and automation. | Unclear at this time. | Collects data for analysis on AI usage and automation. |
|
||||
| TikTokSpider | ByteDance | Unclear at this time. | LLM training. | Unclear at this time. | Downloads data to train LLMS, as per Bytespider. |
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue