From d1e0a9a757d159bec6daf15090052849168b6f7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A1szl=C3=B3=20K=C3=A1rolyi?= Date: Sun, 7 Sep 2025 12:12:35 +0200 Subject: [PATCH] Add meta-webindexer bot and update Brightbot operator info - Added 'meta-webindexer' to HAProxy, Nginx, robots.txt blocklists - Updated Brightbot operator to brightdata.com in robots.json and metrics - Added Brightbot frequency and disguise tactics documentation link - Added meta-webindexer entry in robots.json with Meta's official description - Added meta-webindexer row in table-of-bot-metrics.md with details --- haproxy-block-ai-bots.txt | 3 ++- nginx-block-ai-bots.conf | 4 ++-- robots.json | 15 +++++++++++---- robots.txt | 1 + table-of-bot-metrics.md | 3 ++- 5 files changed, 18 insertions(+), 8 deletions(-) diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index cb9633f..8ff93b4 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -55,6 +55,7 @@ meta-externalagent Meta-ExternalAgent meta-externalfetcher Meta-ExternalFetcher +meta-webindexer MistralAI-User MistralAI-User/1.0 MyCentralAIScraperBot @@ -92,4 +93,4 @@ wpbot YaK YandexAdditional YandexAdditionalBot -YouBot \ No newline at end of file +YouBot diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 2115bbe..345c38f 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AddSearchBot|AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Awario|bedrockbot|bigsur\.ai|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Datenbank\ Crawler|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|LinerBot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|netEstate\ Imprint\ Crawler|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Thinkbot|TikTokSpider|Timpibot|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|wpbot|YaK|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AddSearchBot|AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Awario|bedrockbot|bigsur\.ai|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Datenbank\ Crawler|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|LinerBot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta-webindexer/|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|netEstate\ Imprint\ Crawler|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Thinkbot|TikTokSpider|Timpibot|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|wpbot|YaK|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; -} \ No newline at end of file +} diff --git a/robots.json b/robots.json index 25449f4..5faef5a 100644 --- a/robots.json +++ b/robots.json @@ -84,11 +84,11 @@ "description": "bigsur.ai is a web crawler operated by Big Sur AI that fetches website content to enable AI-powered web agents, sales assistants, and content marketing solutions for businesses. More info can be found at https://darkvisitors.com/agents/agents/bigsur-ai" }, "Brightbot 1.0": { - "operator": "Browsing.ai", + "operator": "https://brightdata.com/brightbot", "respect": "Unclear at this time.", "function": "LLM/AI training.", - "frequency": "Unclear at this time.", - "description": "Scrapes data to train LLMs and AI products focused on website customer support." + "frequency": "At least one per minute.", + "description": "Scrapes data to train LLMs and AI products focused on website customer support, [uses residential IPs and legit-looking user-agents to disguise itself](https://ksol.io/en/blog/posts/brightbot-not-that-bright/)." }, "Bytespider": { "operator": "ByteDance", @@ -391,6 +391,13 @@ "frequency": "Unclear at this time.", "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" }, + "meta-webindexer": { + "operator": "[Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/)", + "respect": "Unclear at this time.", + "function": "AI Assistants", + "frequency": "Unhinged, more than 1 per second.", + "description": "As per their documentation, \"The Meta-WebIndexer crawler navigates the web to improve Meta AI search result quality for users. In doing so, Meta analyzes online content to enhance the relevance and accuracy of Meta AI. Allowing Meta-WebIndexer in your robots.txt file helps us cite and link to your content in Meta AI's responses.\"" + }, "Meta-ExternalFetcher": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", @@ -664,4 +671,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} diff --git a/robots.txt b/robots.txt index 317b84a..d96222e 100644 --- a/robots.txt +++ b/robots.txt @@ -55,6 +55,7 @@ User-agent: meta-externalagent User-agent: Meta-ExternalAgent User-agent: meta-externalfetcher User-agent: Meta-ExternalFetcher +User-agent: meta-webindexer User-agent: MistralAI-User User-agent: MistralAI-User/1.0 User-agent: MyCentralAIScraperBot diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 8966f11..3744499 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -12,7 +12,7 @@ | Awario | Awario | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Awario is an AI data scraper operated by Awario. It's not currently known to be artificially intelligent or AI-related. If you think that's incorrect or can provide more detail about its purpose, please contact us. More info can be found at https://darkvisitors.com/agents/agents/awario | | bedrockbot | [Amazon](https://amazon.com) | [Yes](https://docs.aws.amazon.com/bedrock/latest/userguide/webcrawl-data-source-connector.html#configuration-webcrawl-connector) | Data scraping for custom AI applications. | Unclear at this time. | Connects to and crawls URLs that have been selected for use in a user's AWS bedrock application. | | bigsur\.ai | Big Sur AI that fetches website content to enable AI-powered web agents, sales assistants, and content marketing solutions for businesses | Unclear at this time. | AI Assistants | Unclear at this time. | bigsur.ai is a web crawler operated by Big Sur AI that fetches website content to enable AI-powered web agents, sales assistants, and content marketing solutions for businesses. More info can be found at https://darkvisitors.com/agents/agents/bigsur-ai | -| Brightbot 1\.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. | +| Brightbot 1\.0 | https://brightdata.com/brightbot | Unclear at this time. | LLM/AI training. | At least one per minute. | Scrapes data to train LLMs and AI products focused on website customer support, [uses residential IPs and legit-looking user-agents to disguise itself](https://ksol.io/en/blog/posts/brightbot-not-that-bright/). | | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | | ChatGPT Agent | [OpenAI](https://openai.com) | Yes | AI Agents | Unclear at this time. | ChatGPT Agent is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/chatgpt-agent | @@ -57,6 +57,7 @@ | Meta\-ExternalAgent | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent | | meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| meta-webindexer | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/) | Unclear at this time. | AI Assistants | Unhinged, more than 1 per second. | As per their documentation, "The Meta-WebIndexer crawler navigates the web to improve Meta AI search result quality for users. In doing so, Meta analyzes online content to enhance the relevance and accuracy of Meta AI. Allowing Meta-WebIndexer in your robots.txt file helps us cite and link to your content in Meta AI's responses." | | MistralAI\-User | Mistral | Unclear at this time. | AI Assistants | Unclear at this time. | MistralAI-User is an AI assistant operated by Mistral. It's not currently known to be artificially intelligent or AI-related. If you think that's incorrect or can provide more detail about its purpose, please contact us. More info can be found at https://darkvisitors.com/agents/agents/mistralai-user | | MistralAI\-User/1\.0 | Mistral AI | Yes | Takes action based on user prompts. | Only when prompted by a user. | MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response. | | MyCentralAIScraperBot | Unclear at this time. | Unclear at this time. | AI data scraper | Unclear at this time. | Operator and data use is unclear at this time. |