From 8363d4fdd404c03af92115c8b9115952e47749ec Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 2 Dec 2025 01:25:24 +0000 Subject: [PATCH] Update from Dark Visitors --- robots.json | 150 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 124 insertions(+), 26 deletions(-) diff --git a/robots.json b/robots.json index 3b0bc85..50aadc3 100644 --- a/robots.json +++ b/robots.json @@ -13,12 +13,19 @@ "frequency": "No information provided.", "description": "Explores 'certain domains' to find web content." }, + "AI2Bot-DeepResearchEval": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Assistants", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/ai2bot-deepresearcheval" + }, "Ai2Bot-Dolma": { - "description": "Explores 'certain domains' to find web content.", - "frequency": "No information provided.", - "function": "Content is used to train open language models.", "operator": "[Ai2](https://allenai.org/crawler)", - "respect": "Yes" + "respect": "Yes", + "function": "Content is used to train open language models.", + "frequency": "No information provided.", + "description": "Explores 'certain domains' to find web content." }, "aiHitBot": { "operator": "[aiHit](https://www.aihitdata.com/about)", @@ -27,20 +34,6 @@ "frequency": "No information provided.", "description": "Scrapes data for AI systems." }, - "AmazonBuyForMe": { - "operator": "[Amazon](https://amazon.com)", - "respect": "Unclear at this time.", - "function": "AI Agents", - "frequency": "No information provided.", - "description": "Buy For Me is an AI agent that helps buy products at the direction of customers." - }, - "atlassian-bot": { - "operator": "[Atlassian](https://www.atlassian.com)", - "respect": "[Yes](https://support.atlassian.com/organization-administration/docs/connect-custom-website-to-rovo/#Editing-your-robots.txt)", - "function": "AI search, assistants and agents", - "frequency": "No information provided.", - "description": "atlassian-bot is a web crawler used to index website content for its AI search, assistants and agents available in its Rovo GenAI product." - }, "amazon-kendra": { "operator": "Amazon", "respect": "Yes", @@ -55,6 +48,13 @@ "frequency": "No information provided.", "description": "Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses." }, + "AmazonBuyForMe": { + "operator": "[Amazon](https://amazon.com)", + "respect": "Unclear at this time.", + "function": "AI Agents", + "frequency": "No information provided.", + "description": "Buy For Me is an AI agent that helps buy products at the direction of customers." + }, "Andibot": { "operator": "[Andi](https://andisearch.com/)", "respect": "Unclear at this time", @@ -90,6 +90,13 @@ "frequency": "Unclear at this time.", "description": "Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools." }, + "atlassian-bot": { + "operator": "[Atlassian](https://www.atlassian.com)", + "respect": "[Yes](https://support.atlassian.com/organization-administration/docs/connect-custom-website-to-rovo/#Editing-your-robots.txt)", + "function": "AI search, assistants and agents", + "frequency": "No information provided.", + "description": "atlassian-bot is a web crawler used to index website content for its AI search, assistants and agents available in its Rovo GenAI product." + }, "Awario": { "operator": "Awario", "respect": "Unclear at this time.", @@ -146,6 +153,13 @@ "frequency": "Monthly at present.", "description": "Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers)." }, + "ChatGLM-Spider": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/chatglm-spider" + }, "ChatGPT Agent": { "operator": "[OpenAI](https://openai.com)", "respect": "Yes", @@ -384,6 +398,20 @@ "frequency": "No information.", "description": "Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies." }, + "iAskBot": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "Undocumented AI Agents", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/iaskbot" + }, + "iaskspider": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "Undocumented AI Agents", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/iaskspider" + }, "iaskspider/2.0": { "description": "Used to provide answers to user queries.", "frequency": "Unclear at this time.", @@ -412,6 +440,13 @@ "operator": "[ImageSift](https://imagesift.com)", "respect": "[Yes](https://imagesift.com/about)" }, + "imageSpider": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/imagespider" + }, "img2dataset": { "description": "Downloads large sets of images into datasets for LLM training or other purposes.", "frequency": "At the discretion of img2dataset users.", @@ -440,6 +475,20 @@ "frequency": "Indexes based on 'change signals' and user configuration.", "description": "Indexes content to tailor AI experiences, generate content, answers and recommendations." }, + "KunatoCrawler": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "Undocumented AI Agents", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/kunatocrawler" + }, + "laion-huggingface-processor": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/laion-huggingface-processor" + }, "LAIONDownloader": { "operator": "[Large-scale Artificial Intelligence Open Network](https://laion.ai/)", "respect": "[No](https://laion.ai/faq/)", @@ -447,6 +496,13 @@ "frequency": "Unclear at this time.", "description": "LAIONDownloader is a bot by LAION, a non-profit organization that provides datasets, tools and models to liberate machine learning research." }, + "LCC": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/lcc" + }, "LinerBot": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", @@ -461,6 +517,13 @@ "frequency": "Unclear at this time.", "description": "Linguee Bot is a web crawler used by Linguee to gather training data for its AI powered translation service." }, + "LinkupBot": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Search Crawlers", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/linkupbot" + }, "meta-externalagent": { "operator": "[Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)", "respect": "Yes", @@ -622,6 +685,13 @@ "operator": "[phind](https://www.phind.com/)", "respect": "Unclear at this time." }, + "Poggio-Citations": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Assistants", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/poggio-citations" + }, "Poseidon Research Crawler": { "operator": "[Poseidon Research](https://www.poseidonresearch.com)", "description": "Lab focused on scaling the interpretability research necessary to make better AI systems possible.", @@ -651,11 +721,11 @@ "respect": "Unclear at this time." }, "SBIntuitionsBot": { - "description": "AI development and information analysis", + "operator": "[SB Intuitions](https://www.sbintuitions.co.jp/en/)", "respect": "[Yes](https://www.sbintuitions.co.jp/en/bot/)", - "frequency": "No information.", "function": "Uses data gathered in AI development and information analysis.", - "operator": "[SB Intuitions](https://www.sbintuitions.co.jp/en/)" + "frequency": "No information.", + "description": "AI development and information analysis" }, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", @@ -692,6 +762,13 @@ "operator": "[Sidetrade](https://www.sidetrade.com)", "respect": "Unclear at this time." }, + "Spider": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/spider" + }, "TerraCotta": { "operator": "[Ceramic AI](https://ceramic.ai/)", "respect": "[Yes](https://github.com/CeramicTeam/CeramicTerracotta)", @@ -721,11 +798,11 @@ "description": "Makes data available for training AI models." }, "VelenPublicWebCrawler": { - "description": "\"Our goal with this crawler is to build business datasets and machine learning models to better understand the web.\"", - "frequency": "No information.", - "function": "Scrapes data for business data sets and machine learning models.", "operator": "[Velen Crawler](https://velen.io)", - "respect": "[Yes](https://velen.io)" + "respect": "[Yes](https://velen.io)", + "function": "Scrapes data for business data sets and machine learning models.", + "frequency": "No information.", + "description": "\"Our goal with this crawler is to build business datasets and machine learning models to better understand the web.\"" }, "WARDBot": { "operator": "WEBSPARK", @@ -741,6 +818,13 @@ "frequency": "Unclear at this time.", "description": "Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended" }, + "webzio-extended": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/webzio-extended" + }, "wpbot": { "operator": "[QuantumCloud](https://www.quantumcloud.com)", "respect": "Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9)", @@ -748,6 +832,13 @@ "frequency": "Unclear at this time.", "description": "wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support." }, + "WRTNBot": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "Undocumented AI Agents", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/wrtnbot" + }, "YaK": { "operator": "[Meltwater](https://www.meltwater.com/en/suite/consumer-intelligence)", "respect": "Unclear at this time.", @@ -775,5 +866,12 @@ "function": "Scrapes data for search engine and LLMs.", "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." + }, + "ZanistaBot": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Search Crawlers", + "frequency": "Unclear at this time.", + "description": "Description unavailable from darkvisitors.com More info can be found at https://darkvisitors.com/agents/agents/zanistabot" } -} +} \ No newline at end of file