From 0b3266b35f621d01d5d9dae5a99219c42a0f3085 Mon Sep 17 00:00:00 2001 From: fiskhandlarn <680264+fiskhandlarn@users.noreply.github.com> Date: Thu, 20 Nov 2025 11:50:37 +0100 Subject: [PATCH 1/4] feat: allow robots access to `/robots.txt` in nginx --- nginx-block-ai-bots.conf | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 938f787..aefb9e2 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,13 @@ +set $block 0; + if ($http_user_agent ~* "(AddSearchBot|AI2Bot|Ai2Bot\-Dolma|aiHitBot|AmazonBuyForMe|atlassian\-bot|amazon\-kendra|Amazonbot|Andibot|Anomura|anthropic\-ai|Applebot|Applebot\-Extended|Awario|bedrockbot|bigsur\.ai|Bravebot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|KlaviyoAIBot|LinerBot|Linguee\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|netEstate\ Imprint\ Crawler|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|TerraCotta|Thinkbot|TikTokSpider|Timpibot|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|wpbot|YaK|YandexAdditional|YandexAdditionalBot|YouBot)") { + set $block 1; +} + +if ($request_uri = '/robots.txt') { + set $block 0; +} + +if ($block) { return 403; } \ No newline at end of file From ef8eda4fe62b35bc80dbc64810e1dbcf83cfc677 Mon Sep 17 00:00:00 2001 From: fiskhandlarn Date: Tue, 25 Nov 2025 16:39:35 +0100 Subject: [PATCH 2/4] chore: normalize quote style --- nginx-block-ai-bots.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index aefb9e2..9a7bdc4 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -4,7 +4,7 @@ if ($http_user_agent ~* "(AddSearchBot|AI2Bot|Ai2Bot\-Dolma|aiHitBot|AmazonBuyFo set $block 1; } -if ($request_uri = '/robots.txt') { +if ($request_uri = "/robots.txt") { set $block 0; } From 2679fcad34088ce1f6a38bc054a1242b90a382df Mon Sep 17 00:00:00 2001 From: fiskhandlarn Date: Tue, 25 Nov 2025 16:46:31 +0100 Subject: [PATCH 3/4] feat: update nginx generator --- code/robots.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/code/robots.py b/code/robots.py index 94a915b..26a4aca 100755 --- a/code/robots.py +++ b/code/robots.py @@ -108,10 +108,10 @@ def clean_robot_name(name): # This was specifically spotted in "Perplexity-User" # Looks like a non-breaking hyphen introduced by the HTML rendering software # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots - # You can see the bot is listed several times as "Perplexity-User" with a normal hyphen, + # You can see the bot is listed several times as "Perplexity-User" with a normal hyphen, # and it's only the Row-Heading that has the special hyphen - # - # Technically, there's no reason there wouldn't someday be a bot that + # + # Technically, there's no reason there wouldn't someday be a bot that # actually uses a non-breaking hyphen, but that seems unlikely, # so this solution should be fine for now. result = re.sub(r"\u2011", "-", name) @@ -173,9 +173,9 @@ def json_to_htaccess(robot_json): return htaccess def json_to_nginx(robot_json): - # Creates an Nginx config file. This config snippet can be included in + # Creates an Nginx config file. This config snippet can be included in # nginx server{} blocks to block AI bots. - config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}" + config = f"set $block 0;\n\nif ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n set $block 1;\n}}\n\nif ($request_uri = \"/robots.txt\") {{\n set $block 0;\n}}\n\nif ($block) {{\n return 403;\n}}" return config @@ -225,7 +225,7 @@ def conversions(): file_name="./Caddyfile", converter=json_to_caddy, ) - + update_file_if_changed( file_name="./haproxy-block-ai-bots.txt", converter=json_to_haproxy, From a6cf6b204b9581db5b0bb7353ba69dd237caabe8 Mon Sep 17 00:00:00 2001 From: fiskhandlarn Date: Tue, 25 Nov 2025 16:47:04 +0100 Subject: [PATCH 4/4] test: update test nginx conf --- code/test_files/nginx-block-ai-bots.conf | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/code/test_files/nginx-block-ai-bots.conf b/code/test_files/nginx-block-ai-bots.conf index c569b15..d5e3cc6 100644 --- a/code/test_files/nginx-block-ai-bots.conf +++ b/code/test_files/nginx-block-ai-bots.conf @@ -1,3 +1,13 @@ +set $block 0; + if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") { + set $block 1; +} + +if ($request_uri = "/robots.txt") { + set $block 0; +} + +if ($block) { return 403; } \ No newline at end of file