From 237a6a98e268df70f2655809010c2b87ce3fca53 Mon Sep 17 00:00:00 2001 From: Dryusdan Date: Mon, 18 Aug 2025 12:52:23 +0200 Subject: [PATCH] Bump ai.robots.txt to v1.39 (#982) --- data/bots/ai-robots-txt.yaml | 2 +- docs/docs/CHANGELOG.md | 1 + web/static/robots.txt | 22 ++++++++++++++++++---- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/data/bots/ai-robots-txt.yaml b/data/bots/ai-robots-txt.yaml index c330eb7..d3dbe1b 100644 --- a/data/bots/ai-robots-txt.yaml +++ b/data/bots/ai-robots-txt.yaml @@ -4,5 +4,5 @@ # CCBot is allowed because if Common Crawl is allowed, then scrapers don't need to scrape to get the data. - name: "ai-robots-txt" user_agent_regex: >- - AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|Andibot|anthropic-ai|Applebot|Applebot-Extended|bedrockbot|Brightbot 1.0|Bytespider|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|MyCentralAIScraperBot|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient.com|Perplexity-User|PerplexityBot|PetalBot|PhindBot|Poseidon Research Crawler|QualifiedBot|QuillBot|quillbot.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot-BA|SemrushBot-CT|SemrushBot-OCOB|SemrushBot-SI|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot + AddSearchBot|AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|Andibot|anthropic-ai|Applebot|Applebot-Extended|Awario|bedrockbot|bigsur.ai|Brightbot 1.0|Bytespider|CCBot|ChatGPT Agent|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|CloudVertexBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Datenbank Crawler|Devin|Diffbot|DuckAssistBot|Echobot Bot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini-Deep-Research|Google-CloudVertexBot|Google-Extended|GoogleAgent-Mariner|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo Bot|LinerBot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User|MistralAI-User/1.0|MyCentralAIScraperBot|netEstate Imprint Crawler|NovaAct|OAI-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient.com|Perplexity-User|PerplexityBot|PetalBot|PhindBot|Poseidon Research Crawler|QualifiedBot|QuillBot|quillbot.com|SBIntuitionsBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|Thinkbot|TikTokSpider|Timpibot|VelenPublicWebCrawler|WARDBot|Webzio-Extended|wpbot|YaK|YandexAdditional|YandexAdditionalBot|YouBot action: DENY diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index 75bfd4c..7811bed 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -37,6 +37,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - The default patterns in `data/common/keep-internet-working.yaml` have been updated to appropriately escape the '.' character in the regular expression patterns. - Add optional restrictions for JWT based on the value of a header ([#697](https://github.com/TecharoHQ/anubis/pull/697)) - The word "hack" has been removed from the translation strings for Anubis due to incidents involving people misunderstanding that word and sending particularly horrible things to the project lead over email. +- Bump AI-robots.txt to version 1.39 ### Breaking changes diff --git a/web/static/robots.txt b/web/static/robots.txt index 6e65c42..e5c518f 100644 --- a/web/static/robots.txt +++ b/web/static/robots.txt @@ -1,3 +1,4 @@ +User-agent: AddSearchBot User-agent: AI2Bot User-agent: Ai2Bot-Dolma User-agent: aiHitBot @@ -6,28 +7,38 @@ User-agent: Andibot User-agent: anthropic-ai User-agent: Applebot User-agent: Applebot-Extended +User-agent: Awario User-agent: bedrockbot +User-agent: bigsur.ai User-agent: Brightbot 1.0 User-agent: Bytespider +User-agent: CCBot +User-agent: ChatGPT Agent User-agent: ChatGPT-User User-agent: Claude-SearchBot User-agent: Claude-User User-agent: Claude-Web User-agent: ClaudeBot +User-agent: CloudVertexBot User-agent: cohere-ai User-agent: cohere-training-data-crawler User-agent: Cotoyogi User-agent: Crawlspace +User-agent: Datenbank Crawler +User-agent: Devin User-agent: Diffbot User-agent: DuckAssistBot +User-agent: Echobot Bot User-agent: EchoboxBot User-agent: FacebookBot User-agent: facebookexternalhit User-agent: Factset_spyderbot User-agent: FirecrawlAgent User-agent: FriendlyCrawler +User-agent: Gemini-Deep-Research User-agent: Google-CloudVertexBot User-agent: Google-Extended +User-agent: GoogleAgent-Mariner User-agent: GoogleOther User-agent: GoogleOther-Image User-agent: GoogleOther-Video @@ -38,16 +49,20 @@ User-agent: ImagesiftBot User-agent: img2dataset User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot +User-agent: LinerBot User-agent: meta-externalagent User-agent: Meta-ExternalAgent User-agent: meta-externalfetcher User-agent: Meta-ExternalFetcher +User-agent: MistralAI-User User-agent: MistralAI-User/1.0 User-agent: MyCentralAIScraperBot +User-agent: netEstate Imprint Crawler User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili User-agent: omgilibot +User-agent: OpenAI User-agent: Operator User-agent: PanguBot User-agent: Panscient @@ -62,18 +77,17 @@ User-agent: QuillBot User-agent: quillbot.com User-agent: SBIntuitionsBot User-agent: Scrapy -User-agent: SemrushBot -User-agent: SemrushBot-BA -User-agent: SemrushBot-CT User-agent: SemrushBot-OCOB -User-agent: SemrushBot-SI User-agent: SemrushBot-SWA User-agent: Sidetrade indexer bot +User-agent: Thinkbot User-agent: TikTokSpider User-agent: Timpibot User-agent: VelenPublicWebCrawler +User-agent: WARDBot User-agent: Webzio-Extended User-agent: wpbot +User-agent: YaK User-agent: YandexAdditional User-agent: YandexAdditionalBot User-agent: YouBot