Split up AI filtering files (#592)
* Split up AI filtering files

  Create aggressive/moderate/permissive policies to allow administrators
  to choose their AI/LLM stance. The aggressive policy matches the
  existing default in Anubis.

  Removes the `Google-Extended` flag from `ai-robots-txt.yaml`, as it is a
  robots.txt-only token that never appears in request User-Agent headers.

  Renames `ai-robots-txt.yaml` to `ai-catchall.yaml`, as the file is no
  longer a copy of the source repo/file.

* chore: spelling
* chore: fix embeds
* chore: fix data includes
* chore: fix file name typo
* chore: Ignore READMEs in configs
* chore(lib/policy/config): go tool goimports -w

Signed-off-by: Xe Iaso <me@xeiaso.net>

---------

Signed-off-by: Xe Iaso <me@xeiaso.net>
Co-authored-by: Xe Iaso <me@xeiaso.net>
This commit is contained in:
parent
77e0bbbce9
commit
de7dbfe6d6
19 changed files with 107 additions and 18 deletions
data/botPolicies.json

@@ -4,7 +4,7 @@
     "import": "(data)/bots/_deny-pathological.yaml"
   },
   {
-    "import": "(data)/bots/ai-robots-txt.yaml"
+    "import": "(data)/meta/ai-block-aggressive.yaml"
  },
   {
     "import": "(data)/crawlers/_allow-good.yaml"
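Unlike the YAML variant below, the JSON policy file carries no commented-out alternatives, so choosing a different stance means editing the import in place. A sketch of the same slot with the moderate stance selected instead (the surrounding object structure is assumed to match the context lines above):

    {
      "import": "(data)/meta/ai-block-moderate.yaml"
    },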
data/botPolicies.yaml

@@ -17,8 +17,12 @@ bots:
   - import: (data)/bots/_deny-pathological.yaml
   - import: (data)/bots/aggressive-brazilian-scrapers.yaml
 
-  # Enforce https://github.com/ai-robots-txt/ai.robots.txt
-  - import: (data)/bots/ai-robots-txt.yaml
+  # Aggressively block AI/LLM related bots/agents by default
+  - import: (data)/meta/ai-block-aggressive.yaml
+
+  # Consider replacing the aggressive AI policy with more selective policies:
+  # - import: (data)/meta/ai-block-moderate.yaml
+  # - import: (data)/meta/ai-block-permissive.yaml
 
   # Search engine crawlers to allow, defaults to:
   # - Google (so they don't try to bypass Anubis)
data/bots/ai-catchall.yaml (new file, 11 lines)
@@ -0,0 +1,11 @@
+# Extensive list of AI-affiliated agents based on https://github.com/ai-robots-txt/ai.robots.txt
+# Add new/undocumented agents here. Where documentation exists, consider moving them to dedicated policy files.
+# Notes on various agents:
+# - Amazonbot: Well documented, but they refuse to state which agent collects training data.
+# - anthropic-ai/Claude-Web: Undocumented by Anthropic. Possibly deprecated, or a hallucination?
+# - Perplexity*: Well documented, but they refuse to state which agent collects training data.
+# Warning: May contain user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
+- name: "ai-catchall"
+  user_agent_regex: >-
+    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|Claude-Web|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|GoogleOther|GoogleOther-Image|GoogleOther-Video|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|NovaAct|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
+  action: DENY
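A quick way to sanity-check the pattern above is a standalone regex test. This is a minimal sketch assuming, as the field name suggests, that user_agent_regex is an unanchored RE2 pattern matched against the request's User-Agent header; the pattern is abbreviated to a few agents from the full list:

    package main

    import (
    	"fmt"
    	"regexp"
    )

    func main() {
    	// Abbreviated form of the ai-catchall pattern above.
    	pattern := regexp.MustCompile(`AI2Bot|Amazonbot|Bytespider|CCBot|Diffbot`)

    	for _, ua := range []string{
    		"Mozilla/5.0 (compatible; Bytespider; spider-feedback@bytedance.com)",
    		"Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0",
    	} {
    		fmt.Printf("match=%v  %s\n", pattern.MatchString(ua), ua)
    	}
    }

Only the first User-Agent should match; substrings anywhere in the header trigger the rule, so no anchors are needed.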
data/bots/ai-robots-txt.yaml (deleted, 6 lines)

@@ -1,6 +0,0 @@
-# Warning: Contains user agents that _must_ be blocked in robots.txt, or the opt-out will have no effect.
-# Note: Blocks human-directed/non-training user agents
-- name: "ai-robots-txt"
-  user_agent_regex: >-
-    AI2Bot|Ai2Bot-Dolma|aiHitBot|Amazonbot|anthropic-ai|Brightbot 1.0|Bytespider|CCBot|ChatGPT-User|Claude-SearchBot|Claude-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google-CloudVertexBot|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo Bot|meta-externalagent|Meta-ExternalAgent|meta-externalfetcher|Meta-ExternalFetcher|MistralAI-User/1.0|NovaAct|OAI-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade indexer bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio-Extended|wpbot|YouBot
-  action: DENY
data/clients/ai.yaml (new file, 8 lines)
@@ -0,0 +1,8 @@
+# User agents that act on behalf of humans in AI tools, e.g. searching the web.
+# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions:
+# - Claude-User: No published IP allowlist
+- name: "ai-clients"
+  user_agent_regex: >-
+    ChatGPT-User|Claude-User|MistralAI-User
+  action: DENY
data/crawlers/ai-search.yaml (new file, 8 lines)
@@ -0,0 +1,8 @@
+# User agents that index exclusively for search in AI systems.
+# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions:
+# - Claude-SearchBot: No published IP allowlist
+- name: "ai-crawlers-search"
+  user_agent_regex: >-
+    OAI-SearchBot|Claude-SearchBot
+  action: DENY
data/crawlers/ai-training.yaml (new file, 8 lines)
@@ -0,0 +1,8 @@
+# User agents that crawl to gather training data for AI/LLM systems.
+# Each entry should have a positive/ALLOW entry created as well, with further documentation.
+# Exceptions:
+# - ClaudeBot: No published IP allowlist
+- name: "ai-crawlers-training"
+  user_agent_regex: >-
+    GPTBot|ClaudeBot
+  action: DENY
data/embed.go

@@ -3,6 +3,6 @@ package data
 import "embed"
 
 var (
-	//go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers
+	//go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:clients all:common all:crawlers all:meta
 	BotPolicies embed.FS
 )
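The all:meta addition is what makes the new stance files reachable at runtime; without any meta entry in the directive, the new directory would not be embedded at all. A minimal sketch of reading one of them back from the embedded filesystem (module path assumed to be github.com/TecharoHQ/anubis):

    package main

    import (
    	"fmt"

    	"github.com/TecharoHQ/anubis/data"
    )

    func main() {
    	// Fails with "file does not exist" if the meta directory is
    	// missing from the //go:embed directive above.
    	b, err := data.BotPolicies.ReadFile("meta/ai-block-aggressive.yaml")
    	if err != nil {
    		panic(err)
    	}
    	fmt.Print(string(b))
    }

Paths passed to an embed.FS are relative to the embedding package's directory, hence "meta/..." rather than "data/meta/...".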
data/meta/README.md (new file, 5 lines)
@@ -0,0 +1,5 @@
+# meta policies
+
+Contains policies that exclusively reference policies in _multiple_ other data folders.
+
+These are akin to "stances" an administrator can take on a given topic, such as AI/LLM systems.
data/meta/ai-block-aggressive.yaml (new file, 6 lines)
@@ -0,0 +1,6 @@
+# Blocks all AI/LLM-associated user agents, regardless of purpose or human agency.
+# Warning: To completely block some AI/LLM training, such as Google's, you _must_ also place flags in robots.txt.
+- import: (data)/bots/ai-catchall.yaml
+- import: (data)/clients/ai.yaml
+- import: (data)/crawlers/ai-search.yaml
+- import: (data)/crawlers/ai-training.yaml
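The warning above refers to opt-out tokens that exist only in robots.txt and never appear in request headers; Google-Extended, dropped from the request-matching list in this commit, is one such token. A minimal robots.txt sketch for that case:

    User-agent: Google-Extended
    Disallow: /

Anubis can only act on User-Agent strings it actually sees, so robots.txt-only opt-outs like this must be handled alongside any of the three stances.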
data/meta/ai-block-moderate.yaml (new file, 7 lines)
@@ -0,0 +1,7 @@
+# Blocks all AI/LLM bots used for training or for unknown/undocumented purposes.
+# Permits user agents with explicitly documented non-training use and published IP allowlists.
+- import: (data)/bots/ai-catchall.yaml
+- import: (data)/crawlers/ai-training.yaml
+- import: (data)/crawlers/openai-searchbot.yaml
+- import: (data)/clients/openai-chatgpt-user.yaml
+- import: (data)/clients/mistral-mistralai-user.yaml
data/meta/ai-block-permissive.yaml (new file, 6 lines)
@@ -0,0 +1,6 @@
+# Permits all well-documented AI/LLM user agents that publish IP allowlists; undocumented agents are still denied via the catch-all list.
+- import: (data)/bots/ai-catchall.yaml
+- import: (data)/crawlers/openai-searchbot.yaml
+- import: (data)/crawlers/openai-gptbot.yaml
+- import: (data)/clients/openai-chatgpt-user.yaml
+- import: (data)/clients/mistral-mistralai-user.yaml
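The per-vendor files imported by the moderate and permissive stances (openai-searchbot.yaml, openai-gptbot.yaml, openai-chatgpt-user.yaml, mistral-mistralai-user.yaml) are not shown in this diff. Per the comments in the DENY files above, each is expected to be a positive/ALLOW counterpart to one agent in the catch-all. A hypothetical sketch of one such entry, with fields assumed to mirror the DENY entries shown here:

    - name: "openai-chatgpt-user"
      user_agent_regex: >-
        ChatGPT-User
      action: ALLOW

Because these ALLOW entries are imported after the catch-all DENY in the stance files, their relative ordering is what lets a documented agent through while the blanket rule still catches everything else.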