nuke/data/botPolicies.yaml
Corry Haines de7dbfe6d6
Split up AI filtering files (#592)
* Split up AI filtering files

Create aggressive/moderate/permissive policies to allow administrators to choose their AI/LLM stance.

Aggressive policy matches existing default in Anubis.

Removes `Google-Extended` flag from `ai-robots-txt.yaml` as it doesn't exist in requests.

Rename `ai-robots-txt.yaml` to `ai-catchall.yaml` as the file is no longer a copy of the source repo/file.

* chore: spelling

* chore: fix embeds

* chore: fix data includes

* chore: fix file name typo

* chore: Ignore READMEs in configs

* chore(lib/policy/config): go tool goimports -w

Signed-off-by: Xe Iaso <me@xeiaso.net>

---------

Signed-off-by: Xe Iaso <me@xeiaso.net>
Co-authored-by: Xe Iaso <me@xeiaso.net>
2025-06-01 20:21:18 +00:00

## Anubis has the ability to let you import snippets of configuration into the main
## configuration file. This allows you to break up your config into smaller parts
## that get logically assembled into one big file.
##
## Of note, a bot rule can either have inline bot configuration or import a
## bot config snippet. You cannot do both in a single bot rule.
##
## Import paths can either be prefixed with (data) to import from the common/shared
## rules in the data folder of the Anubis source tree, or they can point to absolute
## or relative paths in your filesystem. If you don't have access to the Anubis
## source tree, check /usr/share/docs/anubis/data or the tarball you extracted Anubis from.
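##
## For illustration only (the inline rule name and regex below are made up, not
## shipped with Anubis), the two mutually exclusive forms of a bot rule look
## roughly like this:
##
##   bots:
##     # either import a snippet...
##     - import: (data)/bots/_deny-pathological.yaml
##     # ...or configure the rule inline, but never both in the same rule
##     - name: example-inline-rule
##       user_agent_regex: ExampleBot
##       action: DENY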
bots:
  # Pathological bots to deny
  - # This correlates to data/bots/deny-pathological.yaml in the source tree
    # https://github.com/TecharoHQ/anubis/blob/main/data/bots/deny-pathological.yaml
    import: (data)/bots/_deny-pathological.yaml
  - import: (data)/bots/aggressive-brazilian-scrapers.yaml
  # Aggressively block AI/LLM related bots/agents by default
  - import: (data)/meta/ai-block-aggressive.yaml
  # Consider replacing the aggressive AI policy with more selective policies:
  # - import: (data)/meta/ai-block-moderate.yaml
  # - import: (data)/meta/ai-block-permissive.yaml
  # Search engine crawlers to allow, defaults to:
  #   - Google (so they don't try to bypass Anubis)
  #   - Apple
  #   - Bing
  #   - DuckDuckGo
  #   - Qwant
  #   - The Internet Archive
  #   - Kagi
  #   - Marginalia
  #   - Mojeek
  - import: (data)/crawlers/_allow-good.yaml
  # Challenge Firefox AI previews
  - import: (data)/clients/x-firefox-ai.yaml
  # Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
  - import: (data)/common/keep-internet-working.yaml
  # # Punish any bot with "bot" in the user-agent string
  # # This is known to have a high false-positive rate, use at your own risk
  # - name: generic-bot-catchall
  #   user_agent_regex: (?i:bot|crawler)
  #   action: CHALLENGE
  #   challenge:
  #     difficulty: 16  # impossible
  #     report_as: 4    # lie to the operator
  #     algorithm: slow # intentionally waste CPU cycles and time
  # Generic catchall rule
  - name: generic-browser
    user_agent_regex: >-
      Mozilla|Opera
    action: CHALLENGE

dnsbl: false

# By default, send HTTP 200 back to clients that either get issued a challenge
# or a denial. This seems weird, but this is load-bearing due to the fact that
# the most aggressive scraper bots seem to really, really, want an HTTP 200 and
# will stop sending requests once they get it.
status_codes:
  CHALLENGE: 200
  DENY: 200
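
## The status_codes block above can be overridden per deployment. As a sketch
## only (the values are illustrative, not a recommendation; the comment above
## explains why 200 is the default), more conventional codes would look like:
##
##   status_codes:
##     CHALLENGE: 401
##     DENY: 403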