feat: default config macro (#1186)

* feat(data): add default-config macro Closes #1152 Signed-off-by: Xe Iaso <me@xeiaso.net> * docs: update CHANGELOG Signed-off-by: Xe Iaso <me@xeiaso.net> * test: add default-config-macro smoke test This uses an AI generated python script to diff the contents of the bots field of the default configuration file and the data/meta/default-config.yaml file. It emits a patch showing what needs to be changed. Signed-off-by: Xe Iaso <me@xeiaso.net> --------- Signed-off-by: Xe Iaso <me@xeiaso.net>
2025-10-13 11:33:16 -04:00 · 2025-10-13 11:33:16 -04:00 · ffbbdce3da
commit ffbbdce3da
parent c09c86778d
6 changed files with 221 additions and 0 deletions
--- a/.github/workflows/smoke-tests.yml
+++ b/.github/workflows/smoke-tests.yml
@ -14,6 +14,7 @@ jobs:
    strategy:
      matrix:
        test:
+          - default-config-macro
          - double_slash
          - forced-language
          - git-clone
--- a/data/botPolicies.yaml
+++ b/data/botPolicies.yaml
@ -11,6 +11,9 @@
 ## /usr/share/docs/anubis/data or in the tarball you extracted Anubis from.

 bots:
+  # You can import the entire default config with this macro:
+  # - import: (data)/meta/default-config.yaml
+
  # Pathological bots to deny
  - # This correlates to data/bots/_deny-pathological.yaml in the source tree
    # https://github.com/TecharoHQ/anubis/blob/main/data/bots/_deny-pathological.yaml
--- a/data/meta/default-config.yaml
+++ b/data/meta/default-config.yaml
@ -0,0 +1,127 @@
+- # Pathological bots to deny
+  # This correlates to data/bots/_deny-pathological.yaml in the source tree
+  # https://github.com/TecharoHQ/anubis/blob/main/data/bots/_deny-pathological.yaml
+  import: (data)/bots/_deny-pathological.yaml
+- import: (data)/bots/aggressive-brazilian-scrapers.yaml
+
+# Aggressively block AI/LLM related bots/agents by default
+- import: (data)/meta/ai-block-aggressive.yaml
+
+# Consider replacing the aggressive AI policy with more selective policies:
+# - import: (data)/meta/ai-block-moderate.yaml
+# - import: (data)/meta/ai-block-permissive.yaml
+
+# Search engine crawlers to allow, defaults to:
+#   - Google (so they don't try to bypass Anubis)
+#   - Apple
+#   - Bing
+#   - DuckDuckGo
+#   - Qwant
+#   - The Internet Archive
+#   - Kagi
+#   - Marginalia
+#   - Mojeek
+- import: (data)/crawlers/_allow-good.yaml
+# Challenge Firefox AI previews
+- import: (data)/clients/x-firefox-ai.yaml
+
+# Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
+- import: (data)/common/keep-internet-working.yaml
+
+# # Punish any bot with "bot" in the user-agent string
+# # This is known to have a high false-positive rate, use at your own risk
+# - name: generic-bot-catchall
+#   user_agent_regex: (?i:bot|crawler)
+#   action: CHALLENGE
+#   challenge:
+#     difficulty: 16  # impossible
+#     report_as: 4    # lie to the operator
+#     algorithm: slow # intentionally waste CPU cycles and time
+
+# Requires a subscription to Thoth to use, see
+# https://anubis.techaro.lol/docs/admin/thoth#geoip-based-filtering
+- name: countries-with-aggressive-scrapers
+  action: WEIGH
+  geoip:
+    countries:
+      - BR
+      - CN
+  weight:
+    adjust: 10
+
+# Requires a subscription to Thoth to use, see
+# https://anubis.techaro.lol/docs/admin/thoth#asn-based-filtering
+- name: aggressive-asns-without-functional-abuse-contact
+  action: WEIGH
+  asns:
+    match:
+      - 13335 # Cloudflare
+      - 136907 # Huawei Cloud
+      - 45102 # Alibaba Cloud
+  weight:
+    adjust: 10
+
+# ## System load based checks.
+# # If the system is under high load, add weight.
+# - name: high-load-average
+#   action: WEIGH
+#   expression: load_1m >= 10.0 # make sure to end the load comparison in a .0
+#   weight:
+#     adjust: 20
+
+## If your backend service is running on the same operating system as Anubis,
+## you can uncomment this rule to make the challenge easier when the system is
+## under low load.
+##
+## If the system is not under high load, remove weight.
+# - name: low-load-average
+#   action: WEIGH
+#   expression: load_15m <= 4.0 # make sure to end the load comparison in a .0
+#   weight:
+#     adjust: -10
+
+# Assert behaviour that only genuine browsers display. Requests that claim to
+# be Firefox, Chrome, or Safari and send the usual browser headers lose weight.
+- name: realistic-browser-catchall
+  expression:
+    all:
+      - '"User-Agent" in headers'
+      - '( userAgent.contains("Firefox") ) || ( userAgent.contains("Chrome") ) || ( userAgent.contains("Safari") )'
+      - '"Accept" in headers'
+      - '"Sec-Fetch-Dest" in headers'
+      - '"Sec-Fetch-Mode" in headers'
+      - '"Sec-Fetch-Site" in headers'
+      - '"Upgrade-Insecure-Requests" in headers'
+      - '"Accept-Encoding" in headers'
+      - '( headers["Accept-Encoding"].contains("zstd") || headers["Accept-Encoding"].contains("br") )'
+      - '"Accept-Language" in headers'
+  action: WEIGH
+  weight:
+    adjust: -10
+
+# Chrome should behave like Chrome
+- name: chrome-is-proper
+  expression:
+    all:
+      - userAgent.contains("Chrome")
+      - '"Sec-Ch-Ua" in headers'
+      - 'headers["Sec-Ch-Ua"].contains("Chromium")'
+      - '"Sec-Ch-Ua-Mobile" in headers'
+      - '"Sec-Ch-Ua-Platform" in headers'
+  action: WEIGH
+  weight:
+    adjust: -5
+
+- name: should-have-accept
+  expression: '!("Accept" in headers)'
+  action: WEIGH
+  weight:
+    adjust: 5
+
+# Generic catchall rule
+- name: generic-browser
+  user_agent_regex: >-
+    Mozilla|Opera
+  action: WEIGH
+  weight:
+    adjust: 10
--- a/docs/docs/CHANGELOG.md
+++ b/docs/docs/CHANGELOG.md
@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 <!-- This changes the project to: -->

+- Add `(data)/meta/default-config.yaml` for importing the entire default configuration at once.
 - Add `-custom-real-ip-header` flag to get the original request IP from a different header than `x-real-ip`.
 - Add `contentLength` variable to bot expressions.
 - Add `COOKIE_SAME_SITE_MODE` to force anubis cookies SameSite value, and downgrade automatically from `None` to `Lax` if cookie is insecure.
--- a/test/default-config-macro/compare_bots.py
+++ b/test/default-config-macro/compare_bots.py
@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+Script to verify that the 'bots' field in data/botPolicies.yaml
+has the same semantic contents as data/meta/default-config.yaml.
+
+CW: generated by AI
+"""
+
+import yaml
+import sys
+import os
+import subprocess
+import difflib
+
+def load_yaml(file_path):
+    """Parse the YAML file at ``file_path`` and return the resulting data.
+
+    The file is always decoded as UTF-8 so the result does not depend on the
+    locale of the machine running the test.
+
+    Args:
+        file_path: Path of the YAML file to read.
+
+    Returns:
+        Whatever ``yaml.safe_load`` produces for the document.
+
+    Exits the process with status 1, after printing the error, when the file
+    cannot be opened, decoded, or parsed.
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return yaml.safe_load(f)
+    except Exception as e:
+        # Deliberately broad: any failure to load an input is fatal for this
+        # check, and a one-line message is more useful in CI than a traceback.
+        print(f"Error loading {file_path}: {e}")
+        sys.exit(1)
+
+def normalize_yaml(data):
+    """Return the parsed YAML data unchanged (currently a no-op hook)."""
+    # Nothing to normalize: safe_load has already dropped YAML comments, so the data is returned as is
+    return data
+
+def get_repo_root():
+    """Return the top-level directory of the enclosing git work tree.
+
+    Runs ``git rev-parse --show-toplevel`` in the current working directory
+    and returns its output with surrounding whitespace removed.
+
+    Exits the process with status 1 when the git executable cannot be found
+    or when git reports an error (for example, outside a repository).
+    """
+    try:
+        result = subprocess.run(
+            ['git', 'rev-parse', '--show-toplevel'],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+    except FileNotFoundError:
+        # Raised by subprocess when the git executable itself is missing.
+        print("Error: git executable not found")
+        sys.exit(1)
+    except subprocess.CalledProcessError:
+        print("Error: Not in a git repository")
+        sys.exit(1)
+    return result.stdout.strip()
+
+def main():
+    """Compare the ``bots`` list of botPolicies.yaml with default-config.yaml.
+
+    Exits with status 0 when both parse to equal data. Otherwise prints a
+    unified diff of the two structures (re-serialized with ``yaml.dump``)
+    and exits with status 1.
+    """
+    # Both files are located relative to the repository root, so the script
+    # can be started from any directory inside the repository.
+    repo_root = get_repo_root()
+    bot_policies_path = os.path.join(repo_root, 'data', 'botPolicies.yaml')
+    default_config_path = os.path.join(repo_root, 'data', 'meta', 'default-config.yaml')
+
+    bot_policies = load_yaml(bot_policies_path)
+    default_config = load_yaml(default_config_path)
+
+    # An empty botPolicies.yaml parses to None, for which the ``in`` test
+    # would raise TypeError; report any non-mapping like a missing key.
+    if not isinstance(bot_policies, dict) or 'bots' not in bot_policies:
+        print("Error: 'bots' field not found in botPolicies.yaml")
+        sys.exit(1)
+
+    # default-config.yaml is the rule list itself, with no wrapping key.
+    normalized_bots = normalize_yaml(bot_policies['bots'])
+    normalized_default = normalize_yaml(default_config)
+
+    if normalized_bots == normalized_default:
+        print("SUCCESS: The 'bots' field in botPolicies.yaml matches the contents of default-config.yaml")
+        sys.exit(0)
+
+    print("FAILURE: The 'bots' field in botPolicies.yaml does not match the contents of default-config.yaml")
+    print("\nDiff:")
+    # Dump both sides through the same serializer so the diff shows only
+    # differences in the parsed data, not in the files' formatting.
+    bots_yaml = yaml.dump(normalized_bots, default_flow_style=False)
+    default_yaml = yaml.dump(normalized_default, default_flow_style=False)
+    diff = difflib.unified_diff(
+        bots_yaml.splitlines(keepends=True),
+        default_yaml.splitlines(keepends=True),
+        fromfile='bots field in botPolicies.yaml',
+        tofile='default-config.yaml'
+    )
+    print(''.join(diff))
+    sys.exit(1)
+
+if __name__ == "__main__":
+    main()
--- a/test/default-config-macro/test.sh
+++ b/test/default-config-macro/test.sh
@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+# Smoke test: runs compare_bots.py and fails if it exits non-zero.
+set -euo pipefail
+
+# Run from this script's directory so the relative ./compare_bots.py path resolves.
+cd "$(dirname "$0")"
+# Fail early if the yaml module that compare_bots.py imports is not installed.
+python3 -c 'import yaml'
+python3 ./compare_bots.py