From ffbbdce3da118bab728c60a15d3b47456b66f417 Mon Sep 17 00:00:00 2001 From: Xe Iaso Date: Mon, 13 Oct 2025 11:33:16 -0400 Subject: [PATCH] feat: default config macro (#1186) * feat(data): add default-config macro Closes #1152 Signed-off-by: Xe Iaso * docs: update CHANGELOG Signed-off-by: Xe Iaso * test: add default-config-macro smoke test This uses an AI generated python script to diff the contents of the bots field of the default configuration file and the data/meta/default-config.yaml file. It emits a patch showing what needs to be changed. Signed-off-by: Xe Iaso --------- Signed-off-by: Xe Iaso --- .github/workflows/smoke-tests.yml | 1 + data/botPolicies.yaml | 3 + data/meta/default-config.yaml | 127 ++++++++++++++++++++++ docs/docs/CHANGELOG.md | 1 + test/default-config-macro/compare_bots.py | 82 ++++++++++++++ test/default-config-macro/test.sh | 7 ++ 6 files changed, 221 insertions(+) create mode 100644 data/meta/default-config.yaml create mode 100644 test/default-config-macro/compare_bots.py create mode 100755 test/default-config-macro/test.sh diff --git a/.github/workflows/smoke-tests.yml b/.github/workflows/smoke-tests.yml index ef1a834..d34ee9b 100644 --- a/.github/workflows/smoke-tests.yml +++ b/.github/workflows/smoke-tests.yml @@ -14,6 +14,7 @@ jobs: strategy: matrix: test: + - default-config-macro - double_slash - forced-language - git-clone diff --git a/data/botPolicies.yaml b/data/botPolicies.yaml index 20b1fb7..25ed7af 100644 --- a/data/botPolicies.yaml +++ b/data/botPolicies.yaml @@ -11,6 +11,9 @@ ## /usr/share/docs/anubis/data or in the tarball you extracted Anubis from. 
bots: + # You can import the entire default config with this macro: + # - import: (data)/meta/default-config.yaml + # Pathological bots to deny - # This correlates to data/bots/_deny-pathological.yaml in the source tree # https://github.com/TecharoHQ/anubis/blob/main/data/bots/_deny-pathological.yaml diff --git a/data/meta/default-config.yaml b/data/meta/default-config.yaml new file mode 100644 index 0000000..d239094 --- /dev/null +++ b/data/meta/default-config.yaml @@ -0,0 +1,127 @@ +- # Pathological bots to deny + # This correlates to data/bots/_deny-pathological.yaml in the source tree + # https://github.com/TecharoHQ/anubis/blob/main/data/bots/_deny-pathological.yaml + import: (data)/bots/_deny-pathological.yaml +- import: (data)/bots/aggressive-brazilian-scrapers.yaml + +# Aggressively block AI/LLM related bots/agents by default +- import: (data)/meta/ai-block-aggressive.yaml + +# Consider replacing the aggressive AI policy with more selective policies: +# - import: (data)/meta/ai-block-moderate.yaml +# - import: (data)/meta/ai-block-permissive.yaml + +# Search engine crawlers to allow, defaults to: +# - Google (so they don't try to bypass Anubis) +# - Apple +# - Bing +# - DuckDuckGo +# - Qwant +# - The Internet Archive +# - Kagi +# - Marginalia +# - Mojeek +- import: (data)/crawlers/_allow-good.yaml +# Challenge Firefox AI previews +- import: (data)/clients/x-firefox-ai.yaml + +# Allow common "keeping the internet working" routes (well-known, favicon, robots.txt) +- import: (data)/common/keep-internet-working.yaml + +# # Punish any bot with "bot" in the user-agent string +# # This is known to have a high false-positive rate, use at your own risk +# - name: generic-bot-catchall +# user_agent_regex: (?i:bot|crawler) +# action: CHALLENGE +# challenge: +# difficulty: 16 # impossible +# report_as: 4 # lie to the operator +# algorithm: slow # intentionally waste CPU cycles and time + +# Requires a subscription to Thoth to use, see +# 
https://anubis.techaro.lol/docs/admin/thoth#geoip-based-filtering +- name: countries-with-aggressive-scrapers + action: WEIGH + geoip: + countries: + - BR + - CN + weight: + adjust: 10 + +# Requires a subscription to Thoth to use, see +# https://anubis.techaro.lol/docs/admin/thoth#asn-based-filtering +- name: aggressive-asns-without-functional-abuse-contact + action: WEIGH + asns: + match: + - 13335 # Cloudflare + - 136907 # Huawei Cloud + - 45102 # Alibaba Cloud + weight: + adjust: 10 + +# ## System load based checks. +# # If the system is under high load, add weight. +# - name: high-load-average +# action: WEIGH +# expression: load_1m >= 10.0 # make sure to end the load comparison in a .0 +# weight: +# adjust: 20 + +## If your backend service is running on the same operating system as Anubis, +## you can uncomment this rule to make the challenge easier when the system is +## under low load. +## +## If it is not, remove weight. +# - name: low-load-average +# action: WEIGH +# expression: load_15m <= 4.0 # make sure to end the load comparison in a .0 +# weight: +# adjust: -10 + +# Assert behaviour that only genuine browsers display. 
Requests that claim to be Chrome, +# Firefox, or Safari and send the headers listed below have their weight reduced by 10. +- name: realistic-browser-catchall + expression: + all: + - '"User-Agent" in headers' + - '( userAgent.contains("Firefox") ) || ( userAgent.contains("Chrome") ) || ( userAgent.contains("Safari") )' + - '"Accept" in headers' + - '"Sec-Fetch-Dest" in headers' + - '"Sec-Fetch-Mode" in headers' + - '"Sec-Fetch-Site" in headers' + - '"Upgrade-Insecure-Requests" in headers' + - '"Accept-Encoding" in headers' + - '( headers["Accept-Encoding"].contains("zstd") || headers["Accept-Encoding"].contains("br") )' + - '"Accept-Language" in headers' + action: WEIGH + weight: + adjust: -10 + +# Chrome should behave like Chrome +- name: chrome-is-proper + expression: + all: + - userAgent.contains("Chrome") + - '"Sec-Ch-Ua" in headers' + - 'headers["Sec-Ch-Ua"].contains("Chromium")' + - '"Sec-Ch-Ua-Mobile" in headers' + - '"Sec-Ch-Ua-Platform" in headers' + action: WEIGH + weight: + adjust: -5 + +- name: should-have-accept + expression: '!("Accept" in headers)' + action: WEIGH + weight: + adjust: 5 + +# Generic catchall rule +- name: generic-browser + user_agent_regex: >- + Mozilla|Opera + action: WEIGH + weight: + adjust: 10 diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index cd06330..56e3cb5 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +- Added `(data)/meta/default-config.yaml` for importing the entire default configuration at once. - Add `-custom-real-ip-header` flag to get the original request IP from a different header than `x-real-ip`. - Add `contentLength` variable to bot expressions. - Add `COOKIE_SAME_SITE_MODE` to force anubis cookies SameSite value, and downgrade automatically from `None` to `Lax` if cookie is insecure. 
diff --git a/test/default-config-macro/compare_bots.py b/test/default-config-macro/compare_bots.py
new file mode 100644
--- /dev/null
+++ b/test/default-config-macro/compare_bots.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""
+Script to verify that the 'bots' field in data/botPolicies.yaml
+has the same semantic contents as data/meta/default-config.yaml.
+
+On a mismatch, a unified diff from the 'bots' field to default-config.yaml
+is printed and the script exits with status 1.
+
+CW: generated by AI
+"""
+
+import difflib
+import os
+import subprocess
+import sys
+
+import yaml
+
+
+def fail(message):
+    """Print an error message to stderr and exit with status 1."""
+    print(message, file=sys.stderr)
+    sys.exit(1)
+
+
+def load_yaml(file_path):
+    """Load a YAML file and return the parsed data.
+
+    Exits with status 1 if the file cannot be read or parsed.
+    """
+    try:
+        # Read as UTF-8 explicitly so the result does not depend on the locale.
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return yaml.safe_load(f)
+    except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
+        fail(f"Error loading {file_path}: {e}")
+
+
+def normalize_yaml(data):
+    """Return the parsed YAML data unchanged.
+
+    safe_load already drops YAML comments, so the data is compared as is.
+    """
+    return data
+
+
+def get_repo_root():
+    """Return the root directory of the enclosing git repository.
+
+    Exits with status 1 if git is not installed or the current directory
+    is not inside a git repository.
+    """
+    try:
+        result = subprocess.run(
+            ['git', 'rev-parse', '--show-toplevel'],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+    except FileNotFoundError:
+        fail("Error: git executable not found")
+    except subprocess.CalledProcessError:
+        fail("Error: Not in a git repository")
+    return result.stdout.strip()
+
+
+def main():
+    """Compare the two rule lists and exit 0 on a match, 1 otherwise."""
+    repo_root = get_repo_root()
+
+    # Both files are located relative to the repository root.
+    bot_policies_path = os.path.join(repo_root, 'data', 'botPolicies.yaml')
+    default_config_path = os.path.join(repo_root, 'data', 'meta', 'default-config.yaml')
+
+    bot_policies = load_yaml(bot_policies_path)
+    default_config = load_yaml(default_config_path)
+
+    # An empty or non-mapping document has no 'bots' key to look up.
+    if not isinstance(bot_policies, dict) or 'bots' not in bot_policies:
+        fail("Error: 'bots' field not found in botPolicies.yaml")
+
+    # default-config.yaml is the list of rules itself, with no wrapping key.
+    normalized_bots = normalize_yaml(bot_policies['bots'])
+    normalized_default = normalize_yaml(default_config)
+
+    if normalized_bots == normalized_default:
+        print("SUCCESS: The 'bots' field in botPolicies.yaml matches the contents of default-config.yaml")
+        sys.exit(0)
+
+    print("FAILURE: The 'bots' field in botPolicies.yaml does not match the contents of default-config.yaml")
+    print("\nDiff:")
+    bots_yaml = yaml.dump(normalized_bots, default_flow_style=False)
+    default_yaml = yaml.dump(normalized_default, default_flow_style=False)
+    diff = difflib.unified_diff(
+        bots_yaml.splitlines(keepends=True),
+        default_yaml.splitlines(keepends=True),
+        fromfile='bots field in botPolicies.yaml',
+        tofile='default-config.yaml',
+    )
+    # The diff lines already end in newlines, so write them out verbatim.
+    sys.stdout.writelines(diff)
+    sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/default-config-macro/test.sh b/test/default-config-macro/test.sh
new file mode 100755
index 0000000..6e0b7ab
--- /dev/null
+++ b/test/default-config-macro/test.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+cd "$(dirname "$0")"
+python3 -c 'import yaml'
+python3 ./compare_bots.py
\ No newline at end of file