feat: add robots2policy CLI to convert robots.txt to Anubis CEL (#657)

* feat: add robots2policy CLI utility to convert robots.txt to Anubis challenge policies

* feat: add documentation for robots2policy CLI tool

* feat: implement crawl delay handling as weight adjustment in Anubis rules

* feat: add various robots.txt and YAML configurations for user agent handling and crawl delays

* test: add comprehensive tests for robots2policy conversion and parsing

* fix: update example URL in usage instructions for robots2policy CLI

* Update metadata

check-spelling run (pull_request) for json/robots2policycli

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

* docs: add crawl delay weight adjustment and deny user agents option to robots2policy CLI

* Update cmd/robots2policy/main.go

Co-authored-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>

* Update cmd/robots2policy/main.go

Co-authored-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>

* fix(robots2policy): use sigs.k8s.io/yaml

Signed-off-by: Xe Iaso <me@xeiaso.net>

* feat(config): properly marshal bot policy rules

Signed-off-by: Xe Iaso <me@xeiaso.net>

* chore(yeetfile): expose robots2policy in libexec

Signed-off-by: Xe Iaso <me@xeiaso.net>

* fix(yeetfile): put robots2policy in $PATH

Signed-off-by: Xe Iaso <me@xeiaso.net>

* Update metadata

check-spelling run (pull_request) for json/robots2policycli

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

* style: reorder imports

* refactor: use preexisting structs in config

* fix: correct flag check in main function

* fix: reorder fields in AnubisRule struct for better alignment

* style: improve alignment of struct fields in AnubisRule and OGTagCache

* Update metadata

check-spelling run (pull_request) for json/robots2policycli

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

* fix: add validation for generated Anubis rules from robots.txt

* feat: add batch processing for robots.txt files to generate Anubis CEL policies

* fix: improve usage message and error handling for input file requirement

* refactor: update AnubisRule structure to use ExpressionOrList for improved expression handling

* refactor: reorganize policy definitions in YAML files for consistency and clarity

* fix: correct indentation in blacklist and complex YAML files for consistency

* test: enhance output comparison in robots2policy tests for YAML and JSON formats

* Revert "fix: improve usage message and error handling for input file requirement"

This reverts commit ddcde1f2a326545d3ef2ec32e5e03f55f4f931a8.

* fix: improve usage message and error handling in robots2policy

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

---------

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>
Signed-off-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <git@jasoncameron.dev>
Co-authored-by: Xe Iaso <me@xeiaso.net>
This commit is contained in:
Jason Cameron 2025-06-14 23:41:00 -04:00 committed by GitHub
parent 7a195f1595
commit e0781e4560
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
28 changed files with 1302 additions and 27 deletions

View file

@ -0,0 +1,15 @@
# Test with blacklisted user agents
User-agent: *
Disallow: /admin
Crawl-delay: 10
User-agent: BadBot
Disallow: /
User-agent: SpamBot
Disallow: /
Crawl-delay: 60
User-agent: Googlebot
Disallow: /search
Crawl-delay: 5

View file

@ -0,0 +1,30 @@
- action: WEIGH
expression: "true"
name: robots-txt-policy-crawl-delay-1
weight:
adjust: 3
- action: CHALLENGE
expression: path.startsWith("/admin")
name: robots-txt-policy-disallow-2
- action: DENY
expression: userAgent.contains("BadBot")
name: robots-txt-policy-blacklist-3
- action: WEIGH
expression: userAgent.contains("SpamBot")
name: robots-txt-policy-crawl-delay-4
weight:
adjust: 3
- action: DENY
expression: userAgent.contains("SpamBot")
name: robots-txt-policy-blacklist-5
- action: WEIGH
expression: userAgent.contains("Googlebot")
name: robots-txt-policy-crawl-delay-6
weight:
adjust: 3
- action: CHALLENGE
expression:
all:
- userAgent.contains("Googlebot")
- path.startsWith("/search")
name: robots-txt-policy-disallow-7

View file

@ -0,0 +1,30 @@
# Complex real-world example
User-agent: *
Disallow: /admin/
Disallow: /private/
Disallow: /api/internal/
Allow: /api/public/
Crawl-delay: 5
User-agent: Googlebot
Disallow: /search/
Allow: /api/
Crawl-delay: 2
User-agent: Bingbot
Disallow: /search/
Disallow: /admin/
Crawl-delay: 10
User-agent: BadBot
Disallow: /
User-agent: SeoBot
Disallow: /
Crawl-delay: 300
# Test with various patterns
User-agent: TestBot
Disallow: /*/admin
Disallow: /temp*.html
Disallow: /file?.log

71
cmd/robots2policy/testdata/complex.yaml vendored Normal file
View file

@ -0,0 +1,71 @@
- action: WEIGH
expression: "true"
name: robots-txt-policy-crawl-delay-1
weight:
adjust: 5
- action: CHALLENGE
expression: path.startsWith("/admin/")
name: robots-txt-policy-disallow-2
- action: CHALLENGE
expression: path.startsWith("/private/")
name: robots-txt-policy-disallow-3
- action: CHALLENGE
expression: path.startsWith("/api/internal/")
name: robots-txt-policy-disallow-4
- action: WEIGH
expression: userAgent.contains("Googlebot")
name: robots-txt-policy-crawl-delay-5
weight:
adjust: 5
- action: CHALLENGE
expression:
all:
- userAgent.contains("Googlebot")
- path.startsWith("/search/")
name: robots-txt-policy-disallow-6
- action: WEIGH
expression: userAgent.contains("Bingbot")
name: robots-txt-policy-crawl-delay-7
weight:
adjust: 5
- action: CHALLENGE
expression:
all:
- userAgent.contains("Bingbot")
- path.startsWith("/search/")
name: robots-txt-policy-disallow-8
- action: CHALLENGE
expression:
all:
- userAgent.contains("Bingbot")
- path.startsWith("/admin/")
name: robots-txt-policy-disallow-9
- action: DENY
expression: userAgent.contains("BadBot")
name: robots-txt-policy-blacklist-10
- action: WEIGH
expression: userAgent.contains("SeoBot")
name: robots-txt-policy-crawl-delay-11
weight:
adjust: 5
- action: DENY
expression: userAgent.contains("SeoBot")
name: robots-txt-policy-blacklist-12
- action: CHALLENGE
expression:
all:
- userAgent.contains("TestBot")
- path.matches("^/.*/admin")
name: robots-txt-policy-disallow-13
- action: CHALLENGE
expression:
all:
- userAgent.contains("TestBot")
- path.matches("^/temp.*\\.html")
name: robots-txt-policy-disallow-14
- action: CHALLENGE
expression:
all:
- userAgent.contains("TestBot")
- path.matches("^/file.\\.log")
name: robots-txt-policy-disallow-15

View file

@ -0,0 +1,6 @@
- action: CHALLENGE
expression: path.startsWith("/admin/")
name: my-custom-policy-disallow-1
- action: CHALLENGE
expression: path.startsWith("/private")
name: my-custom-policy-disallow-2

View file

@ -0,0 +1,6 @@
- action: DENY
expression: path.startsWith("/admin/")
name: robots-txt-policy-disallow-1
- action: DENY
expression: path.startsWith("/private")
name: robots-txt-policy-disallow-2

View file

@ -0,0 +1,2 @@
# Empty robots.txt (comments only)
# No actual rules

1
cmd/robots2policy/testdata/empty.yaml vendored Normal file
View file

@ -0,0 +1 @@
[]

12
cmd/robots2policy/testdata/simple.json vendored Normal file
View file

@ -0,0 +1,12 @@
[
{
"action": "CHALLENGE",
"expression": "path.startsWith(\"/admin/\")",
"name": "robots-txt-policy-disallow-1"
},
{
"action": "CHALLENGE",
"expression": "path.startsWith(\"/private\")",
"name": "robots-txt-policy-disallow-2"
}
]

View file

@ -0,0 +1,5 @@
# Simple robots.txt test
User-agent: *
Disallow: /admin/
Disallow: /private
Allow: /public

View file

@ -0,0 +1,6 @@
- action: CHALLENGE
expression: path.startsWith("/admin/")
name: robots-txt-policy-disallow-1
- action: CHALLENGE
expression: path.startsWith("/private")
name: robots-txt-policy-disallow-2

View file

@ -0,0 +1,6 @@
# Test wildcard patterns
User-agent: *
Disallow: /search*
Disallow: /*/private
Disallow: /file?.txt
Disallow: /admin/*?action=delete

View file

@ -0,0 +1,12 @@
- action: CHALLENGE
expression: path.matches("^/search.*")
name: robots-txt-policy-disallow-1
- action: CHALLENGE
expression: path.matches("^/.*/private")
name: robots-txt-policy-disallow-2
- action: CHALLENGE
expression: path.matches("^/file.\\.txt")
name: robots-txt-policy-disallow-3
- action: CHALLENGE
expression: path.matches("^/admin/.*.action=delete")
name: robots-txt-policy-disallow-4