feat: add robots2policy CLI to convert robots.txt to Anubis CEL (#657)

* feat: add robots2policy CLI utility to convert robots.txt to Anubis challenge policies

* feat: add documentation for robots2policy CLI tool

* feat: implement crawl delay handling as weight adjustment in Anubis rules

* feat: add various robots.txt and YAML configurations for user agent handling and crawl delays

* test: add comprehensive tests for robots2policy conversion and parsing

* fix: update example URL in usage instructions for robots2policy CLI

* Update metadata

check-spelling run (pull_request) for json/robots2policycli

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

* docs: add crawl delay weight adjustment and deny user agents option to robots2policy CLI

* Update cmd/robots2policy/main.go

Co-authored-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>

* Update cmd/robots2policy/main.go

Co-authored-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>

* fix(robots2policy): use sigs.k8s.io/yaml

Signed-off-by: Xe Iaso <me@xeiaso.net>

* feat(config): properly marshal bot policy rules

Signed-off-by: Xe Iaso <me@xeiaso.net>

* chore(yeetfile): expose robots2policy in libexec

Signed-off-by: Xe Iaso <me@xeiaso.net>

* fix(yeetfile): put robots2policy in $PATH

Signed-off-by: Xe Iaso <me@xeiaso.net>

* Update metadata

check-spelling run (pull_request) for json/robots2policycli

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

* style: reorder imports

* refactor: use preexisting structs in config

* fix: correct flag check in main function

* fix: reorder fields in AnubisRule struct for better alignment

* style: improve alignment of struct fields in AnubisRule and OGTagCache

* Update metadata

check-spelling run (pull_request) for json/robots2policycli

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

* fix: add validation for generated Anubis rules from robots.txt

* feat: add batch processing for robots.txt files to generate Anubis CEL policies

* fix: improve usage message and error handling for input file requirement

* refactor: update AnubisRule structure to use ExpressionOrList for improved expression handling

* refactor: reorganize policy definitions in YAML files for consistency and clarity

* fix: correct indentation in blacklist and complex YAML files for consistency

* test: enhance output comparison in robots2policy tests for YAML and JSON formats

* Revert "fix: improve usage message and error handling for input file requirement"

This reverts commit ddcde1f2a326545d3ef2ec32e5e03f55f4f931a8.

* fix: improve usage message and error handling in robots2policy

Signed-off-by: Jason Cameron <git@jasoncameron.dev>

---------

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com>
Signed-off-by: Xe Iaso <me@xeiaso.net>
Signed-off-by: Jason Cameron <git@jasoncameron.dev>
Co-authored-by: Xe Iaso <me@xeiaso.net>
This commit is contained in:
Jason Cameron 2025-06-14 23:41:00 -04:00 committed by GitHub
parent 7a195f1595
commit e0781e4560
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
28 changed files with 1302 additions and 27 deletions

View file

@ -46,15 +46,15 @@ const (
const DefaultAlgorithm = "fast"
type BotConfig struct {
UserAgentRegex *string `json:"user_agent_regex,omitempty"`
PathRegex *string `json:"path_regex,omitempty"`
HeadersRegex map[string]string `json:"headers_regex,omitempty"`
Expression *ExpressionOrList `json:"expression,omitempty"`
Challenge *ChallengeRules `json:"challenge,omitempty"`
Weight *Weight `json:"weight,omitempty"`
Name string `json:"name"`
Action Rule `json:"action"`
RemoteAddr []string `json:"remote_addresses,omitempty"`
UserAgentRegex *string `json:"user_agent_regex,omitempty" yaml:"user_agent_regex,omitempty"`
PathRegex *string `json:"path_regex,omitempty" yaml:"path_regex,omitempty"`
HeadersRegex map[string]string `json:"headers_regex,omitempty" yaml:"headers_regex,omitempty"`
Expression *ExpressionOrList `json:"expression,omitempty" yaml:"expression,omitempty"`
Challenge *ChallengeRules `json:"challenge,omitempty" yaml:"challenge,omitempty"`
Weight *Weight `json:"weight,omitempty" yaml:"weight,omitempty"`
Name string `json:"name" yaml:"name"`
Action Rule `json:"action" yaml:"action"`
RemoteAddr []string `json:"remote_addresses,omitempty" yaml:"remote_addresses,omitempty"`
}
func (b BotConfig) Zero() bool {
@ -170,9 +170,9 @@ func (b *BotConfig) Valid() error {
}
type ChallengeRules struct {
Algorithm string `json:"algorithm"`
Difficulty int `json:"difficulty"`
ReportAs int `json:"report_as"`
Algorithm string `json:"algorithm,omitempty" yaml:"algorithm,omitempty"`
Difficulty int `json:"difficulty,omitempty" yaml:"difficulty,omitempty"`
ReportAs int `json:"report_as,omitempty" yaml:"report_as,omitempty"`
}
var (

View file

@ -13,9 +13,9 @@ var (
)
type ExpressionOrList struct {
Expression string `json:"-"`
All []string `json:"all,omitempty"`
Any []string `json:"any,omitempty"`
Expression string `json:"-" yaml:"-"`
All []string `json:"all,omitempty" yaml:"all,omitempty"`
Any []string `json:"any,omitempty" yaml:"any,omitempty"`
}
func (eol ExpressionOrList) Equal(rhs *ExpressionOrList) bool {
@ -34,6 +34,43 @@ func (eol ExpressionOrList) Equal(rhs *ExpressionOrList) bool {
return true
}
func (eol *ExpressionOrList) MarshalYAML() (any, error) {
switch {
case len(eol.All) == 1 && len(eol.Any) == 0:
eol.Expression = eol.All[0]
eol.All = nil
case len(eol.Any) == 1 && len(eol.All) == 0:
eol.Expression = eol.Any[0]
eol.Any = nil
}
if eol.Expression != "" {
return eol.Expression, nil
}
type RawExpressionOrList ExpressionOrList
return RawExpressionOrList(*eol), nil
}
func (eol *ExpressionOrList) MarshalJSON() ([]byte, error) {
switch {
case len(eol.All) == 1 && len(eol.Any) == 0:
eol.Expression = eol.All[0]
eol.All = nil
case len(eol.Any) == 1 && len(eol.All) == 0:
eol.Expression = eol.Any[0]
eol.Any = nil
}
if eol.Expression != "" {
return json.Marshal(string(eol.Expression))
}
type RawExpressionOrList ExpressionOrList
val := RawExpressionOrList(*eol)
return json.Marshal(val)
}
func (eol *ExpressionOrList) UnmarshalJSON(data []byte) error {
switch string(data[0]) {
case `"`: // string

View file

@ -1,12 +1,147 @@
package config
import (
"bytes"
"encoding/json"
"errors"
"testing"
yaml "sigs.k8s.io/yaml/goyaml.v3"
)
func TestExpressionOrListUnmarshal(t *testing.T) {
func TestExpressionOrListMarshalJSON(t *testing.T) {
for _, tt := range []struct {
name string
input *ExpressionOrList
output []byte
err error
}{
{
name: "single expression",
input: &ExpressionOrList{
Expression: "true",
},
output: []byte(`"true"`),
err: nil,
},
{
name: "all",
input: &ExpressionOrList{
All: []string{"true", "true"},
},
output: []byte(`{"all":["true","true"]}`),
err: nil,
},
{
name: "all one",
input: &ExpressionOrList{
All: []string{"true"},
},
output: []byte(`"true"`),
err: nil,
},
{
name: "any",
input: &ExpressionOrList{
Any: []string{"true", "false"},
},
output: []byte(`{"any":["true","false"]}`),
err: nil,
},
{
name: "any one",
input: &ExpressionOrList{
Any: []string{"true"},
},
output: []byte(`"true"`),
err: nil,
},
} {
t.Run(tt.name, func(t *testing.T) {
result, err := json.Marshal(tt.input)
if !errors.Is(err, tt.err) {
t.Errorf("wanted marshal error: %v but got: %v", tt.err, err)
}
if !bytes.Equal(result, tt.output) {
t.Logf("wanted: %s", string(tt.output))
t.Logf("got: %s", string(result))
t.Error("mismatched output")
}
})
}
}
func TestExpressionOrListMarshalYAML(t *testing.T) {
for _, tt := range []struct {
name string
input *ExpressionOrList
output []byte
err error
}{
{
name: "single expression",
input: &ExpressionOrList{
Expression: "true",
},
output: []byte(`"true"`),
err: nil,
},
{
name: "all",
input: &ExpressionOrList{
All: []string{"true", "true"},
},
output: []byte(`all:
- "true"
- "true"`),
err: nil,
},
{
name: "all one",
input: &ExpressionOrList{
All: []string{"true"},
},
output: []byte(`"true"`),
err: nil,
},
{
name: "any",
input: &ExpressionOrList{
Any: []string{"true", "false"},
},
output: []byte(`any:
- "true"
- "false"`),
err: nil,
},
{
name: "any one",
input: &ExpressionOrList{
Any: []string{"true"},
},
output: []byte(`"true"`),
err: nil,
},
} {
t.Run(tt.name, func(t *testing.T) {
result, err := yaml.Marshal(tt.input)
if !errors.Is(err, tt.err) {
t.Errorf("wanted marshal error: %v but got: %v", tt.err, err)
}
result = bytes.TrimSpace(result)
if !bytes.Equal(result, tt.output) {
t.Logf("wanted: %q", string(tt.output))
t.Logf("got: %q", string(result))
t.Error("mismatched output")
}
})
}
}
func TestExpressionOrListUnmarshalJSON(t *testing.T) {
for _, tt := range []struct {
err error
validErr error

View file

@ -1,5 +1,5 @@
package config
type Weight struct {
Adjust int `json:"adjust"`
Adjust int `json:"adjust" yaml:"adjust"`
}