feat: add robots2policy CLI to convert robots.txt to Anubis CEL (#657)

* feat: add robots2policy CLI utility to convert robots.txt to Anubis challenge policies * feat: add documentation for robots2policy CLI tool * feat: implement crawl delay handling as weight adjustment in Anubis rules * feat: add various robots.txt and YAML configurations for user agent handling and crawl delays * test: add comprehensive tests for robots2policy conversion and parsing * fix: update example URL in usage instructions for robots2policy CLI * Update metadata check-spelling run (pull_request) for json/robots2policycli Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com> on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev> * docs: add crawl delay weight adjustment and deny user agents option to robots2policy CLI * Update cmd/robots2policy/main.go Co-authored-by: Xe Iaso <me@xeiaso.net> Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com> * Update cmd/robots2policy/main.go Co-authored-by: Xe Iaso <me@xeiaso.net> Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com> * fix(robots2policy): use sigs.k8s.io/yaml Signed-off-by: Xe Iaso <me@xeiaso.net> * feat(config): properly marshal bot policy rules Signed-off-by: Xe Iaso <me@xeiaso.net> * chore(yeetfile): expose robots2policy in libexec Signed-off-by: Xe Iaso <me@xeiaso.net> * fix(yeetfile): put robots2policy in $PATH Signed-off-by: Xe Iaso <me@xeiaso.net> * Update metadata check-spelling run (pull_request) for json/robots2policycli Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com> on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev> * style: reorder imports * refactor: use preexisting structs in config * fix: correct flag check in main function * fix: reorder fields in AnubisRule struct for better alignment * style: improve alignment of struct fields in AnubisRule and OGTagCache * Update metadata check-spelling run (pull_request) for json/robots2policycli Signed-off-by: check-spelling-bot 
<check-spelling-bot@users.noreply.github.com> on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev> * fix: add validation for generated Anubis rules from robots.txt * feat: add batch processing for robots.txt files to generate Anubis CEL policies * fix: improve usage message and error handling for input file requirement * refactor: update AnubisRule structure to use ExpressionOrList for improved expression handling * refactor: reorganize policy definitions in YAML files for consistency and clarity * fix: correct indentation in blacklist and complex YAML files for consistency * test: enhance output comparison in robots2policy tests for YAML and JSON formats * Revert "fix: improve usage message and error handling for input file requirement" This reverts commit ddcde1f2a326545d3ef2ec32e5e03f55f4f931a8. * fix: improve usage message and error handling in robots2policy Signed-off-by: Jason Cameron <git@jasoncameron.dev> --------- Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com> Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com> Signed-off-by: Xe Iaso <me@xeiaso.net> Signed-off-by: Jason Cameron <git@jasoncameron.dev> Co-authored-by: Xe Iaso <me@xeiaso.net>
2025-06-14 23:41:00 -04:00 · 2025-06-14 23:41:00 -04:00 · e0781e4560
commit e0781e4560
parent 7a195f1595
28 changed files with 1302 additions and 27 deletions
--- a/lib/policy/config/config.go
+++ b/lib/policy/config/config.go
@@ -46,15 +46,15 @@ const (
 const DefaultAlgorithm = "fast"

 // BotConfig is one bot policy rule entry. Name and Action are always written
 // when the rule is serialized; every other field carries omitempty and is left
 // out of JSON and YAML output when it is unset.
 type BotConfig struct {
-	UserAgentRegex *string           `json:"user_agent_regex,omitempty"`
-	PathRegex      *string           `json:"path_regex,omitempty"`
-	HeadersRegex   map[string]string `json:"headers_regex,omitempty"`
-	Expression     *ExpressionOrList `json:"expression,omitempty"`
-	Challenge      *ChallengeRules   `json:"challenge,omitempty"`
-	Weight         *Weight           `json:"weight,omitempty"`
-	Name           string            `json:"name"`
-	Action         Rule              `json:"action"`
-	RemoteAddr     []string          `json:"remote_addresses,omitempty"`
+	UserAgentRegex *string           `json:"user_agent_regex,omitempty" yaml:"user_agent_regex,omitempty"`
+	PathRegex      *string           `json:"path_regex,omitempty" yaml:"path_regex,omitempty"`
+	HeadersRegex   map[string]string `json:"headers_regex,omitempty" yaml:"headers_regex,omitempty"`
+	Expression     *ExpressionOrList `json:"expression,omitempty" yaml:"expression,omitempty"`
+	Challenge      *ChallengeRules   `json:"challenge,omitempty" yaml:"challenge,omitempty"`
+	Weight         *Weight           `json:"weight,omitempty" yaml:"weight,omitempty"`
+	Name           string            `json:"name" yaml:"name"`
+	Action         Rule              `json:"action" yaml:"action"`
+	RemoteAddr     []string          `json:"remote_addresses,omitempty" yaml:"remote_addresses,omitempty"`
 }

 func (b BotConfig) Zero() bool {
@@ -170,9 +170,9 @@ func (b *BotConfig) Valid() error {
 }

 // ChallengeRules groups the algorithm, difficulty and report_as settings of a
 // challenge. All three fields use omitempty, so zero values ("" or 0) are left
 // out of JSON and YAML output.
 type ChallengeRules struct {
-	Algorithm  string `json:"algorithm"`
-	Difficulty int    `json:"difficulty"`
-	ReportAs   int    `json:"report_as"`
+	Algorithm  string `json:"algorithm,omitempty" yaml:"algorithm,omitempty"`
+	Difficulty int    `json:"difficulty,omitempty" yaml:"difficulty,omitempty"`
+	ReportAs   int    `json:"report_as,omitempty" yaml:"report_as,omitempty"`
 }

 var (
--- a/lib/policy/config/expressionorlist.go
+++ b/lib/policy/config/expressionorlist.go
@@ -13,9 +13,9 @@ var (
 )

 // ExpressionOrList holds either a single expression string or lists of
 // expressions under the "all" and "any" keys. Expression is tagged "-" and is
 // therefore skipped by struct-tag based encoding; the type's custom marshal
 // methods emit it as a bare string instead.
 type ExpressionOrList struct {
-	Expression string   `json:"-"`
-	All        []string `json:"all,omitempty"`
-	Any        []string `json:"any,omitempty"`
+	Expression string   `json:"-" yaml:"-"`
+	All        []string `json:"all,omitempty" yaml:"all,omitempty"`
+	Any        []string `json:"any,omitempty" yaml:"any,omitempty"`
 }

 func (eol ExpressionOrList) Equal(rhs *ExpressionOrList) bool {
@@ -34,6 +34,43 @@ func (eol ExpressionOrList) Equal(rhs *ExpressionOrList) bool {
 	return true
 }

+// MarshalYAML returns the value a YAML encoder should emit for eol.
+//
+// A lone expression is returned as a plain string. That covers an explicit
+// Expression as well as an All or Any list holding exactly one entry while the
+// other list is empty; such a single-entry list takes precedence over
+// Expression. Every other shape is returned as a struct that encodes to a
+// mapping with "all" and/or "any" keys.
+//
+// The receiver is never modified: the single-entry collapse is applied to a
+// shallow copy, so marshaling has no side effects on the in-memory value.
+// A nil receiver yields a nil value and no error.
+func (eol *ExpressionOrList) MarshalYAML() (any, error) {
+	if eol == nil {
+		return nil, nil
+	}
+
+	// The local defined type has the same fields and tags but none of the
+	// methods, so returning it cannot re-enter MarshalYAML.
+	type RawExpressionOrList ExpressionOrList
+	raw := RawExpressionOrList(*eol)
+
+	switch {
+	case len(raw.All) == 1 && len(raw.Any) == 0:
+		raw.Expression, raw.All = raw.All[0], nil
+	case len(raw.Any) == 1 && len(raw.All) == 0:
+		raw.Expression, raw.Any = raw.Any[0], nil
+	}
+
+	if raw.Expression != "" {
+		return raw.Expression, nil
+	}
+
+	return raw, nil
+}
+
+// MarshalJSON encodes eol as JSON.
+//
+// A lone expression is encoded as a JSON string. That covers an explicit
+// Expression as well as an All or Any list holding exactly one entry while the
+// other list is empty; such a single-entry list takes precedence over
+// Expression. Every other shape is encoded as an object with "all" and/or
+// "any" keys.
+//
+// The receiver is never modified: the single-entry collapse is applied to a
+// shallow copy, so marshaling has no side effects on the in-memory value.
+// A nil receiver encodes as null.
+func (eol *ExpressionOrList) MarshalJSON() ([]byte, error) {
+	if eol == nil {
+		return []byte("null"), nil
+	}
+
+	// The local defined type has the same fields and tags but none of the
+	// methods, so json.Marshal on it cannot re-enter MarshalJSON.
+	type RawExpressionOrList ExpressionOrList
+	raw := RawExpressionOrList(*eol)
+
+	switch {
+	case len(raw.All) == 1 && len(raw.Any) == 0:
+		raw.Expression, raw.All = raw.All[0], nil
+	case len(raw.Any) == 1 && len(raw.All) == 0:
+		raw.Expression, raw.Any = raw.Any[0], nil
+	}
+
+	if raw.Expression != "" {
+		return json.Marshal(raw.Expression)
+	}
+
+	return json.Marshal(raw)
+}
+
 func (eol *ExpressionOrList) UnmarshalJSON(data []byte) error {
 	switch string(data[0]) {
 	case `"`: // string
--- a/lib/policy/config/expressionorlist_test.go
+++ b/lib/policy/config/expressionorlist_test.go
@@ -1,12 +1,147 @@
 package config

 import (
+	"bytes"
 	"encoding/json"
 	"errors"
 	"testing"
+
+	yaml "sigs.k8s.io/yaml/goyaml.v3"
 )

-func TestExpressionOrListUnmarshal(t *testing.T) {
+func TestExpressionOrListMarshalJSON(t *testing.T) {
+	for _, tt := range []struct {
+		name   string
+		input  *ExpressionOrList
+		output []byte
+		err    error
+	}{
+		{
+			name: "single expression",
+			input: &ExpressionOrList{
+				Expression: "true",
+			},
+			output: []byte(`"true"`),
+			err:    nil,
+		},
+		{
+			name: "all",
+			input: &ExpressionOrList{
+				All: []string{"true", "true"},
+			},
+			output: []byte(`{"all":["true","true"]}`),
+			err:    nil,
+		},
+		{
+			name: "all one",
+			input: &ExpressionOrList{
+				All: []string{"true"},
+			},
+			output: []byte(`"true"`),
+			err:    nil,
+		},
+		{
+			name: "any",
+			input: &ExpressionOrList{
+				Any: []string{"true", "false"},
+			},
+			output: []byte(`{"any":["true","false"]}`),
+			err:    nil,
+		},
+		{
+			name: "any one",
+			input: &ExpressionOrList{
+				Any: []string{"true"},
+			},
+			output: []byte(`"true"`),
+			err:    nil,
+		},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := json.Marshal(tt.input)
+			if !errors.Is(err, tt.err) {
+				t.Errorf("wanted marshal error: %v but got: %v", tt.err, err)
+			}
+
+			if !bytes.Equal(result, tt.output) {
+				t.Logf("wanted: %s", string(tt.output))
+				t.Logf("got:    %s", string(result))
+				t.Error("mismatched output")
+			}
+		})
+	}
+}
+
+func TestExpressionOrListMarshalYAML(t *testing.T) {
+	for _, tt := range []struct {
+		name   string
+		input  *ExpressionOrList
+		output []byte
+		err    error
+	}{
+		{
+			name: "single expression",
+			input: &ExpressionOrList{
+				Expression: "true",
+			},
+			output: []byte(`"true"`),
+			err:    nil,
+		},
+		{
+			name: "all",
+			input: &ExpressionOrList{
+				All: []string{"true", "true"},
+			},
+			output: []byte(`all:
+    - "true"
+    - "true"`),
+			err: nil,
+		},
+		{
+			name: "all one",
+			input: &ExpressionOrList{
+				All: []string{"true"},
+			},
+			output: []byte(`"true"`),
+			err:    nil,
+		},
+		{
+			name: "any",
+			input: &ExpressionOrList{
+				Any: []string{"true", "false"},
+			},
+			output: []byte(`any:
+    - "true"
+    - "false"`),
+			err: nil,
+		},
+		{
+			name: "any one",
+			input: &ExpressionOrList{
+				Any: []string{"true"},
+			},
+			output: []byte(`"true"`),
+			err:    nil,
+		},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := yaml.Marshal(tt.input)
+			if !errors.Is(err, tt.err) {
+				t.Errorf("wanted marshal error: %v but got: %v", tt.err, err)
+			}
+
+			result = bytes.TrimSpace(result)
+
+			if !bytes.Equal(result, tt.output) {
+				t.Logf("wanted: %q", string(tt.output))
+				t.Logf("got:    %q", string(result))
+				t.Error("mismatched output")
+			}
+		})
+	}
+}
+
+func TestExpressionOrListUnmarshalJSON(t *testing.T) {
 	for _, tt := range []struct {
 		err      error
 		validErr error
--- a/lib/policy/config/weight.go
+++ b/lib/policy/config/weight.go
@@ -1,5 +1,5 @@
 package config

 // Weight wraps a single integer adjustment, serialized under the "adjust" key
 // in both JSON and YAML.
 type Weight struct {
-	Adjust int `json:"adjust"`
+	Adjust int `json:"adjust" yaml:"adjust"`
 }