feat: add robots2policy CLI to convert robots.txt to Anubis CEL (#657)

* feat: add robots2policy CLI utility to convert robots.txt to Anubis challenge policies * feat: add documentation for robots2policy CLI tool * feat: implement crawl delay handling as weight adjustment in Anubis rules * feat: add various robots.txt and YAML configurations for user agent handling and crawl delays * test: add comprehensive tests for robots2policy conversion and parsing * fix: update example URL in usage instructions for robots2policy CLI * Update metadata check-spelling run (pull_request) for json/robots2policycli Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com> on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev> * docs: add crawl delay weight adjustment and deny user agents option to robots2policy CLI * Update cmd/robots2policy/main.go Co-authored-by: Xe Iaso <me@xeiaso.net> Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com> * Update cmd/robots2policy/main.go Co-authored-by: Xe Iaso <me@xeiaso.net> Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com> * fix(robots2policy): use sigs.k8s.io/yaml Signed-off-by: Xe Iaso <me@xeiaso.net> * feat(config): properly marshal bot policy rules Signed-off-by: Xe Iaso <me@xeiaso.net> * chore(yeetfile): expose robots2policy in libexec Signed-off-by: Xe Iaso <me@xeiaso.net> * fix(yeetfile): put robots2policy in $PATH Signed-off-by: Xe Iaso <me@xeiaso.net> * Update metadata check-spelling run (pull_request) for json/robots2policycli Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com> on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev> * style: reorder imports * refactor: use preexisting structs in config * fix: correct flag check in main function * fix: reorder fields in AnubisRule struct for better alignment * style: improve alignment of struct fields in AnubisRule and OGTagCache * Update metadata check-spelling run (pull_request) for json/robots2policycli Signed-off-by: check-spelling-bot 
<check-spelling-bot@users.noreply.github.com> on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev> * fix: add validation for generated Anubis rules from robots.txt * feat: add batch processing for robots.txt files to generate Anubis CEL policies * fix: improve usage message and error handling for input file requirement * refactor: update AnubisRule structure to use ExpressionOrList for improved expression handling * refactor: reorganize policy definitions in YAML files for consistency and clarity * fix: correct indentation in blacklist and complex YAML files for consistency * test: enhance output comparison in robots2policy tests for YAML and JSON formats * Revert "fix: improve usage message and error handling for input file requirement" This reverts commit ddcde1f2a326545d3ef2ec32e5e03f55f4f931a8. * fix: improve usage message and error handling in robots2policy Signed-off-by: Jason Cameron <git@jasoncameron.dev> --------- Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com> Signed-off-by: Jason Cameron <jasoncameron.all@gmail.com> Signed-off-by: Xe Iaso <me@xeiaso.net> Signed-off-by: Jason Cameron <git@jasoncameron.dev> Co-authored-by: Xe Iaso <me@xeiaso.net>
2025-06-14 23:41:00 -04:00 · 2025-06-14 23:41:00 -04:00 · e0781e4560
commit e0781e4560
parent 7a195f1595
28 changed files with 1302 additions and 27 deletions
--- a/cmd/robots2policy/robots2policy_test.go
+++ b/cmd/robots2policy/robots2policy_test.go
@@ -0,0 +1,418 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"reflect"
+	"strings"
+	"testing"
+
+	"gopkg.in/yaml.v3"
+)
+
+// Fixture types for the table-driven golden-file conversion test.
+type (
+	// TestCase pairs a robots.txt fixture with the golden file that its
+	// conversion is compared against.
+	TestCase struct {
+		name         string      // subtest name passed to t.Run
+		robotsFile   string      // input file name, joined onto testdata/
+		expectedFile string      // golden output file name, joined onto testdata/
+		options      TestOptions // flag overrides applied for this case
+	}
+
+	// TestOptions carries per-case overrides for the package-level flag
+	// values. Empty strings and non-positive weights leave the
+	// corresponding flag at its current value.
+	TestOptions struct {
+		format           string // written to *outputFormat
+		action           string // written to *baseAction
+		crawlDelayWeight int    // written to *crawlDelay
+		policyName       string // written to *policyName
+		deniedAction     string // written to *userAgentDeny
+	}
+)
+
+// TestDataFileConversion converts each robots.txt fixture under testdata/
+// with the case's flag overrides applied, marshals the resulting rules in the
+// requested format, and checks that the marshaled bytes decode to the same
+// data as the matching golden file.
+func TestDataFileConversion(t *testing.T) {
+	testCases := []TestCase{
+		{
+			name:         "simple_default",
+			robotsFile:   "simple.robots.txt",
+			expectedFile: "simple.yaml",
+			options:      TestOptions{format: "yaml"},
+		},
+		{
+			name:         "simple_json",
+			robotsFile:   "simple.robots.txt",
+			expectedFile: "simple.json",
+			options:      TestOptions{format: "json"},
+		},
+		{
+			name:         "simple_deny_action",
+			robotsFile:   "simple.robots.txt",
+			expectedFile: "deny-action.yaml",
+			options:      TestOptions{format: "yaml", action: "DENY"},
+		},
+		{
+			name:         "simple_custom_name",
+			robotsFile:   "simple.robots.txt",
+			expectedFile: "custom-name.yaml",
+			options:      TestOptions{format: "yaml", policyName: "my-custom-policy"},
+		},
+		{
+			name:         "blacklist_with_crawl_delay",
+			robotsFile:   "blacklist.robots.txt",
+			expectedFile: "blacklist.yaml",
+			options:      TestOptions{format: "yaml", crawlDelayWeight: 3},
+		},
+		{
+			name:         "wildcards",
+			robotsFile:   "wildcards.robots.txt",
+			expectedFile: "wildcards.yaml",
+			options:      TestOptions{format: "yaml"},
+		},
+		{
+			name:         "empty_file",
+			robotsFile:   "empty.robots.txt",
+			expectedFile: "empty.yaml",
+			options:      TestOptions{format: "yaml"},
+		},
+		{
+			name:         "complex_scenario",
+			robotsFile:   "complex.robots.txt",
+			expectedFile: "complex.yaml",
+			options:      TestOptions{format: "yaml", crawlDelayWeight: 5},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			robotsPath := filepath.Join("testdata", tc.robotsFile)
+			expectedPath := filepath.Join("testdata", tc.expectedFile)
+
+			// Read robots.txt input
+			robotsFile, err := os.Open(robotsPath)
+			if err != nil {
+				t.Fatalf("Failed to open robots file %s: %v", robotsPath, err)
+			}
+			defer robotsFile.Close()
+
+			// Parse robots.txt
+			rules, err := parseRobotsTxt(robotsFile)
+			if err != nil {
+				t.Fatalf("Failed to parse robots.txt: %v", err)
+			}
+
+			// Options are applied by overwriting the package-level flag
+			// values, so remember the current ones and put them back when
+			// this subtest returns (also on t.Fatalf).
+			oldFormat := *outputFormat
+			oldAction := *baseAction
+			oldCrawlDelay := *crawlDelay
+			oldPolicyName := *policyName
+			oldDeniedAction := *userAgentDeny
+			defer func() {
+				*outputFormat = oldFormat
+				*baseAction = oldAction
+				*crawlDelay = oldCrawlDelay
+				*policyName = oldPolicyName
+				*userAgentDeny = oldDeniedAction
+			}()
+
+			// Only non-zero options override the current flag value.
+			if tc.options.format != "" {
+				*outputFormat = tc.options.format
+			}
+			if tc.options.action != "" {
+				*baseAction = tc.options.action
+			}
+			if tc.options.crawlDelayWeight > 0 {
+				*crawlDelay = tc.options.crawlDelayWeight
+			}
+			if tc.options.policyName != "" {
+				*policyName = tc.options.policyName
+			}
+			if tc.options.deniedAction != "" {
+				*userAgentDeny = tc.options.deniedAction
+			}
+
+			// Convert to Anubis rules
+			anubisRules := convertToAnubisRules(rules)
+
+			// Pick the encoder/decoder pair once so both formats share a
+			// single comparison path. label keeps the decode failure
+			// messages distinct per format.
+			var (
+				marshal   func(interface{}) ([]byte, error)
+				unmarshal func([]byte, interface{}) error
+				label     string
+			)
+			switch format := strings.ToLower(*outputFormat); format {
+			case "yaml":
+				marshal, unmarshal = yaml.Marshal, yaml.Unmarshal
+			case "json":
+				marshal = func(v interface{}) ([]byte, error) {
+					return json.MarshalIndent(v, "", "  ")
+				}
+				unmarshal, label = json.Unmarshal, "JSON "
+			default:
+				// Fail fast instead of comparing against output that was
+				// never generated.
+				t.Fatalf("Unsupported output format %q", format)
+			}
+
+			// Generate output
+			actualOutput, err := marshal(anubisRules)
+			if err != nil {
+				t.Fatalf("Failed to marshal output: %v", err)
+			}
+
+			// Read expected output
+			expectedOutput, err := os.ReadFile(expectedPath)
+			if err != nil {
+				t.Fatalf("Failed to read expected file %s: %v", expectedPath, err)
+			}
+
+			// Decode both sides with the same unmarshaler so that formatting
+			// differences in the files do not affect the comparison.
+			var actualData, expectedData []interface{}
+			if err := unmarshal(actualOutput, &actualData); err != nil {
+				t.Fatalf("Failed to unmarshal actual %soutput: %v", label, err)
+			}
+			if err := unmarshal(expectedOutput, &expectedData); err != nil {
+				t.Fatalf("Failed to unmarshal expected %soutput: %v", label, err)
+			}
+
+			// Compare data structures
+			if !compareData(actualData, expectedData) {
+				actualStr := strings.TrimSpace(string(actualOutput))
+				expectedStr := strings.TrimSpace(string(expectedOutput))
+				t.Errorf("Output mismatch for %s\nExpected:\n%s\n\nActual:\n%s", tc.name, expectedStr, actualStr)
+			}
+		})
+	}
+}
+
+// TestCaseInsensitiveParsing feeds the parser a robots.txt whose directive
+// names are written in mixed, lower, and upper case, and checks that three
+// rules come back with the crawl delay given in each group.
+func TestCaseInsensitiveParsing(t *testing.T) {
+	robotsTxt := `User-Agent: *
+Disallow: /admin
+Crawl-Delay: 10
+
+User-agent: TestBot
+disallow: /test
+crawl-delay: 5
+
+USER-AGENT: UpperBot
+DISALLOW: /upper
+CRAWL-DELAY: 20`
+
+	reader := strings.NewReader(robotsTxt)
+	rules, err := parseRobotsTxt(reader)
+	if err != nil {
+		t.Fatalf("Failed to parse case-insensitive robots.txt: %v", err)
+	}
+
+	// One expected crawl delay per user-agent group, in file order.
+	expectedDelays := []int{10, 5, 20}
+
+	// Stop on a count mismatch: the loop below indexes expectedDelays by rule
+	// position and would panic if the parser returned extra rules.
+	if len(rules) != len(expectedDelays) {
+		t.Fatalf("Expected %d rules, got %d", len(expectedDelays), len(rules))
+	}
+
+	// Check that all crawl delays were parsed
+	for i, rule := range rules {
+		if rule.CrawlDelay != expectedDelays[i] {
+			t.Errorf("Rule %d: expected crawl delay %d, got %d", i, expectedDelays[i], rule.CrawlDelay)
+		}
+	}
+}
+
+func TestVariousOutputFormats(t *testing.T) {
+	robotsTxt := `User-agent: *
+Disallow: /admin`
+
+	reader := strings.NewReader(robotsTxt)
+	rules, err := parseRobotsTxt(reader)
+	if err != nil {
+		t.Fatalf("Failed to parse robots.txt: %v", err)
+	}
+
+	oldPolicyName := *policyName
+	*policyName = "test-policy"
+	defer func() { *policyName = oldPolicyName }()
+
+	anubisRules := convertToAnubisRules(rules)
+
+	// Test YAML output
+	yamlOutput, err := yaml.Marshal(anubisRules)
+	if err != nil {
+		t.Fatalf("Failed to marshal YAML: %v", err)
+	}
+
+	if !strings.Contains(string(yamlOutput), "name: test-policy-disallow-1") {
+		t.Errorf("YAML output doesn't contain expected rule name")
+	}
+
+	// Test JSON output
+	jsonOutput, err := json.MarshalIndent(anubisRules, "", "  ")
+	if err != nil {
+		t.Fatalf("Failed to marshal JSON: %v", err)
+	}
+
+	if !strings.Contains(string(jsonOutput), `"name": "test-policy-disallow-1"`) {
+		t.Errorf("JSON output doesn't contain expected rule name")
+	}
+}
+
+func TestDifferentActions(t *testing.T) {
+	robotsTxt := `User-agent: *
+Disallow: /admin`
+
+	testActions := []string{"ALLOW", "DENY", "CHALLENGE", "WEIGH"}
+
+	for _, action := range testActions {
+		t.Run("action_"+action, func(t *testing.T) {
+			reader := strings.NewReader(robotsTxt)
+			rules, err := parseRobotsTxt(reader)
+			if err != nil {
+				t.Fatalf("Failed to parse robots.txt: %v", err)
+			}
+
+			oldAction := *baseAction
+			*baseAction = action
+			defer func() { *baseAction = oldAction }()
+
+			anubisRules := convertToAnubisRules(rules)
+
+			if len(anubisRules) != 1 {
+				t.Fatalf("Expected 1 rule, got %d", len(anubisRules))
+			}
+
+			if anubisRules[0].Action != action {
+				t.Errorf("Expected action %s, got %s", action, anubisRules[0].Action)
+			}
+		})
+	}
+}
+
+func TestPolicyNaming(t *testing.T) {
+	robotsTxt := `User-agent: *
+Disallow: /admin
+Disallow: /private
+
+User-agent: BadBot
+Disallow: /`
+
+	testNames := []string{"custom-policy", "my-rules", "site-protection"}
+
+	for _, name := range testNames {
+		t.Run("name_"+name, func(t *testing.T) {
+			reader := strings.NewReader(robotsTxt)
+			rules, err := parseRobotsTxt(reader)
+			if err != nil {
+				t.Fatalf("Failed to parse robots.txt: %v", err)
+			}
+
+			oldName := *policyName
+			*policyName = name
+			defer func() { *policyName = oldName }()
+
+			anubisRules := convertToAnubisRules(rules)
+
+			// Check that all rule names use the custom prefix
+			for _, rule := range anubisRules {
+				if !strings.HasPrefix(rule.Name, name+"-") {
+					t.Errorf("Rule name %s doesn't start with expected prefix %s-", rule.Name, name)
+				}
+			}
+		})
+	}
+}
+
+func TestCrawlDelayWeights(t *testing.T) {
+	robotsTxt := `User-agent: *
+Disallow: /admin
+Crawl-delay: 10
+
+User-agent: SlowBot
+Disallow: /slow
+Crawl-delay: 60`
+
+	testWeights := []int{1, 5, 10, 25}
+
+	for _, weight := range testWeights {
+		t.Run(fmt.Sprintf("weight_%d", weight), func(t *testing.T) {
+			reader := strings.NewReader(robotsTxt)
+			rules, err := parseRobotsTxt(reader)
+			if err != nil {
+				t.Fatalf("Failed to parse robots.txt: %v", err)
+			}
+
+			oldWeight := *crawlDelay
+			*crawlDelay = weight
+			defer func() { *crawlDelay = oldWeight }()
+
+			anubisRules := convertToAnubisRules(rules)
+
+			// Count weight rules and verify they have correct weight
+			weightRules := 0
+			for _, rule := range anubisRules {
+				if rule.Action == "WEIGH" && rule.Weight != nil {
+					weightRules++
+					if rule.Weight.Adjust != weight {
+						t.Errorf("Expected weight %d, got %d", weight, rule.Weight.Adjust)
+					}
+				}
+			}
+
+			expectedWeightRules := 2 // One for *, one for SlowBot
+			if weightRules != expectedWeightRules {
+				t.Errorf("Expected %d weight rules, got %d", expectedWeightRules, weightRules)
+			}
+		})
+	}
+}
+
+func TestBlacklistActions(t *testing.T) {
+	robotsTxt := `User-agent: BadBot
+Disallow: /
+
+User-agent: SpamBot
+Disallow: /`
+
+	testActions := []string{"DENY", "CHALLENGE"}
+
+	for _, action := range testActions {
+		t.Run("blacklist_"+action, func(t *testing.T) {
+			reader := strings.NewReader(robotsTxt)
+			rules, err := parseRobotsTxt(reader)
+			if err != nil {
+				t.Fatalf("Failed to parse robots.txt: %v", err)
+			}
+
+			oldAction := *userAgentDeny
+			*userAgentDeny = action
+			defer func() { *userAgentDeny = oldAction }()
+
+			anubisRules := convertToAnubisRules(rules)
+
+			// All rules should be blacklist rules with the specified action
+			for _, rule := range anubisRules {
+				if !strings.Contains(rule.Name, "blacklist") {
+					t.Errorf("Expected blacklist rule, got %s", rule.Name)
+				}
+				if rule.Action != action {
+					t.Errorf("Expected action %s, got %s", action, rule.Action)
+				}
+			}
+		})
+	}
+}
+
+// compareData reports whether actual and expected are deeply equal as
+// defined by reflect.DeepEqual. The comparison is strict: no YAML/JSON
+// normalization is performed, so values that differ only in dynamic type
+// (for example int 1 versus float64 1) are reported as different. Decode
+// both sides with the same unmarshaler before comparing so that equal
+// documents produce equal Go values.
+func compareData(actual, expected interface{}) bool {
+	return reflect.DeepEqual(actual, expected)
+}