perf: Replace internal SHA256 hashing with xxhash for 4-6x performance improvement (#676)

* perf(internal): Use FastHash for internal hashing docs: Add xxhash performance improvement to changelog entry feat(hash): Add fast non-cryptographic hash function Signed-off-by: Jason Cameron <git@jasoncameron.dev> * test(hash): add xxhash benchmarks and collision tests Signed-off-by: Jason Cameron <git@jasoncameron.dev> * Update metadata check-spelling run (pull_request) for json/hash Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com> on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev> --------- Signed-off-by: Jason Cameron <git@jasoncameron.dev> Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
2025-06-16 22:53:53 -04:00 · 2025-06-16 22:53:53 -04:00 · e2b46fc5e7
commit e2b46fc5e7
parent 3437e575d4
11 changed files with 291 additions and 16 deletions
--- a/internal/hash_bench_test.go
+++ b/internal/hash_bench_test.go
@@ -0,0 +1,261 @@
+package internal
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+)
+
+// XXHash64sum forwards to FastHash under an xxhash-specific name so the
+// benchmarks and tests in this file can be read side by side with SHA256sum.
+func XXHash64sum(text string) string {
+	digest := FastHash(text)
+	return digest
+}
+
+// Shared fixtures for the benchmarks and tests in this file, chosen to match real usage patterns in the codebase.
+var (
+	// Typical policy checker inputs: User-Agent header lines, path patterns, and CIDR ranges.
+	policyInputs = []string{
+		"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+		"User-Agent: bot/1.0",
+		"User-Agent: GoogleBot/2.1",
+		"/robots.txt",
+		"/api/.*",
+		"10.0.0.0/8",
+		"192.168.1.0/24",
+		"172.16.0.0/12",
+	}
+
+	// Challenge data from challengeFor function (comma-separated key=value pairs).
+	challengeInputs = []string{
+		"Accept-Language=en-US,X-Real-IP=192.168.1.100,User-Agent=Mozilla/5.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=abc123,Difficulty=5",
+		"Accept-Language=fr-FR,X-Real-IP=10.0.0.50,User-Agent=Chrome/91.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=def456,Difficulty=3",
+		"Accept-Language=es-ES,X-Real-IP=172.16.1.1,User-Agent=Safari/14.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=ghi789,Difficulty=7",
+	}
+
+	// Bot rule patterns of the form "<name>::<kind>:<value>".
+	botRuleInputs = []string{
+		"GoogleBot::path:/robots.txt",
+		"BingBot::useragent:Mozilla/5.0 (compatible; bingbot/2.0)",
+		"FacebookBot::headers:Accept-Language,User-Agent",
+		"TwitterBot::cidr:192.168.1.0/24",
+	}
+
+	// CEL expressions from policy rules, written as raw string literals so the embedded quotes need no escaping.
+	celInputs = []string{
+		`request.headers["User-Agent"].contains("bot")`,
+		`request.path.startsWith("/api/") && request.method == "POST"`,
+		`request.remoteAddress in ["192.168.1.0/24", "10.0.0.0/8"]`,
+		`request.userAgent.matches(".*[Bb]ot.*") || request.userAgent.matches(".*[Cc]rawler.*")`,
+	}
+
+	// Thoth ASN checker inputs: an "ASNChecker" header line followed by newline-separated "AS <number>" entries.
+	asnInputs = []string{
+		"ASNChecker\nAS 15169\nAS 8075\nAS 32934",
+		"ASNChecker\nAS 13335\nAS 16509\nAS 14061",
+		"ASNChecker\nAS 36351\nAS 20940\nAS 8100",
+	}
+)
+
+// BenchmarkSHA256_PolicyInputs times SHA256sum over the policyInputs
+// fixtures, advancing to the next fixture on every iteration.
+func BenchmarkSHA256_PolicyInputs(b *testing.B) {
+	count := len(policyInputs)
+	b.ResetTimer()
+	for iter := 0; iter < b.N; iter++ {
+		_ = SHA256sum(policyInputs[iter%count])
+	}
+}
+
+// BenchmarkXXHash_PolicyInputs times XXHash64sum over the policyInputs
+// fixtures, advancing to the next fixture on every iteration.
+func BenchmarkXXHash_PolicyInputs(b *testing.B) {
+	count := len(policyInputs)
+	b.ResetTimer()
+	for iter := 0; iter < b.N; iter++ {
+		_ = XXHash64sum(policyInputs[iter%count])
+	}
+}
+
+// BenchmarkSHA256_ChallengeInputs times SHA256sum over the challengeInputs
+// fixtures, advancing to the next fixture on every iteration.
+func BenchmarkSHA256_ChallengeInputs(b *testing.B) {
+	count := len(challengeInputs)
+	b.ResetTimer()
+	for iter := 0; iter < b.N; iter++ {
+		_ = SHA256sum(challengeInputs[iter%count])
+	}
+}
+
+// BenchmarkXXHash_ChallengeInputs times XXHash64sum over the challengeInputs
+// fixtures, advancing to the next fixture on every iteration.
+func BenchmarkXXHash_ChallengeInputs(b *testing.B) {
+	count := len(challengeInputs)
+	b.ResetTimer()
+	for iter := 0; iter < b.N; iter++ {
+		_ = XXHash64sum(challengeInputs[iter%count])
+	}
+}
+
+// BenchmarkSHA256_BotRuleInputs times SHA256sum over the botRuleInputs
+// fixtures, advancing to the next fixture on every iteration.
+func BenchmarkSHA256_BotRuleInputs(b *testing.B) {
+	count := len(botRuleInputs)
+	b.ResetTimer()
+	for iter := 0; iter < b.N; iter++ {
+		_ = SHA256sum(botRuleInputs[iter%count])
+	}
+}
+
+// BenchmarkXXHash_BotRuleInputs times XXHash64sum over the botRuleInputs
+// fixtures, advancing to the next fixture on every iteration.
+func BenchmarkXXHash_BotRuleInputs(b *testing.B) {
+	count := len(botRuleInputs)
+	b.ResetTimer()
+	for iter := 0; iter < b.N; iter++ {
+		_ = XXHash64sum(botRuleInputs[iter%count])
+	}
+}
+
+// BenchmarkSHA256_CELInputs times SHA256sum over the celInputs fixtures,
+// advancing to the next fixture on every iteration.
+func BenchmarkSHA256_CELInputs(b *testing.B) {
+	count := len(celInputs)
+	b.ResetTimer()
+	for iter := 0; iter < b.N; iter++ {
+		_ = SHA256sum(celInputs[iter%count])
+	}
+}
+
+// BenchmarkXXHash_CELInputs times XXHash64sum over the celInputs fixtures,
+// advancing to the next fixture on every iteration.
+func BenchmarkXXHash_CELInputs(b *testing.B) {
+	count := len(celInputs)
+	b.ResetTimer()
+	for iter := 0; iter < b.N; iter++ {
+		_ = XXHash64sum(celInputs[iter%count])
+	}
+}
+
+// BenchmarkSHA256_ASNInputs times SHA256sum over the asnInputs fixtures,
+// advancing to the next fixture on every iteration.
+func BenchmarkSHA256_ASNInputs(b *testing.B) {
+	count := len(asnInputs)
+	b.ResetTimer()
+	for iter := 0; iter < b.N; iter++ {
+		_ = SHA256sum(asnInputs[iter%count])
+	}
+}
+
+// BenchmarkXXHash_ASNInputs times XXHash64sum over the asnInputs fixtures,
+// advancing to the next fixture on every iteration.
+func BenchmarkXXHash_ASNInputs(b *testing.B) {
+	count := len(asnInputs)
+	b.ResetTimer()
+	for iter := 0; iter < b.N; iter++ {
+		_ = XXHash64sum(asnInputs[iter%count])
+	}
+}
+
+// BenchmarkSHA256_PolicyList benchmarks the policy list hashing used in
+// checker.go: each policyInputs entry is hashed with SHA256sum, the digests
+// are written one per line, and the combined text is hashed once more.
+func BenchmarkSHA256_PolicyList(b *testing.B) {
+	b.ResetTimer()
+	for iter := 0; iter < b.N; iter++ {
+		var digests strings.Builder
+		for idx := range policyInputs {
+			fmt.Fprintln(&digests, SHA256sum(policyInputs[idx]))
+		}
+		_ = SHA256sum(digests.String())
+	}
+}
+
+// BenchmarkXXHash_PolicyList hashes each policyInputs entry with
+// XXHash64sum, writes the digests one per line, and hashes the combined
+// text once more on every iteration.
+func BenchmarkXXHash_PolicyList(b *testing.B) {
+	b.ResetTimer()
+	for iter := 0; iter < b.N; iter++ {
+		var digests strings.Builder
+		for idx := range policyInputs {
+			fmt.Fprintln(&digests, XXHash64sum(policyInputs[idx]))
+		}
+		_ = XXHash64sum(digests.String())
+	}
+}
+
+// Tests that xxhash doesn't have collisions in realistic scenarios
+func TestHashCollisions(t *testing.T) {
+	allInputs := append(append(append(append(policyInputs, challengeInputs...), botRuleInputs...), celInputs...), asnInputs...)
+
+	// Start with realistic inputs from actual usage
+	xxhashHashes := make(map[string]string)
+	for _, input := range allInputs {
+		hash := XXHash64sum(input)
+		if existing, exists := xxhashHashes[hash]; exists {
+			t.Errorf("XXHash collision detected: %q and %q both hash to %s", input, existing, hash)
+		}
+		xxhashHashes[hash] = input
+	}
+
+	t.Logf("Basic test: %d realistic inputs, no collisions", len(allInputs))
+
+	// Test similar strings that might cause hash collisions
+	prefixes := []string{"User-Agent: ", "X-Real-IP: ", "Accept-Language: ", "Host: "}
+	suffixes := []string{"bot", "crawler", "spider", "scraper", "Mozilla", "Chrome", "Safari", "Firefox"}
+	variations := []string{"", "/1.0", "/2.0", " (compatible)", " (Windows)", " (Linux)", " (Mac)"}
+
+	stressCount := 0
+	for _, prefix := range prefixes {
+		for _, suffix := range suffixes {
+			for _, variation := range variations {
+				for i := 0; i < 100; i++ {
+					input := fmt.Sprintf("%s%s%s-%d", prefix, suffix, variation, i)
+					hash := XXHash64sum(input)
+					if existing, exists := xxhashHashes[hash]; exists {
+						t.Errorf("XXHash collision in stress test: %q and %q both hash to %s", input, existing, hash)
+					}
+					xxhashHashes[hash] = input
+					stressCount++
+				}
+			}
+		}
+	}
+	t.Logf("Stress test 1: %d similar string variations, no collisions", stressCount)
+
+	// Test sequential patterns that might be problematic
+	patterns := []string{
+		"192.168.1.%d",
+		"10.0.0.%d",
+		"172.16.%d.1",
+		"challenge-%d",
+		"bot-rule-%d",
+		"policy-%016x",
+		"session-%016x",
+	}
+
+	seqCount := 0
+	for _, pattern := range patterns {
+		for i := 0; i < 10000; i++ {
+			input := fmt.Sprintf(pattern, i)
+			hash := XXHash64sum(input)
+			if existing, exists := xxhashHashes[hash]; exists {
+				t.Errorf("XXHash collision in sequential test: %q and %q both hash to %s", input, existing, hash)
+			}
+			xxhashHashes[hash] = input
+			seqCount++
+		}
+	}
+	t.Logf("Stress test 2: %d sequential patterns, no collisions", seqCount)
+
+	totalInputs := len(allInputs) + stressCount + seqCount
+	t.Logf("TOTAL: Tested %d inputs across realistic scenarios - NO COLLISIONS", totalInputs)
+}
+
+// Verify xxhash output works as cache keys
+func TestXXHashFormat(t *testing.T) {
+	testCases := []string{
+		"short",
+		"",
+		"very long string with lots of content that might be used in policy checking and other internal hashing scenarios",
+		"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+	}
+
+	for _, input := range testCases {
+		hash := XXHash64sum(input)
+
+		// Check it's valid hex
+		if len(hash) == 0 {
+			t.Errorf("Empty hash for input %q", input)
+		}
+
+		// xxhash is 64-bit so max 16 hex chars
+		if len(hash) > 16 {
+			t.Errorf("Hash too long for input %q: %s (length %d)", input, hash, len(hash))
+		}
+
+		// Make sure it's all hex characters
+		for _, char := range hash {
+			if !((char >= '0' && char <= '9') || (char >= 'a' && char <= 'f')) {
+				t.Errorf("Non-hex character %c in hash %s for input %q", char, hash, input)
+			}
+		}
+
+		t.Logf("Input: %q -> Hash: %s", input, hash)
+	}
+}