perf: Replace internal SHA256 hashing with xxhash for 4-6x performance improvement (#676)
* perf(internal): Use FastHash for internal hashing docs: Add xxhash performance improvement to changelog entry feat(hash): Add fast non-cryptographic hash function Signed-off-by: Jason Cameron <git@jasoncameron.dev> * test(hash): add xxhash benchmarks and collision tests Signed-off-by: Jason Cameron <git@jasoncameron.dev> * Update metadata check-spelling run (pull_request) for json/hash Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com> on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev> --------- Signed-off-by: Jason Cameron <git@jasoncameron.dev> Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
This commit is contained in:
parent
3437e575d4
commit
e2b46fc5e7
11 changed files with 291 additions and 16 deletions
261
internal/hash_bench_test.go
Normal file
261
internal/hash_bench_test.go
Normal file
|
|
@ -0,0 +1,261 @@
|
|||
package internal
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// XXHash64sum is a test alias for FastHash to benchmark against SHA256
|
||||
func XXHash64sum(text string) string {
|
||||
return FastHash(text)
|
||||
}
|
||||
|
||||
// Test data that matches real usage patterns in the codebase
|
||||
var (
|
||||
// Typical policy checker inputs
|
||||
policyInputs = []string{
|
||||
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
||||
"User-Agent: bot/1.0",
|
||||
"User-Agent: GoogleBot/2.1",
|
||||
"/robots.txt",
|
||||
"/api/.*",
|
||||
"10.0.0.0/8",
|
||||
"192.168.1.0/24",
|
||||
"172.16.0.0/12",
|
||||
}
|
||||
|
||||
// Challenge data from challengeFor function
|
||||
challengeInputs = []string{
|
||||
"Accept-Language=en-US,X-Real-IP=192.168.1.100,User-Agent=Mozilla/5.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=abc123,Difficulty=5",
|
||||
"Accept-Language=fr-FR,X-Real-IP=10.0.0.50,User-Agent=Chrome/91.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=def456,Difficulty=3",
|
||||
"Accept-Language=es-ES,X-Real-IP=172.16.1.1,User-Agent=Safari/14.0,WeekTime=2025-06-16T00:00:00Z,Fingerprint=ghi789,Difficulty=7",
|
||||
}
|
||||
|
||||
// Bot rule patterns
|
||||
botRuleInputs = []string{
|
||||
"GoogleBot::path:/robots.txt",
|
||||
"BingBot::useragent:Mozilla/5.0 (compatible; bingbot/2.0)",
|
||||
"FacebookBot::headers:Accept-Language,User-Agent",
|
||||
"TwitterBot::cidr:192.168.1.0/24",
|
||||
}
|
||||
|
||||
// CEL expressions from policy rules
|
||||
celInputs = []string{
|
||||
`request.headers["User-Agent"].contains("bot")`,
|
||||
`request.path.startsWith("/api/") && request.method == "POST"`,
|
||||
`request.remoteAddress in ["192.168.1.0/24", "10.0.0.0/8"]`,
|
||||
`request.userAgent.matches(".*[Bb]ot.*") || request.userAgent.matches(".*[Cc]rawler.*")`,
|
||||
}
|
||||
|
||||
// Thoth ASN checker inputs
|
||||
asnInputs = []string{
|
||||
"ASNChecker\nAS 15169\nAS 8075\nAS 32934",
|
||||
"ASNChecker\nAS 13335\nAS 16509\nAS 14061",
|
||||
"ASNChecker\nAS 36351\nAS 20940\nAS 8100",
|
||||
}
|
||||
)
|
||||
|
||||
func BenchmarkSHA256_PolicyInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := policyInputs[i%len(policyInputs)]
|
||||
_ = SHA256sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkXXHash_PolicyInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := policyInputs[i%len(policyInputs)]
|
||||
_ = XXHash64sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSHA256_ChallengeInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := challengeInputs[i%len(challengeInputs)]
|
||||
_ = SHA256sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkXXHash_ChallengeInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := challengeInputs[i%len(challengeInputs)]
|
||||
_ = XXHash64sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSHA256_BotRuleInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := botRuleInputs[i%len(botRuleInputs)]
|
||||
_ = SHA256sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkXXHash_BotRuleInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := botRuleInputs[i%len(botRuleInputs)]
|
||||
_ = XXHash64sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSHA256_CELInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := celInputs[i%len(celInputs)]
|
||||
_ = SHA256sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkXXHash_CELInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := celInputs[i%len(celInputs)]
|
||||
_ = XXHash64sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSHA256_ASNInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := asnInputs[i%len(asnInputs)]
|
||||
_ = SHA256sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkXXHash_ASNInputs(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
input := asnInputs[i%len(asnInputs)]
|
||||
_ = XXHash64sum(input)
|
||||
}
|
||||
}
|
||||
|
||||
// Benchmark the policy list hashing used in checker.go
|
||||
func BenchmarkSHA256_PolicyList(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
var sb strings.Builder
|
||||
for _, input := range policyInputs {
|
||||
fmt.Fprintln(&sb, SHA256sum(input))
|
||||
}
|
||||
_ = SHA256sum(sb.String())
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkXXHash_PolicyList(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
var sb strings.Builder
|
||||
for _, input := range policyInputs {
|
||||
fmt.Fprintln(&sb, XXHash64sum(input))
|
||||
}
|
||||
_ = XXHash64sum(sb.String())
|
||||
}
|
||||
}
|
||||
|
||||
// Tests that xxhash doesn't have collisions in realistic scenarios
|
||||
func TestHashCollisions(t *testing.T) {
|
||||
allInputs := append(append(append(append(policyInputs, challengeInputs...), botRuleInputs...), celInputs...), asnInputs...)
|
||||
|
||||
// Start with realistic inputs from actual usage
|
||||
xxhashHashes := make(map[string]string)
|
||||
for _, input := range allInputs {
|
||||
hash := XXHash64sum(input)
|
||||
if existing, exists := xxhashHashes[hash]; exists {
|
||||
t.Errorf("XXHash collision detected: %q and %q both hash to %s", input, existing, hash)
|
||||
}
|
||||
xxhashHashes[hash] = input
|
||||
}
|
||||
|
||||
t.Logf("Basic test: %d realistic inputs, no collisions", len(allInputs))
|
||||
|
||||
// Test similar strings that might cause hash collisions
|
||||
prefixes := []string{"User-Agent: ", "X-Real-IP: ", "Accept-Language: ", "Host: "}
|
||||
suffixes := []string{"bot", "crawler", "spider", "scraper", "Mozilla", "Chrome", "Safari", "Firefox"}
|
||||
variations := []string{"", "/1.0", "/2.0", " (compatible)", " (Windows)", " (Linux)", " (Mac)"}
|
||||
|
||||
stressCount := 0
|
||||
for _, prefix := range prefixes {
|
||||
for _, suffix := range suffixes {
|
||||
for _, variation := range variations {
|
||||
for i := 0; i < 100; i++ {
|
||||
input := fmt.Sprintf("%s%s%s-%d", prefix, suffix, variation, i)
|
||||
hash := XXHash64sum(input)
|
||||
if existing, exists := xxhashHashes[hash]; exists {
|
||||
t.Errorf("XXHash collision in stress test: %q and %q both hash to %s", input, existing, hash)
|
||||
}
|
||||
xxhashHashes[hash] = input
|
||||
stressCount++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
t.Logf("Stress test 1: %d similar string variations, no collisions", stressCount)
|
||||
|
||||
// Test sequential patterns that might be problematic
|
||||
patterns := []string{
|
||||
"192.168.1.%d",
|
||||
"10.0.0.%d",
|
||||
"172.16.%d.1",
|
||||
"challenge-%d",
|
||||
"bot-rule-%d",
|
||||
"policy-%016x",
|
||||
"session-%016x",
|
||||
}
|
||||
|
||||
seqCount := 0
|
||||
for _, pattern := range patterns {
|
||||
for i := 0; i < 10000; i++ {
|
||||
input := fmt.Sprintf(pattern, i)
|
||||
hash := XXHash64sum(input)
|
||||
if existing, exists := xxhashHashes[hash]; exists {
|
||||
t.Errorf("XXHash collision in sequential test: %q and %q both hash to %s", input, existing, hash)
|
||||
}
|
||||
xxhashHashes[hash] = input
|
||||
seqCount++
|
||||
}
|
||||
}
|
||||
t.Logf("Stress test 2: %d sequential patterns, no collisions", seqCount)
|
||||
|
||||
totalInputs := len(allInputs) + stressCount + seqCount
|
||||
t.Logf("TOTAL: Tested %d inputs across realistic scenarios - NO COLLISIONS", totalInputs)
|
||||
}
|
||||
|
||||
// Verify xxhash output works as cache keys
|
||||
func TestXXHashFormat(t *testing.T) {
|
||||
testCases := []string{
|
||||
"short",
|
||||
"",
|
||||
"very long string with lots of content that might be used in policy checking and other internal hashing scenarios",
|
||||
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
||||
}
|
||||
|
||||
for _, input := range testCases {
|
||||
hash := XXHash64sum(input)
|
||||
|
||||
// Check it's valid hex
|
||||
if len(hash) == 0 {
|
||||
t.Errorf("Empty hash for input %q", input)
|
||||
}
|
||||
|
||||
// xxhash is 64-bit so max 16 hex chars
|
||||
if len(hash) > 16 {
|
||||
t.Errorf("Hash too long for input %q: %s (length %d)", input, hash, len(hash))
|
||||
}
|
||||
|
||||
// Make sure it's all hex characters
|
||||
for _, char := range hash {
|
||||
if !((char >= '0' && char <= '9') || (char >= 'a' && char <= 'f')) {
|
||||
t.Errorf("Non-hex character %c in hash %s for input %q", char, hash, input)
|
||||
}
|
||||
}
|
||||
|
||||
t.Logf("Input: %q -> Hash: %s", input, hash)
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue