Weighs the soul of incoming HTTP requests to stop AI crawlers

fix(config): actually load threshold config (#696)

* fix(config): actually load threshold config

Signed-off-by: Xe Iaso <me@xeiaso.net>

* chore: spelling

Signed-off-by: Xe Iaso <me@xeiaso.net>

* test(lib): fix test failures

Signed-off-by: Xe Iaso <me@xeiaso.net>

---------

Signed-off-by: Xe Iaso <me@xeiaso.net>

authored by Xe Iaso and committed by GitHub 7aa732c7 226cf36b

+2
.github/actions/spelling/excludes.txt
··· 83 83 ^\Q.github/FUNDING.yml\E$ 84 84 ^\Q.github/workflows/spelling.yml\E$ 85 85 ^data/crawlers/ 86 + ^docs/blog/tags\.yml$ 86 87 ^docs/manifest/.*$ 87 88 ^docs/static/\.nojekyll$ 89 + ^lib/policy/config/testdata/bad/unparseable\.json$ 88 90 ignore$ 89 91 robots.txt
+1 -2
.github/actions/spelling/expect.txt
··· 44 44 challengemozilla 45 45 checkpath 46 46 checkresult 47 - chen 48 47 chibi 49 48 cidranger 50 49 ckie ··· 61 60 Debian 62 61 debrpm 63 62 decaymap 64 - decompiling 65 63 Diffbot 66 64 discordapp 67 65 discordbot ··· 300 298 xff 301 299 XForwarded 302 300 XNG 301 + XOB 303 302 XReal 304 303 yae 305 304 YAMLTo
+17 -20
lib/anubis_test.go
··· 24 24 internal.InitSlog("debug") 25 25 } 26 26 27 - func loadPolicies(t *testing.T, fname string) *policy.ParsedConfig { 27 + func loadPolicies(t *testing.T, fname string, difficulty int) *policy.ParsedConfig { 28 28 t.Helper() 29 29 30 30 ctx := thothmock.WithMockThoth(t) 31 31 32 - anubisPolicy, err := LoadPoliciesOrDefault(ctx, fname, anubis.DefaultDifficulty) 32 + if fname == "" { 33 + fname = "./testdata/test_config.yaml" 34 + } 35 + 36 + anubisPolicy, err := LoadPoliciesOrDefault(ctx, fname, difficulty) 33 37 if err != nil { 34 38 t.Fatal(err) 35 39 } ··· 176 180 177 181 // Regression test for CVE-2025-24369 178 182 func TestCVE2025_24369(t *testing.T) { 179 - pol := loadPolicies(t, "") 180 - pol.DefaultDifficulty = 4 183 + pol := loadPolicies(t, "", anubis.DefaultDifficulty) 181 184 182 185 srv := spawnAnubis(t, Options{ 183 186 Next: http.NewServeMux(), ··· 200 203 } 201 204 202 205 func TestCookieCustomExpiration(t *testing.T) { 203 - pol := loadPolicies(t, "") 204 - pol.DefaultDifficulty = 0 206 + pol := loadPolicies(t, "", 0) 205 207 ckieExpiration := 10 * time.Minute 206 208 207 209 srv := spawnAnubis(t, Options{ ··· 250 252 } 251 253 252 254 func TestCookieSettings(t *testing.T) { 253 - pol := loadPolicies(t, "") 254 - pol.DefaultDifficulty = 0 255 + pol := loadPolicies(t, "", 0) 255 256 256 257 srv := spawnAnubis(t, Options{ 257 258 Next: http.NewServeMux(), ··· 316 317 317 318 for i := 1; i < 10; i++ { 318 319 t.Run(fmt.Sprint(i), func(t *testing.T) { 319 - anubisPolicy, err := LoadPoliciesOrDefault(t.Context(), "", i) 320 - if err != nil { 321 - t.Fatal(err) 322 - } 320 + anubisPolicy := loadPolicies(t, "", i) 323 321 324 322 s, err := New(Options{ 325 323 Next: h, ··· 337 335 338 336 req.Header.Add("X-Real-Ip", "127.0.0.1") 339 337 340 - _, bot, err := s.check(req) 338 + cr, bot, err := s.check(req) 341 339 if err != nil { 342 340 t.Fatal(err) 343 341 } 342 + 343 + t.Log(cr.Name) 344 344 345 345 if bot.Challenge.Difficulty != i { 346 346 t.Errorf("Challenge.Difficulty is wrong, wanted %d, got: %d", i, bot.Challenge.Difficulty) ··· 389 389 // Reset the global BasePrefix before each test 390 390 anubis.BasePrefix = "" 391 391 392 - pol := loadPolicies(t, "") 393 - pol.DefaultDifficulty = 4 392 + pol := loadPolicies(t, "", 4) 394 393 395 394 srv := spawnAnubis(t, Options{ 396 395 Next: h, ··· 518 517 "DENY": 403, 519 518 } 520 519 521 - pol := loadPolicies(t, "./testdata/aggressive_403.yaml") 522 - pol.DefaultDifficulty = 4 520 + pol := loadPolicies(t, "./testdata/aggressive_403.yaml", 4) 523 521 524 522 srv := spawnAnubis(t, Options{ 525 523 Next: h, ··· 553 551 func TestCloudflareWorkersRule(t *testing.T) { 554 552 for _, variant := range []string{"cel", "header"} { 555 553 t.Run(variant, func(t *testing.T) { 556 - pol := loadPolicies(t, "./testdata/cloudflare-workers-"+variant+".yaml") 554 + pol := loadPolicies(t, "./testdata/cloudflare-workers-"+variant+".yaml", 0) 557 555 558 556 h := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 559 557 fmt.Fprintln(w, "OK") ··· 609 607 } 610 608 611 609 func TestRuleChange(t *testing.T) { 612 - pol := loadPolicies(t, "testdata/rule_change.yaml") 613 - pol.DefaultDifficulty = 0 610 + pol := loadPolicies(t, "testdata/rule_change.yaml", 0) 614 611 ckieExpiration := 10 * time.Minute 615 612 616 613 srv := spawnAnubis(t, Options{
+1 -1
lib/config_test.go
··· 26 26 for _, st := range finfos { 27 27 st := st 28 28 t.Run(st.Name(), func(t *testing.T) { 29 - if _, err := LoadPoliciesOrDefault(t.Context(), filepath.Join("policy", "config", "testdata", "good", st.Name()), anubis.DefaultDifficulty); err == nil { 29 + if _, err := LoadPoliciesOrDefault(t.Context(), filepath.Join("policy", "config", "testdata", "bad", st.Name()), anubis.DefaultDifficulty); err == nil { 30 30 t.Fatal(err) 31 31 } else { 32 32 t.Log(err)
+55
lib/policy/config/asn_test.go
··· 1 + package config 2 + 3 + import ( 4 + "errors" 5 + "fmt" 6 + "testing" 7 + ) 8 + 9 + func TestASNsValid(t *testing.T) { 10 + for _, tt := range []struct { 11 + name string 12 + input *ASNs 13 + err error 14 + }{ 15 + { 16 + name: "basic valid", 17 + input: &ASNs{ 18 + Match: []uint32{13335}, // Cloudflare 19 + }, 20 + }, 21 + { 22 + name: "private ASN", 23 + input: &ASNs{ 24 + Match: []uint32{64513, 4206942069}, // 16 and 32 bit private ASN 25 + }, 26 + err: ErrPrivateASN, 27 + }, 28 + } { 29 + t.Run(tt.name, func(t *testing.T) { 30 + if err := tt.input.Valid(); !errors.Is(err, tt.err) { 31 + t.Logf("want: %v", tt.err) 32 + t.Logf("got: %v", err) 33 + t.Error("got wrong validation error") 34 + } 35 + }) 36 + } 37 + } 38 + 39 + func TestIsPrivateASN(t *testing.T) { 40 + for _, tt := range []struct { 41 + input uint32 42 + output bool 43 + }{ 44 + {13335, false}, // Cloudflare 45 + {64513, true}, // 16 bit private ASN 46 + {4206942069, true}, // 32 bit private ASN 47 + } { 48 + t.Run(fmt.Sprint(tt.input, "->", tt.output), func(t *testing.T) { 49 + result := isPrivateASN(tt.input) 50 + if result != tt.output { 51 + t.Errorf("wanted isPrivateASN(%d) == %v, got: %v", tt.input, tt.output, result) 52 + } 53 + }) 54 + } 55 + }
+5 -6
lib/policy/config/config.go
··· 326 326 Bots []BotOrImport `json:"bots"` 327 327 DNSBL bool `json:"dnsbl"` 328 328 StatusCodes StatusCodes `json:"status_codes"` 329 - Thresholds []Threshold `json:"threshold"` 329 + Thresholds []Threshold `json:"thresholds"` 330 330 } 331 331 332 332 func (c *fileConfig) Valid() error { ··· 344 344 345 345 if err := c.StatusCodes.Valid(); err != nil { 346 346 errs = append(errs, err) 347 - } 348 - 349 - if len(c.Thresholds) == 0 { 350 - errs = append(errs, ErrNoThresholdRulesDefined) 351 347 } 352 348 353 349 for i, t := range c.Thresholds { ··· 369 365 Challenge: http.StatusOK, 370 366 Deny: http.StatusOK, 371 367 }, 372 - Thresholds: DefaultThresholds, 373 368 } 374 369 375 370 if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil { ··· 405 400 406 401 result.Bots = append(result.Bots, *boi.BotConfig) 407 402 } 403 + } 404 + 405 + if len(c.Thresholds) == 0 { 406 + c.Thresholds = DefaultThresholds 408 407 } 409 408 410 409 for _, t := range c.Thresholds {
+1 -1
lib/policy/config/geoip.go
··· 8 8 ) 9 9 10 10 var ( 11 - countryCodeRegexp = regexp.MustCompile(`^\w{2}$`) 11 + countryCodeRegexp = regexp.MustCompile(`^[a-zA-Z]{2}$`) 12 12 13 13 ErrNotCountryCode = errors.New("config.Bot: invalid country code") 14 14 )
+36
lib/policy/config/geoip_test.go
··· 1 + package config 2 + 3 + import ( 4 + "errors" 5 + "testing" 6 + ) 7 + 8 + func TestGeoIPValid(t *testing.T) { 9 + for _, tt := range []struct { 10 + name string 11 + input *GeoIP 12 + err error 13 + }{ 14 + { 15 + name: "basic valid", 16 + input: &GeoIP{ 17 + Countries: []string{"CA"}, 18 + }, 19 + }, 20 + { 21 + name: "invalid country", 22 + input: &GeoIP{ 23 + Countries: []string{"XOB"}, 24 + }, 25 + err: ErrNotCountryCode, 26 + }, 27 + } { 28 + t.Run(tt.name, func(t *testing.T) { 29 + if err := tt.input.Valid(); !errors.Is(err, tt.err) { 30 + t.Logf("want: %v", tt.err) 31 + t.Logf("got: %v", err) 32 + t.Error("got wrong validation error") 33 + } 34 + }) 35 + } 36 + }
+11
lib/policy/config/testdata/bad/threshold-challenge-without-challenge.yaml
··· 1 + bots: 2 + - name: simple-weight-adjust 3 + action: WEIGH 4 + user_agent_regex: Mozilla 5 + weight: 6 + adjust: 5 7 + 8 + thresholds: 9 + - name: extreme-suspicion 10 + expression: "true" 11 + action: WEIGH
+15
lib/policy/config/testdata/bad/thresholds.yaml
··· 1 + bots: 2 + - name: simple-weight-adjust 3 + action: WEIGH 4 + user_agent_regex: Mozilla 5 + weight: 6 + adjust: 5 7 + 8 + thresholds: 9 + - name: extreme-suspicion 10 + expression: "true" 11 + action: WEIGH 12 + challenge: 13 + algorithm: fast 14 + difficulty: 4 15 + report_as: 4
+19
lib/policy/config/threshold_test.go
··· 3 3 import ( 4 4 "errors" 5 5 "fmt" 6 + "os" 7 + "path/filepath" 6 8 "testing" 7 9 ) 8 10 ··· 90 92 }) 91 93 } 92 94 } 95 + 96 + func TestLoadActuallyLoadsThresholds(t *testing.T) { 97 + fin, err := os.Open(filepath.Join(".", "testdata", "good", "thresholds.yaml")) 98 + if err != nil { 99 + t.Fatal(err) 100 + } 101 + defer fin.Close() 102 + 103 + c, err := Load(fin, fin.Name()) 104 + if err != nil { 105 + t.Fatal(err) 106 + } 107 + 108 + if len(c.Thresholds) != 4 { 109 + t.Errorf("wanted 4 thresholds, got %d thresholds", len(c.Thresholds)) 110 + } 111 + }
+38
lib/testdata/test_config.yaml
··· 1 + bots: 2 + - import: (data)/bots/_deny-pathological.yaml 3 + - import: (data)/bots/aggressive-brazilian-scrapers.yaml 4 + - import: (data)/meta/ai-block-aggressive.yaml 5 + - import: (data)/crawlers/_allow-good.yaml 6 + - import: (data)/clients/x-firefox-ai.yaml 7 + - import: (data)/common/keep-internet-working.yaml 8 + - name: countries-with-aggressive-scrapers 9 + action: WEIGH 10 + geoip: 11 + countries: 12 + - BR 13 + - CN 14 + weight: 15 + adjust: 10 16 + - name: aggressive-asns-without-functional-abuse-contact 17 + action: WEIGH 18 + asns: 19 + match: 20 + - 13335 # Cloudflare 21 + - 136907 # Huawei Cloud 22 + - 45102 # Alibaba Cloud 23 + weight: 24 + adjust: 10 25 + - name: generic-browser 26 + user_agent_regex: >- 27 + Mozilla|Opera 28 + action: WEIGH 29 + weight: 30 + adjust: 10 31 + 32 + dnsbl: false 33 + 34 + status_codes: 35 + CHALLENGE: 200 36 + DENY: 200 37 + 38 + thresholds: []