+1
-1
.env.example
+143
-6
Cargo.lock
···
565
565
"multihash",
566
566
"serde",
567
567
"serde_bytes",
568
-
"unsigned-varint",
568
+
"unsigned-varint 0.8.0",
569
569
]
570
570
571
571
[[package]]
···
743
743
]
744
744
745
745
[[package]]
746
+
name = "curve25519-dalek"
747
+
version = "4.1.3"
748
+
source = "registry+https://github.com/rust-lang/crates.io-index"
749
+
checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
750
+
dependencies = [
751
+
"cfg-if",
752
+
"cpufeatures",
753
+
"curve25519-dalek-derive",
754
+
"digest",
755
+
"fiat-crypto",
756
+
"rustc_version",
757
+
"subtle",
758
+
"zeroize",
759
+
]
760
+
761
+
[[package]]
762
+
name = "curve25519-dalek-derive"
763
+
version = "0.1.1"
764
+
source = "registry+https://github.com/rust-lang/crates.io-index"
765
+
checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3"
766
+
dependencies = [
767
+
"proc-macro2",
768
+
"quote",
769
+
"syn 2.0.108",
770
+
]
771
+
772
+
[[package]]
746
773
name = "darling"
747
774
version = "0.21.3"
748
775
source = "registry+https://github.com/rust-lang/crates.io-index"
···
840
867
841
868
[[package]]
842
869
name = "deranged"
843
-
version = "0.5.4"
870
+
version = "0.5.5"
844
871
source = "registry+https://github.com/rust-lang/crates.io-index"
845
-
checksum = "a41953f86f8a05768a6cda24def994fd2f424b04ec5c719cf89989779f199071"
872
+
checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587"
846
873
dependencies = [
847
874
"powerfmt",
848
875
]
···
915
942
"rfc6979",
916
943
"signature",
917
944
"spki",
945
+
]
946
+
947
+
[[package]]
948
+
name = "ed25519"
949
+
version = "2.2.3"
950
+
source = "registry+https://github.com/rust-lang/crates.io-index"
951
+
checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53"
952
+
dependencies = [
953
+
"pkcs8",
954
+
"signature",
955
+
]
956
+
957
+
[[package]]
958
+
name = "ed25519-dalek"
959
+
version = "2.2.0"
960
+
source = "registry+https://github.com/rust-lang/crates.io-index"
961
+
checksum = "70e796c081cee67dc755e1a36a0a172b897fab85fc3f6bc48307991f64e4eca9"
962
+
dependencies = [
963
+
"curve25519-dalek",
964
+
"ed25519",
965
+
"rand_core 0.6.4",
966
+
"serde",
967
+
"sha2",
968
+
"subtle",
969
+
"zeroize",
918
970
]
919
971
920
972
[[package]]
···
1059
1111
"rand_core 0.6.4",
1060
1112
"subtle",
1061
1113
]
1114
+
1115
+
[[package]]
1116
+
name = "fiat-crypto"
1117
+
version = "0.2.9"
1118
+
source = "registry+https://github.com/rust-lang/crates.io-index"
1119
+
checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
1062
1120
1063
1121
[[package]]
1064
1122
name = "filetime"
···
1891
1949
]
1892
1950
1893
1951
[[package]]
1952
+
name = "iroh-car"
1953
+
version = "0.5.1"
1954
+
source = "registry+https://github.com/rust-lang/crates.io-index"
1955
+
checksum = "cb7f8cd4cb9aa083fba8b52e921764252d0b4dcb1cd6d120b809dbfe1106e81a"
1956
+
dependencies = [
1957
+
"anyhow",
1958
+
"cid",
1959
+
"futures",
1960
+
"serde",
1961
+
"serde_ipld_dagcbor",
1962
+
"thiserror 1.0.69",
1963
+
"tokio",
1964
+
"unsigned-varint 0.7.2",
1965
+
]
1966
+
1967
+
[[package]]
1894
1968
name = "is_ci"
1895
1969
version = "1.2.0"
1896
1970
source = "registry+https://github.com/rust-lang/crates.io-index"
···
1923
1997
[[package]]
1924
1998
name = "jacquard"
1925
1999
version = "0.8.0"
2000
+
source = "registry+https://github.com/rust-lang/crates.io-index"
2001
+
checksum = "11e763fb566b9ffa3c6b68d65da64a5028e03c3ebf9b3c4521e76c06edd65734"
1926
2002
dependencies = [
1927
2003
"bon",
1928
2004
"bytes",
···
1955
2031
[[package]]
1956
2032
name = "jacquard-api"
1957
2033
version = "0.8.0"
2034
+
source = "registry+https://github.com/rust-lang/crates.io-index"
2035
+
checksum = "5db12067a89e7092a995229973d44f094d39d15667f48a7d36fe833de8f2caa7"
1958
2036
dependencies = [
1959
2037
"bon",
1960
2038
"bytes",
···
1969
2047
[[package]]
1970
2048
name = "jacquard-common"
1971
2049
version = "0.8.0"
2050
+
source = "registry+https://github.com/rust-lang/crates.io-index"
2051
+
checksum = "3f5ad103ff5efa640e34a4c26a57b6ae56585ad3fab99477d386f09f5119fef1"
1972
2052
dependencies = [
1973
2053
"base64 0.22.1",
1974
2054
"bon",
···
1976
2056
"chrono",
1977
2057
"ciborium",
1978
2058
"cid",
2059
+
"ed25519-dalek",
1979
2060
"futures",
1980
2061
"getrandom 0.2.16",
1981
2062
"getrandom 0.3.4",
···
2009
2090
[[package]]
2010
2091
name = "jacquard-derive"
2011
2092
version = "0.8.0"
2093
+
source = "registry+https://github.com/rust-lang/crates.io-index"
2094
+
checksum = "107f2ecd44086d7f5f89a328589f5535d02a35cf70c9e54362deeccdcdeac662"
2012
2095
dependencies = [
2013
2096
"proc-macro2",
2014
2097
"quote",
···
2018
2101
[[package]]
2019
2102
name = "jacquard-identity"
2020
2103
version = "0.8.0"
2104
+
source = "registry+https://github.com/rust-lang/crates.io-index"
2105
+
checksum = "48e7b884ae9fa95e20e3da45be923a2850dd350feca7ef3c26af2e50e5f96dd4"
2021
2106
dependencies = [
2022
2107
"bon",
2023
2108
"bytes",
···
2041
2126
[[package]]
2042
2127
name = "jacquard-oauth"
2043
2128
version = "0.8.0"
2129
+
source = "registry+https://github.com/rust-lang/crates.io-index"
2130
+
checksum = "aaffa112735305f436ef6249f13ec48e5add7229e920f72032f73e764e40022b"
2044
2131
dependencies = [
2045
2132
"base64 0.22.1",
2046
2133
"bytes",
···
2072
2159
]
2073
2160
2074
2161
[[package]]
2162
+
name = "jacquard-repo"
2163
+
version = "0.8.0"
2164
+
source = "registry+https://github.com/rust-lang/crates.io-index"
2165
+
checksum = "a7a1395886e68b60e71ebb42fdbce01b884979f290e462751a346ad75e5d74de"
2166
+
dependencies = [
2167
+
"bytes",
2168
+
"cid",
2169
+
"ed25519-dalek",
2170
+
"ipld-core",
2171
+
"iroh-car",
2172
+
"jacquard-common",
2173
+
"jacquard-derive",
2174
+
"k256",
2175
+
"miette",
2176
+
"multihash",
2177
+
"n0-future",
2178
+
"p256",
2179
+
"serde",
2180
+
"serde_bytes",
2181
+
"serde_ipld_dagcbor",
2182
+
"sha2",
2183
+
"smol_str",
2184
+
"thiserror 2.0.17",
2185
+
"tokio",
2186
+
"trait-variant",
2187
+
]
2188
+
2189
+
[[package]]
2075
2190
name = "jni"
2076
2191
version = "0.21.1"
2077
2192
source = "registry+https://github.com/rust-lang/crates.io-index"
···
2158
2273
"cfg-if",
2159
2274
"ecdsa",
2160
2275
"elliptic-curve",
2276
+
"once_cell",
2161
2277
"sha2",
2278
+
"signature",
2162
2279
]
2163
2280
2164
2281
[[package]]
···
2494
2611
dependencies = [
2495
2612
"core2",
2496
2613
"serde",
2497
-
"unsigned-varint",
2614
+
"unsigned-varint 0.8.0",
2498
2615
]
2499
2616
2500
2617
[[package]]
···
3603
3720
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
3604
3721
3605
3722
[[package]]
3723
+
name = "rustc_version"
3724
+
version = "0.4.1"
3725
+
source = "registry+https://github.com/rust-lang/crates.io-index"
3726
+
checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
3727
+
dependencies = [
3728
+
"semver",
3729
+
]
3730
+
3731
+
[[package]]
3606
3732
name = "rustdct"
3607
3733
version = "0.7.1"
3608
3734
source = "registry+https://github.com/rust-lang/crates.io-index"
···
3782
3908
"core-foundation-sys",
3783
3909
"libc",
3784
3910
]
3911
+
3912
+
[[package]]
3913
+
name = "semver"
3914
+
version = "1.0.27"
3915
+
source = "registry+https://github.com/rust-lang/crates.io-index"
3916
+
checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
3785
3917
3786
3918
[[package]]
3787
3919
name = "send_wrapper"
···
3991
4123
"jacquard",
3992
4124
"jacquard-api",
3993
4125
"jacquard-common",
3994
-
"jacquard-identity",
3995
-
"jacquard-oauth",
4126
+
"jacquard-repo",
3996
4127
"miette",
3997
4128
"mockito",
3998
4129
"redis",
···
4803
4934
version = "0.2.6"
4804
4935
source = "registry+https://github.com/rust-lang/crates.io-index"
4805
4936
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
4937
+
4938
+
[[package]]
4939
+
name = "unsigned-varint"
4940
+
version = "0.7.2"
4941
+
source = "registry+https://github.com/rust-lang/crates.io-index"
4942
+
checksum = "6889a77d49f1f013504cec6bf97a2c730394adedaeb1deb5ea08949a50541105"
4806
4943
4807
4944
[[package]]
4808
4945
name = "unsigned-varint"
+5
-6
Cargo.toml
···
11
11
tokio = { version = "1", features = ["full"] }
12
12
futures-util = "0.3"
13
13
14
-
# ATProto client (Jacquard) - using local path
15
-
jacquard = { path = "../jacquard/crates/jacquard" }
16
-
jacquard-api = { path = "../jacquard/crates/jacquard-api" }
17
-
jacquard-common = { path = "../jacquard/crates/jacquard-common", features = ["websocket"] }
18
-
jacquard-identity = { path = "../jacquard/crates/jacquard-identity" }
19
-
jacquard-oauth = { path = "../jacquard/crates/jacquard-oauth" }
14
+
# ATProto client (Jacquard)
15
+
jacquard = "0.8.0"
16
+
jacquard-api = "0.8.0"
17
+
jacquard-common = { version = "0.8.0", features = ["websocket"] }
18
+
jacquard-repo = "0.8.0"
20
19
21
20
# Serialization
22
21
serde = { version = "1.0", features = ["derive"] }
+124
PLAN_REMAINING.md
···
1
+
# Production Resilience Improvements - Implementation Plan
2
+
3
+
## Overview
4
+
Complete the remaining critical issues needed for full production readiness. Each task includes implementation, testing, and verification.
5
+
6
+
---
7
+
8
+
## Task 1: Implement Circuit Breaker Pattern for External APIs
9
+
10
+
**Objective:** Prevent cascading failures when external APIs (Ozone, PDS, PLC) degrade or fail.
11
+
12
+
**Requirements:**
13
+
1. Create a circuit breaker module (`src/resilience/circuit_breaker.rs`)
14
+
2. Implement three independent circuit breakers:
15
+
- **Ozone API** - Opens after 5 consecutive failures, half-opens after 60s
16
+
- **PDS Blob Fetch** - Opens after 3 consecutive failures per endpoint, 5m timeout
17
+
- **PLC Resolution** - Opens after 3 consecutive failures per endpoint, 5m timeout
18
+
3. Circuit breaker states: Closed → Open → Half-Open → Closed (sketched below)
19
+
4. Add metrics tracking: circuit_breaker_state, circuit_breaker_transitions
20
+
5. Update error handling to respect circuit breaker state
21
+
6. Add comprehensive tests:
22
+
- State transitions (closed → open → half-open → closed)
23
+
- Failure threshold triggers
24
+
- Success during half-open closes circuit
25
+
- Failure during half-open reopens circuit
26
+
- Timeout calculation
27
+
28
+
**Files to Modify:**
29
+
- Create: `src/resilience/circuit_breaker.rs`
30
+
- Create: `src/resilience/mod.rs`
31
+
- Modify: `src/moderation/helpers.rs` - Wrap Ozone calls with circuit breaker
32
+
- Modify: `src/processor/matcher.rs` - Wrap PDS/CDN calls with circuit breaker
33
+
- Modify: `src/plc/mod.rs` - Wrap PLC calls with circuit breaker
34
+
- Modify: `src/main.rs` - Initialize circuit breakers
35
+
- Modify: `src/metrics/mod.rs` - Add circuit breaker metrics
36
+
37
+
**Tests Required:**
38
+
- Unit tests for circuit breaker state machine
39
+
- Integration tests for API call wrapping
40
+
- Test timeout and recovery scenarios
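
A minimal sketch of the state machine in requirement 3 (naming and structure illustrative; the real module also needs per-endpoint instances and the metrics from requirement 4):

```rust
use std::time::{Duration, Instant};

enum State {
    Closed { consecutive_failures: u32 },
    Open { since: Instant },
    HalfOpen,
}

pub struct CircuitBreaker {
    state: State,
    failure_threshold: u32, // 5 for Ozone, 3 for PDS/PLC
    open_timeout: Duration, // 60s for Ozone, 5m for PDS/PLC
}

impl CircuitBreaker {
    /// Returns false while the circuit is open; flips to half-open
    /// (allowing one probe request) once the timeout has elapsed.
    pub fn call_allowed(&mut self) -> bool {
        match self.state {
            State::Closed { .. } | State::HalfOpen => true,
            State::Open { since } => {
                if since.elapsed() >= self.open_timeout {
                    self.state = State::HalfOpen;
                    true
                } else {
                    false
                }
            }
        }
    }

    pub fn record_success(&mut self) {
        // Success in any state (including half-open) closes the circuit.
        self.state = State::Closed { consecutive_failures: 0 };
    }

    pub fn record_failure(&mut self) {
        self.state = match self.state {
            State::Closed { consecutive_failures }
                if consecutive_failures + 1 >= self.failure_threshold =>
            {
                State::Open { since: Instant::now() }
            }
            State::Closed { consecutive_failures } => {
                State::Closed { consecutive_failures: consecutive_failures + 1 }
            }
            // A failure while half-open (or already open) reopens the circuit.
            State::HalfOpen | State::Open { .. } => State::Open { since: Instant::now() },
        };
    }
}
```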
41
+
42
+
---
43
+
44
+
## Task 2: Add Redis Connection Failure Backoff and Recovery
45
+
46
+
**Objective:** Handle Redis unavailability gracefully instead of crashing or spinning in error loops.
47
+
48
+
**Requirements:**
49
+
1. Implement connection retry logic with exponential backoff
50
+
2. Add connection pooling via `ConnectionManager` (redis crate provides this)
51
+
3. Add health check mechanism with configurable interval
52
+
4. Implement graceful degradation:
53
+
- First failure: Log and retry with 100ms delay
54
+
- Exponential backoff: Cap at 10s
55
+
- After 5 consecutive failures: Switch to circuit breaker state
56
+
5. Add configuration:
57
+
- `REDIS_HEALTH_CHECK_INTERVAL_SECS` (default: 30s)
58
+
- `REDIS_MAX_BACKOFF_SECS` (default: 10s)
59
+
6. Add metrics: `redis_connection_failures`, `redis_reconnect_attempts`, `redis_health_status`
60
+
7. Add logging for connection state changes
61
+
62
+
**Files to Modify:**
63
+
- Modify: `src/config/mod.rs` - Add redis retry configuration
64
+
- Create: `src/redis_pool.rs` - Redis connection manager with backoff
65
+
- Modify: `src/queue/redis_queue.rs` - Use connection manager
66
+
- Modify: `src/cache/mod.rs` - Use connection manager
67
+
- Modify: `src/main.rs` - Initialize redis pool
68
+
- Modify: `src/metrics/mod.rs` - Add redis metrics
69
+
70
+
**Tests Required:**
71
+
- Unit tests for backoff calculation
72
+
- Connection failure retry tests
73
+
- Health check tests
74
+
- Graceful degradation tests
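
A sketch of the backoff schedule from requirement 4, assuming the cap has already been parsed from `REDIS_MAX_BACKOFF_SECS`:

```rust
use std::time::Duration;

/// 100ms, 200ms, 400ms, ... doubling per consecutive failure, capped.
fn backoff_delay(consecutive_failures: u32, max_backoff: Duration) -> Duration {
    let base = Duration::from_millis(100);
    base.saturating_mul(2u32.saturating_pow(consecutive_failures))
        .min(max_backoff)
}
```

With the defaults above, the delay doubles from 100ms and saturates at 10s from the eighth consecutive failure onward.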
75
+
76
+
---
77
+
78
+
## Task 3: Add Integration Tests for Critical Paths
79
+
80
+
**Objective:** Achieve >50% test coverage for critical paths (up from <5% currently).
81
+
82
+
**Requirements:**
83
+
1. Use `testcontainers` for Redis integration tests
84
+
2. Test critical paths:
85
+
- **Worker job processing flow** - Mock blob download, verify cache, verify moderation
86
+
- **Jetstream message → Job Queue flow** - Verify message parsing, job creation
87
+
- **Moderation actions** - Test with mock Ozone API, verify retry logic
88
+
- **Blob download fallback** - Test CDN failure → PDS success (see the sketch below)
89
+
- **Cache operations** - Test cache hit/miss, TTL expiration
90
+
3. Tests must be isolated (no shared state between tests)
91
+
4. Use `tokio::test` for async tests
92
+
5. Mock external services (Ozone, PDS, PLC) where appropriate
93
+
6. Test error scenarios:
94
+
- Blob download timeout
95
+
- Cache miss
96
+
- Moderation API retry exhaustion
97
+
- Redis unavailability
98
+
7. Organize tests in `tests/integration/` directory
99
+
100
+
**Files to Create:**
101
+
- Create: `tests/integration/mod.rs`
102
+
- Create: `tests/integration/worker_test.rs` - Worker job processing
103
+
- Create: `tests/integration/cache_test.rs` - Cache operations
104
+
- Create: `tests/integration/blob_download_test.rs` - Download with fallback
105
+
- Create: `tests/integration/moderation_test.rs` - Moderation actions with retry
106
+
- Create: `tests/integration/helpers.rs` - Test fixtures and mocks
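
For the blob-download fallback path, one isolated test could look like the sketch below. It uses mockito (already in Cargo.lock) and assumes a hypothetical `download_blob(client, cdn_base, pds_base, did, cid)` helper with injectable base URLs; the real helper may hardcode `cdn.bsky.app`, in which case the base URL needs to become a parameter first.

```rust
#[tokio::test]
async fn cdn_failure_falls_back_to_pds() {
    let mut server = mockito::Server::new_async().await;

    // CDN path 404s (unmatched formats get mockito's default 501, also a failure).
    let _cdn = server
        .mock("GET", "/img/feed_fullsize/plain/did:plc:abc/cid123@jpeg")
        .with_status(404)
        .create_async()
        .await;

    // PDS getBlob succeeds.
    let _pds = server
        .mock("GET", "/xrpc/com.atproto.sync.getBlob")
        .match_query(mockito::Matcher::Any)
        .with_status(200)
        .with_body("fake-image-bytes")
        .create_async()
        .await;

    let client = reqwest::Client::new();
    // Hypothetical helper with injectable CDN/PDS base URLs.
    let bytes = download_blob(&client, &server.url(), &server.url(), "did:plc:abc", "cid123")
        .await
        .expect("PDS fallback should succeed");
    assert_eq!(bytes, b"fake-image-bytes");
}
```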
107
+
108
+
**Acceptance Criteria:**
109
+
- All 10+ integration tests pass
110
+
- >50% code coverage for critical modules (worker, cache, moderation)
111
+
- Tests are deterministic (no flakiness)
112
+
- Tests complete in <30s total
113
+
114
+
---
115
+
116
+
## Success Criteria
117
+
118
+
- [ ] All tasks implemented and tests passing
119
+
- [ ] No panics or unwraps in production code
120
+
- [ ] Circuit breaker prevents cascading failures
121
+
- [ ] Redis connection failures handled gracefully
122
+
- [ ] Integration tests provide >50% coverage
123
+
- [ ] Code review passes for all changes
124
+
- [ ] All changes committed to `feat/production-resilience-improvements` branch
+1490
llms.txt
···
1
+
# skywatch-phash-rs: Comprehensive Codebase Overview
2
+
3
+
## Project Summary
4
+
5
+
**skywatch-phash-rs** is a high-performance, real-time perceptual hash-based image moderation service for Bluesky/ATProto. It monitors Bluesky's Jetstream firehose for posts with images, computes perceptual hashes (aHash), matches them against a configurable set of known harmful image hashes, and automatically applies moderation actions (labels and reports) through the Ozone moderation system.
6
+
7
+
**Technology Stack:**
8
+
- Language: Rust (Edition 2024)
9
+
- Async Runtime: Tokio
10
+
- ATProto Client: Jacquard 0.8.0 (published crates from crates.io)
11
+
- Message Queue: Redis (async, with connection pooling)
12
+
- Image Processing: image crate + image_hasher (aHash/average hash algorithm)
13
+
- Error Handling: miette (diagnostic errors) + thiserror
14
+
- Logging: tracing + tracing-subscriber (structured, JSON-capable)
15
+
16
+
**Current Version:** 0.2.0
17
+
**License:** MIT
18
+
19
+
---
20
+
21
+
## Project Purpose & Key Features
22
+
23
+
### Core Functionality
24
+
1. **Real-time Jetstream Subscription**: Subscribes to Bluesky's firehose, filtering only posts with embedded images
25
+
2. **Perceptual Hash Computation**: Computes 64-bit average hash (aHash) for each image blob (CID)
26
+
3. **Hamming Distance Matching**: Compares computed hashes against configurable rules using Hamming distance thresholds
27
+
4. **Automated Moderation**: Takes configured actions on matches:
28
+
- Apply labels to posts and/or accounts
29
+
- File reports to posts and/or accounts
30
+
- Takedown posts and/or accounts (future capability)
31
+
5. **Resilient Job Processing**: Redis-backed job queue with retry logic and dead-letter handling
32
+
6. **Phash Caching**: Caches computed hashes in Redis to reduce redundant work on viral images
33
+
7. **Deduplication**: Redis-backed claim tracking prevents duplicate moderation actions within 7-day windows
34
+
8. **Metrics & Observability**: Lock-free atomic counters track jobs, blobs, matches, cache performance, and moderation actions
35
+
9. **Graceful Shutdown**: Saves cursor position and logs final metrics on exit
36
+
37
+
### Key Non-Features (Explicitly Handled)
38
+
- Does NOT block Jetstream ingestion on processing delays (decoupled via queue)
39
+
- Does NOT lose jobs mid-processing (Redis persistence)
40
+
- Does NOT duplicate moderation actions (Redis claims + Ozone verification)
41
+
- Does NOT require external authentication servers (Jacquard handles session management)
42
+
43
+
---
44
+
45
+
## Directory Structure
46
+
47
+
```
48
+
skywatch-phash-rs/
49
+
├── src/
50
+
│   ├── main.rs              # Entry point: orchestrates Jetstream, queue, workers, metrics
51
+
│   ├── lib.rs               # Module exports
52
+
│   │
53
+
│   ├── types/mod.rs         # Core data structures
54
+
│   │   ├── BlobCheck        # Rule definition with phashes, threshold, actions
55
+
│   │   ├── BlobReference    # Image blob CID + optional MIME type
56
+
│   │   ├── ImageJob         # Post metadata + blobs for processing
57
+
│   │   └── MatchResult      # Result of phash matching with distance
58
+
│   │
59
+
│   ├── config/mod.rs        # Environment variable configuration (required & optional)
60
+
│   │
61
+
│   ├── jetstream/
62
+
│   │   ├── mod.rs           # JetstreamClient: WebSocket subscriber with retry/failover
63
+
│   │   ├── events.rs        # Event extraction (blob parsing from post records)
64
+
│   │   └── cursor.rs        # Cursor persistence (firehose_cursor.db)
65
+
│   │
66
+
│   ├── processor/
67
+
│   │   ├── mod.rs           # Module exports
68
+
│   │   ├── phash.rs         # Perceptual hash computation (aHash 8x8 -> 16 hex chars)
69
+
│   │   └── matcher.rs       # Blob check matching, blob download, job processing
70
+
│   │
71
+
│   ├── queue/
72
+
│   │   ├── redis_queue.rs   # Redis job queue (pending/processing/dead-letter)
73
+
│   │   └── worker.rs        # Worker pool: job dequeue, process, retry, take moderation actions
74
+
│   │
75
+
│   ├── cache/mod.rs         # Redis phash cache (get_or_compute pattern)
76
+
│   │
77
+
│   ├── moderation/
78
+
│   │   ├── mod.rs           # Module exports
79
+
│   │   ├── post.rs          # Post label/report actions (future: takedown)
80
+
│   │   ├── account.rs       # Account label/report/comment actions (future: takedown)
81
+
│   │   ├── claims.rs        # Redis claim tracking for deduplication
82
+
│   │   ├── helpers.rs       # Shared moderation logic
83
+
│   │   └── rate_limiter.rs  # Rate limit tracking (respect Ozone API limits)
84
+
│   │
85
+
│   ├── agent/
86
+
│   │   ├── mod.rs           # Module exports
87
+
│   │   └── session.rs       # AgentSession: authenticated Jacquard client wrapper
88
+
│   │
89
+
│   ├── plc/mod.rs           # PLC Directory client with endpoint failover
90
+
│   │
91
+
│   └── metrics/mod.rs       # Metrics collector: lock-free atomic counters
92
+
│
93
+
├── rules/
94
+
│   └── blobs.json           # BlobCheck rule definitions (loaded at startup)
95
+
│
96
+
├── Cargo.toml               # Dependencies and build configuration
97
+
├── README.md                # User-facing documentation
98
+
├── ARCHITECTURE.md          # Deep architecture guide (TypeScript context, not perfectly aligned with Rust)
99
+
└── CLAUDE.md                # Project-specific guidelines for Claude Code
100
+
101
+
```
102
+
103
+
---
104
+
105
+
## Core Module Details
106
+
107
+
### 1. main.rs (Entry Point, ~285 lines)
108
+
109
+
**Responsibilities:**
110
+
- Load configuration from environment variables
111
+
- Authenticate with Bluesky/Ozone via Jacquard
112
+
- Initialize Redis queue, cache, metrics
113
+
- Load blob check rules from `rules/blobs.json`
114
+
- Start Jetstream subscriber with auto-retry and failover
115
+
- Spawn job receiver task (Jetstream events -> Redis queue)
116
+
- Spawn worker pool (N concurrent workers processing jobs)
117
+
- Spawn metrics logger (logs stats every 60 seconds)
118
+
- Handle graceful shutdown (Ctrl+C, SIGTERM)
119
+
120
+
**Key Design Patterns:**
121
+
- Jetstream connection with exponential backoff (retry_delay up to max_retry_delay_secs)
122
+
- URL rotation: primary Jetstream URL + fallback URLs
123
+
- Job channel (mpsc) decouples Jetstream from Redis for resilience
124
+
- Worker pool runs as N independent futures (not tokio::spawn to avoid HRTB issues)
125
+
- `tokio::select!` for clean multi-task orchestration
126
+
127
+
**Critical Code Sections:**
128
+
- Lines 88-93: Cursor loading and resumption
129
+
- Lines 98-158: Jetstream retry loop with exponential backoff
130
+
- Lines 160-188: Job receiver task (Jetstream -> Redis)
131
+
- Lines 190-216: Worker pool initialization
132
+
- Lines 233-262: Shutdown coordination with `tokio::select!`
133
+
134
+
**Dependencies:**
135
+
- tokio (async runtime, channels, signals)
136
+
- tracing (structured logging)
137
+
- miette (error context and diagnostics)
138
+
139
+
---
140
+
141
+
### 2. types/mod.rs (Data Structures, ~138 lines)
142
+
143
+
**Core Types:**
144
+
145
+
```rust
146
+
BlobCheck {
147
+
phashes: Vec<CowStr<'static>>, // Known bad image hashes (16 hex chars each)
148
+
label: CowStr<'static>, // Label to apply (e.g., "troll", "spam")
149
+
comment: CowStr<'static>, // Report comment with context
150
+
report_acct: bool, // Report the account?
151
+
label_acct: bool, // Label the account?
152
+
report_post: bool, // Report the post?
153
+
to_label: bool, // Label the post?
154
+
takedown_post: bool, // Takedown post? (default: false)
155
+
takedown_acct: bool, // Takedown account? (default: false)
156
+
hamming_threshold: Option<u32>, // Per-rule threshold (overrides default)
157
+
description: Option<CowStr<'static>>, // Internal documentation
158
+
ignore_did: Option<Vec<Did<'static>>>, // DIDs to exempt from this rule
159
+
}
160
+
161
+
BlobReference {
162
+
cid: Cid<'static>, // Content ID of the blob
163
+
mime_type: Option<CowStr<'static>>, // Optional MIME type (may be missing)
164
+
}
165
+
166
+
ImageJob {
167
+
post_uri: AtUri<'static>, // "at://did/app.bsky.feed.post/rkey"
168
+
post_cid: Cid<'static>, // Post commit CID
169
+
post_did: Did<'static>, // Author DID
170
+
blobs: Vec<BlobReference>, // Embedded images
171
+
timestamp: i64, // Job creation time (millis)
172
+
attempts: u32, // Retry counter
173
+
}
174
+
175
+
MatchResult {
176
+
phash: CowStr<'static>, // Computed hash
177
+
matched_check: BlobCheck, // Matching rule
178
+
matched_phash: CowStr<'static>, // Matched rule's phash
179
+
hamming_distance: u32, // Distance (0-64)
180
+
}
181
+
```
182
+
183
+
**Design Notes:**
184
+
- Uses `CowStr<'static>` from Jacquard for zero-copy serialization (owned in all contexts)
185
+
- Custom deserializers handle Jacquard's type conversions (Did, Cid, AtUri)
186
+
- `ignore_did` field uses serde alias "ignoreDID" for JSON compatibility
187
+
- All deserialized values converted to 'static lifetime for cross-thread safety
188
+
189
+
---
190
+
191
+
### 3. config/mod.rs (Configuration, ~236 lines)
192
+
193
+
**Configuration Hierarchy:**
194
+
195
+
```rust
196
+
Config {
197
+
jetstream: JetstreamConfig {
198
+
url: String, // Primary Jetstream WebSocket URL
199
+
fallback_urls: Vec<String>, // Fallback URLs for resilience
200
+
wanted_collections: Vec<String>, // Always ["app.bsky.feed.post"]
201
+
cursor_update_interval: u64, // Save cursor every N millis (default: 10000)
202
+
retry_delay_secs: u64, // Initial backoff (default: 5)
203
+
max_retry_delay_secs: u64, // Max backoff (default: 300)
204
+
},
205
+
redis: RedisConfig {
206
+
url: String, // Redis connection string
207
+
},
208
+
processing: ProcessingConfig {
209
+
concurrency: usize, // Number of worker threads (default: 10)
210
+
retry_attempts: u32, // Max retries per job (default: 3)
211
+
retry_delay: u64, // Delay between retries in ms (default: 1000)
212
+
},
213
+
cache: CacheConfig {
214
+
enabled: bool, // Enable phash caching? (default: true)
215
+
ttl: u64, // Cache TTL in seconds (default: 86400 = 24h)
216
+
},
217
+
pds: PdsConfig {
218
+
endpoint: String, // PDS endpoint for blob fetch (default: https://bsky.social)
219
+
},
220
+
plc: PlcConfig {
221
+
endpoint: String, // PLC Directory for DID resolution
222
+
fallback_endpoints: Vec<String>, // Fallback PLC endpoints
223
+
},
224
+
automod: AutomodConfig {
225
+
handle: String, // Automod account handle (REQUIRED)
226
+
password: String, // App password (REQUIRED)
227
+
},
228
+
ozone: OzoneConfig {
229
+
url: String, // Ozone URL (REQUIRED, for context only)
230
+
pds: String, // Ozone PDS endpoint (REQUIRED, used in auth)
231
+
},
232
+
moderation: ModerationConfig {
233
+
labeler_did: String, // Labeler DID (REQUIRED)
234
+
rate_limit: u64, // Rate limit delay in ms (default: 100)
235
+
},
236
+
phash: PhashConfig {
237
+
default_hamming_threshold: u32, // Default hamming threshold (default: 3)
238
+
},
239
+
}
240
+
```
241
+
242
+
**Environment Variables:**
243
+
244
+
**Required:**
245
+
- `AUTOMOD_HANDLE` - Bluesky handle for labeler account
246
+
- `AUTOMOD_PASSWORD` - App password (NOT user password)
247
+
- `LABELER_DID` - DID of labeler account
248
+
- `OZONE_URL` - Ozone service URL
249
+
- `OZONE_PDS` - Ozone PDS endpoint (for agent initialization)
250
+
251
+
**Optional (with defaults):**
252
+
- `JETSTREAM_URL` (default: "wss://jetstream.atproto.tools/subscribe")
253
+
- `JETSTREAM_FALLBACK_URLS` (comma-separated, default: fire.hose.cam URLs)
254
+
- `REDIS_URL` (default: "redis://localhost:6379")
255
+
- `PLC_ENDPOINT` (default: "https://plc.directory")
256
+
- `PDS_ENDPOINT` (default: "https://bsky.social")
257
+
- `PROCESSING_CONCURRENCY` (default: 10)
258
+
- `RETRY_ATTEMPTS` (default: 3)
259
+
- `RETRY_DELAY_MS` (default: 1000)
260
+
- `CACHE_ENABLED` (default: true)
261
+
- `CACHE_TTL_SECONDS` (default: 86400)
262
+
- `PHASH_HAMMING_THRESHOLD` (default: 3)
263
+
- `RATE_LIMIT_MS` (default: 100)
264
+
265
+
**Parsing Helpers:**
266
+
- `get_env()` - Read an env var, falling back to a default
267
+
- `get_env_u32/u64/usize()` - Parse numeric with default
268
+
- `get_env_bool()` - Parse "true", "1", "yes" (case-insensitive)
269
+
- `get_env_list()` - Parse comma-separated list
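
Plausible shapes for three of these helpers (a sketch; the real signatures and defaults may differ):

```rust
fn get_env(key: &str, default: &str) -> String {
    std::env::var(key).unwrap_or_else(|_| default.to_string())
}

fn get_env_bool(key: &str, default: bool) -> bool {
    std::env::var(key)
        .map(|v| matches!(v.to_lowercase().as_str(), "true" | "1" | "yes"))
        .unwrap_or(default)
}

fn get_env_list(key: &str, default: &[&str]) -> Vec<String> {
    std::env::var(key)
        .map(|v| v.split(',').map(|s| s.trim().to_string()).collect())
        .unwrap_or_else(|_| default.iter().map(|s| s.to_string()).collect())
}
```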
270
+
271
+
**Load Flow:**
272
+
1. Call `dotenvy::dotenv().ok()` to load `.env` (if exists)
273
+
2. Parse all config sections in `Config::from_env()`
274
+
3. Return miette::Result with diagnostic context on error
275
+
276
+
---
277
+
278
+
### 4. jetstream/mod.rs (WebSocket Subscriber, ~234 lines)
279
+
280
+
**JetstreamClient Structure:**
281
+
282
+
```rust
283
+
pub struct JetstreamClient {
284
+
url: Url,
285
+
cursor: Option<i64>, // Microsecond timestamp from last processed event
286
+
}
287
+
```
288
+
289
+
**Core Method: `subscribe()`**
290
+
- Connects to Jetstream WebSocket using Jacquard's `TungsteniteSubscriptionClient`
291
+
- Configures subscription for "app.bsky.feed.post" creates only
292
+
- Runs main message loop with multi-task coordination:
293
+
- Message timeout: 120 seconds (kill connection if no data)
294
+
- Heartbeat: Log every 30 seconds
295
+
- Cursor update: Save every 10 seconds
296
+
- Shutdown: Graceful exit on broadcast signal
297
+
- Extracts cursor (time_us) from each message for resumption
298
+
- Calls `process_message()` for each commit event
299
+
- Returns `Ok(())` on graceful shutdown, `Err` on connection failure
300
+
301
+
**Message Processing: `process_message()`**
302
+
- Filters: Only processes `Commit` messages with `Create` operation
303
+
- Skips: Non-post collections, updates/deletes, posts without records
304
+
- Extracts blobs: Calls `events::extract_blobs_from_record(record_data)`
305
+
- Creates ImageJob: post_uri, post_cid, post_did, blobs, timestamp, attempts=0
306
+
- Sends to job channel for queueing
307
+
- Silently skips if job channel closed (receiver stopped)
308
+
309
+
**Error Handling:**
310
+
- WebSocket errors: Break loop to allow reconnection
311
+
- Message parse errors: Log and continue
312
+
- Job send errors: Warn but continue (graceful handling)
313
+
314
+
**Dependencies:**
315
+
- tokio (channels, time, select)
316
+
- jacquard_common (Jetstream types, WebSocket client)
317
+
- tracing (logging)
318
+
319
+
---
320
+
321
+
### 5. jetstream/cursor.rs (Cursor Persistence, ~42 lines)
322
+
323
+
**Purpose:** Persist Jetstream cursor to disk for recovery after restart
324
+
325
+
**Key Functions:**
326
+
```rust
327
+
pub fn read_cursor() -> Option<i64>
328
+
// Reads from "firehose_cursor.db"
329
+
// Returns None if file missing or unparseable
330
+
// Logs info on success, warns on error
331
+
332
+
pub fn write_cursor(cursor: i64) -> Result<()>
333
+
// Writes cursor to "firehose_cursor.db"
334
+
// Overwrites existing file
335
+
// Returns miette::Result
336
+
```
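
A minimal sketch of the pair under these semantics (logging and the miette wrapping omitted; `write_cursor` shown with a plain `std::io::Result`):

```rust
pub fn read_cursor() -> Option<i64> {
    // Microsecond timestamp, e.g. "1727712000000000".
    std::fs::read_to_string("firehose_cursor.db")
        .ok()?
        .trim()
        .parse()
        .ok()
}

pub fn write_cursor(cursor: i64) -> std::io::Result<()> {
    // Overwrites any previous cursor; fine for a single writer.
    std::fs::write("firehose_cursor.db", cursor.to_string())
}
```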
337
+
338
+
**Cursor Semantics:**
339
+
- Value is **microsecond timestamp** (NOT millisecond)
340
+
- Obtained from `JetstreamMessage.time_us` field
341
+
- Used to resume subscription at exact position on restart
342
+
- Prevents reprocessing of same posts
343
+
344
+
**File Path:** `./firehose_cursor.db` (relative to working directory)
345
+
346
+
**Integration:**
347
+
- main.rs: Lines 88-93 load cursor before Jetstream connection
348
+
- jetstream/mod.rs: Lines 118-125 save cursor every 10 seconds
349
+
- jetstream/mod.rs: Lines 132-139 save final cursor on shutdown
350
+
351
+
---
352
+
353
+
### 6. processor/phash.rs (Hash Computation, ~156 lines)
354
+
355
+
**Algorithm: Average Hash (aHash) with 8x8 Grid**
356
+
357
+
```rust
358
+
pub fn compute_phash(image_bytes: &[u8]) -> Result<String, PhashError>
359
+
// 1. Load image from bytes using image crate
360
+
// 2. Call compute_phash_from_image()
361
+
// 3. Return 16-character hex string
362
+
363
+
pub fn compute_phash_from_image(img: &DynamicImage) -> Result<String, PhashError>
364
+
// 1. Configure hasher: HashAlg::Mean (average), size 8x8
365
+
// 2. Compute hash via image_hasher
366
+
// 3. Convert to hex string via hash_to_hex()
367
+
// 4. Validate length (must be exactly 16 chars)
368
+
369
+
fn hash_to_hex(hash: &ImageHash) -> Result<String, PhashError>
370
+
// Convert 8 bytes to 16 hex characters (lowercase)
371
+
// Each byte -> 2 hex digits
372
+
// Format: "deadbeefdeadbeef"
373
+
```
374
+
375
+
**Hamming Distance Calculation:**
376
+
377
+
```rust
378
+
pub fn hamming_distance(hash1: &str, hash2: &str) -> Result<u32, PhashError>
379
+
// 1. Validate both hashes are 16 hex chars
380
+
// 2. Parse as u64 from base 16
381
+
// 3. XOR to find differing bits
382
+
// 4. Count set bits using Brian Kernighan's algorithm:
383
+
// while n > 0: count++; n &= n - 1
384
+
// 5. Return count (range 0-64)
385
+
```
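
A concrete version of those steps (error type simplified to `String` here; the real function returns `PhashError`):

```rust
pub fn hamming_distance(hash1: &str, hash2: &str) -> Result<u32, String> {
    if hash1.len() != 16 || hash2.len() != 16 {
        return Err("phash must be exactly 16 hex characters".into());
    }
    let a = u64::from_str_radix(hash1, 16).map_err(|e| e.to_string())?;
    let b = u64::from_str_radix(hash2, 16).map_err(|e| e.to_string())?;

    let mut n = a ^ b; // each set bit is one differing hash bit
    let mut count = 0;
    while n > 0 {
        count += 1;
        n &= n - 1; // clear the lowest set bit (Brian Kernighan)
    }
    Ok(count) // 0..=64
}
```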
386
+
387
+
**Test Coverage:**
388
+
- Identical hashes: distance = 0
389
+
- Completely different: distance = 64
390
+
- Single bit difference: distance = 1
391
+
- Invalid format validation: length, hex parsing
392
+
- Phash format validation: output is 16 hex chars
393
+
394
+
**Design Notes:**
395
+
- Uses image_hasher crate for robust image decoding
396
+
- HashAlg::Mean = average hash (matching TypeScript version)
397
+
- Size 8x8 = 64-bit hash (exactly 16 hex chars)
398
+
- No normalization or preprocessing (raw pixels)
399
+
- Deterministic: same image always produces same hash
400
+
401
+
---
402
+
403
+
### 7. processor/matcher.rs (Rule Matching & Blob Processing, ~299 lines)
404
+
405
+
**Key Functions:**
406
+
407
+
```rust
408
+
pub async fn load_blob_checks(path: &Path) -> Result<Vec<BlobCheck>>
409
+
// Load rules from JSON file
410
+
// Deserialize into Vec<BlobCheck>
411
+
// Log count on success
412
+
413
+
pub async fn download_blob(client: &Client, config: &Config, did: &str, cid: &str) -> Result<Vec<u8>>
414
+
// Try CDN first: https://cdn.bsky.app/img/feed_fullsize/plain/{did}/{cid}@{format}
415
+
// Try formats: jpeg, png, webp (in order)
416
+
// Fall back to PDS if CDN fails
417
+
// PDS URL: {pds_endpoint}/xrpc/com.atproto.sync.getBlob?did={did}&cid={cid}
418
+
// Return raw bytes
419
+
420
+
pub fn match_phash(phash: &str, blob_checks: &[BlobCheck], did: &str, default_threshold: u32) -> Option<MatchResult>
421
+
// For each BlobCheck:
422
+
// - Skip if did in ignore_did list
423
+
// - Get threshold (per-check or default)
424
+
// - For each check's phash:
425
+
// - Compute hamming_distance
426
+
// - If distance <= threshold, return MatchResult (first match wins)
427
+
// Return None if no match
428
+
429
+
pub async fn process_blob(client: &Client, config: &Config, blob_checks: &[BlobCheck], did: &str, blob: &BlobReference) -> Result<Option<MatchResult>>
430
+
// 1. Download blob bytes
431
+
// 2. Compute phash
432
+
// 3. Match against checks
433
+
// 4. Return Option<MatchResult>
434
+
435
+
pub async fn process_image_job(client: &Client, config: &Config, blob_checks: &[BlobCheck], job: &ImageJob) -> Result<Vec<MatchResult>>
436
+
// For each blob in job:
437
+
// - Call process_blob()
438
+
// - Collect matches (skip errors, continue)
439
+
// Return all matches found
440
+
```
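
A sketch of the CDN-then-PDS fallback in `download_blob()`; the URL shapes come from the comments above, while the single-pass (no retry) flow and error type are simplifications:

```rust
use reqwest::Client;

pub async fn download_blob(
    client: &Client,
    pds_endpoint: &str,
    did: &str,
    cid: &str,
) -> Result<Vec<u8>, reqwest::Error> {
    // CDN first: cheaper and faster than hitting the PDS.
    for format in ["jpeg", "png", "webp"] {
        let url = format!("https://cdn.bsky.app/img/feed_fullsize/plain/{did}/{cid}@{format}");
        if let Ok(resp) = client.get(&url).send().await {
            if resp.status().is_success() {
                return Ok(resp.bytes().await?.to_vec());
            }
        }
    }
    // Fall back to the PDS sync endpoint.
    let url = format!("{pds_endpoint}/xrpc/com.atproto.sync.getBlob?did={did}&cid={cid}");
    Ok(client
        .get(&url)
        .send()
        .await?
        .error_for_status()?
        .bytes()
        .await?
        .to_vec())
}
```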
441
+
442
+
**Test Coverage:**
443
+
- Exact phash match: distance = 0
444
+
- Within threshold: distance <= threshold
445
+
- Exceeds threshold: distance > threshold, no match
446
+
- Ignored DIDs: skipped entirely
447
+
- Real rules loading: verifies rules/blobs.json format
448
+
449
+
**Design Notes:**
450
+
- First match wins (no combining multiple rules)
451
+
- Missing MIME type is accepted (optional field)
452
+
- Errors in blob processing don't stop job (continue to next blob)
453
+
- CDN as primary path (faster, reduces PDS load)
454
+
- PDS fallback for unavailable CDN images
455
+
456
+
---
457
+
458
+
### 8. queue/redis_queue.rs (Job Persistence, ~150 lines)
459
+
460
+
**Redis Queue Structure:**
461
+
462
+
```
463
+
PENDING_QUEUE: "jobs:pending" -> List (FIFO for new jobs)
464
+
PROCESSING_QUEUE: "jobs:processing" -> List (active jobs, not used in current impl)
465
+
DEAD_LETTER_QUEUE: "jobs:dead" -> List (jobs exhausted retries)
466
+
```
467
+
468
+
**JobQueue Implementation:**
469
+
470
+
```rust
471
+
pub struct JobQueue {
472
+
redis: redis::aio::MultiplexedConnection,
473
+
max_retries: u32,
474
+
}
475
+
476
+
pub async fn new(config: &Config) -> Result<Self>
477
+
// Connect to Redis via multiplexed connection
478
+
// Load max_retries from config
479
+
480
+
pub async fn push(&mut self, job: &ImageJob) -> Result<()>
481
+
// Serialize job to JSON
482
+
// RPUSH to PENDING_QUEUE
483
+
484
+
pub async fn pop(&mut self, timeout_secs: usize) -> Result<Option<ImageJob>>
485
+
// BLPOP from PENDING_QUEUE with timeout
486
+
// Deserialize JSON to ImageJob
487
+
// Return Option
488
+
489
+
pub async fn retry(&mut self, mut job: ImageJob) -> Result<()>
490
+
// Increment attempts
491
+
// If attempts >= max_retries:
492
+
// - Move to dead-letter via move_to_dead_letter()
493
+
// Else:
494
+
// - Re-push to pending
495
+
496
+
pub async fn move_to_dead_letter(&mut self, job: &ImageJob) -> Result<()>
497
+
// Serialize and RPUSH to DEAD_LETTER_QUEUE
498
+
499
+
pub async fn stats(&mut self) -> Result<QueueStats>
500
+
// Return lengths of all three queues
501
+
```
502
+
503
+
**State Machine:**
504
+
```
505
+
Job created -> RPUSH to PENDING
506
+
Worker BLPOP from PENDING
507
+
Worker processes -> success: discarded
508
+
Worker processes -> error: retry()
509
+
if attempts < max: RPUSH to PENDING
510
+
if attempts >= max: RPUSH to DEAD_LETTER
511
+
```
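
A sketch of the BLPOP-backed `pop()` using the redis crate; BLPOP returns nil on timeout, which the crate maps to `None` (assumes `ImageJob` derives serde `Deserialize`):

```rust
const PENDING_QUEUE: &str = "jobs:pending";

pub async fn pop(
    redis: &mut redis::aio::MultiplexedConnection,
    timeout_secs: usize,
) -> Result<Option<ImageJob>, Box<dyn std::error::Error + Send + Sync>> {
    let popped: Option<(String, String)> = redis::cmd("BLPOP")
        .arg(PENDING_QUEUE)
        .arg(timeout_secs)
        .query_async(redis)
        .await?;
    Ok(match popped {
        Some((_queue, payload)) => Some(serde_json::from_str(&payload)?),
        None => None, // timed out with no job available
    })
}
```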
512
+
513
+
**Design Notes:**
514
+
- Multiplexed connection for async concurrency
515
+
- No PROCESSING_QUEUE used (could be added for transactional safety)
516
+
- JSON serialization (human-readable, debuggable)
517
+
- Timeout on BLPOP (1 second default in worker)
518
+
- Dead-letter for observability (can inspect failed jobs)
519
+
520
+
---
521
+
522
+
### 9. queue/worker.rs (Job Processing, ~200+ lines)
523
+
524
+
**Worker Pool Architecture:**
525
+
526
+
```rust
527
+
pub struct WorkerPool {
528
+
config: Config,
529
+
client: Client,
530
+
agent: AgentSession,
531
+
blob_checks: Vec<BlobCheck>,
532
+
metrics: Metrics,
533
+
rate_limiter: RateLimiter,
534
+
}
535
+
536
+
pub fn new(...) -> Self
537
+
// Create single worker pool instance
538
+
// Config, HTTP client, agent, rules all cloned for sharing
539
+
540
+
pub async fn start(&self, mut queue: JobQueue, mut cache: PhashCache, mut shutdown_rx: broadcast::Receiver<()>) -> Result<()>
541
+
// Main worker loop
542
+
// Each worker runs independently (tokio::select!)
543
+
```
544
+
545
+
**Processing Loop (per worker):**
546
+
```
547
+
1. Select from:
548
+
- Shutdown signal -> break
549
+
- Queue pop (1 second timeout) -> if Some(job):
550
+
a. For each blob in job:
551
+
- Check cache (get_or_compute pattern)
552
+
- If cache miss: download + phash
553
+
- Match against rules
554
+
- If match: execute moderation actions
555
+
b. Track metrics (blobs, matches, etc)
556
+
c. On success: remove from queue (implicit)
557
+
d. On error: call retry()
558
+
2. Continue loop
559
+
```
560
+
561
+
**Moderation Action Execution:**
562
+
```
563
+
For each match found:
564
+
- If to_label: create post label (with claim check)
565
+
- If report_post: create post report
566
+
- If label_acct: create account label (with claim check)
567
+
- If report_acct: create account report
568
+
- If takedown_post: takedown post (future)
569
+
- If takedown_acct: takedown account (future)
570
+
```
571
+
572
+
**Rate Limiting Integration:**
573
+
- RateLimiter wraps moderation actions
574
+
- Enforces delay between actions (config.moderation.rate_limit)
575
+
- Prevents overwhelming Ozone API
576
+
577
+
**Error Handling:**
578
+
- Blob download error: retry job
579
+
- Phash computation error: retry job
580
+
- Moderation action error: log and continue (don't retry)
581
+
- Queue error: continue to next iteration
582
+
583
+
**Design Notes:**
584
+
- Each worker owns its own queue and cache connections
585
+
- No lock contention (workers are independent)
586
+
- Shutdown via broadcast receiver (all workers stop together)
587
+
- Redis client created per select! iteration (necessary for multiplexed connection reuse)
588
+
- Metrics are thread-safe (Arc<AtomicU64>)
589
+
590
+
---
591
+
592
+
### 10. cache/mod.rs (Phash Caching, ~120 lines)
593
+
594
+
**Redis Phash Cache:**
595
+
596
+
```
597
+
Key Pattern: "phash:{cid}"
598
+
Value: hex hash string (16 chars)
599
+
TTL: config.cache.ttl (default: 86400 = 24 hours)
600
+
```
601
+
602
+
**PhashCache Structure:**
603
+
604
+
```rust
605
+
pub struct PhashCache {
606
+
redis: redis::aio::MultiplexedConnection,
607
+
ttl: u64,
608
+
enabled: bool,
609
+
}
610
+
611
+
pub async fn new(config: &Config) -> Result<Self>
612
+
// Connect to Redis
613
+
// Store ttl and enabled flag
614
+
615
+
pub async fn get(&mut self, cid: &str) -> Result<Option<String>>
616
+
// If !enabled: return Ok(None)
617
+
// GET from "phash:{cid}"
618
+
// Log cache hit/miss
619
+
620
+
pub async fn set(&mut self, cid: &str, phash: &str) -> Result<()>
621
+
// If !enabled: return Ok(())
622
+
// SET with EX (expire time)
623
+
// Log cached entry
624
+
625
+
pub async fn delete(&mut self, cid: &str) -> Result<()>
626
+
// If !enabled: return Ok(())
627
+
// DEL "phash:{cid}"
628
+
629
+
pub fn is_enabled(&self) -> bool
630
+
// Return enabled flag
631
+
632
+
pub async fn get_or_compute<F, Fut>(&mut self, cid: &str, compute_fn: F) -> Result<String>
633
+
// Check cache
634
+
// If hit: return
635
+
// If miss: call compute_fn()
636
+
// Set cache with result
637
+
// Return result
638
+
```
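
The `get_or_compute()` control flow in miniature, over a toy in-memory map rather than the Redis connection above (TTL and the `enabled` gate omitted):

```rust
use std::collections::HashMap;
use std::future::Future;

// Toy in-memory stand-in for the Redis-backed cache, to show the control flow.
struct ToyCache(HashMap<String, String>);

impl ToyCache {
    async fn get_or_compute<F, Fut, E>(&mut self, cid: &str, compute_fn: F) -> Result<String, E>
    where
        F: FnOnce() -> Fut,
        Fut: Future<Output = Result<String, E>>,
    {
        if let Some(hit) = self.0.get(cid) {
            return Ok(hit.clone()); // cache hit: skip download + hashing entirely
        }
        let phash = compute_fn().await?; // cache miss: do the expensive work once
        self.0.insert(cid.to_string(), phash.clone());
        Ok(phash)
    }
}
```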
639
+
640
+
**Performance Characteristics:**
641
+
- Cache hit rate: 20-40% typical (viral images)
642
+
- TTL: 24 hours (prevents stale hashes)
643
+
- Fallback: If disabled, computed fresh each time
644
+
- No memory limit (Redis-managed)
645
+
646
+
**Design Notes:**
647
+
- Optional feature (config.cache.enabled = true/false)
648
+
- Zero-copy: strings passed by reference
649
+
- Fail-open: cache errors logged but don't break processing
650
+
- get_or_compute() pattern reduces boilerplate in workers
651
+
652
+
---
653
+
654
+
### 11. moderation/ (Action Execution)
655
+
656
+
**Module Structure:**
657
+
658
+
```
659
+
moderation/
660
+
├── mod.rs             # Exports
661
+
├── post.rs            # Post label/report actions
662
+
├── account.rs         # Account label/report/comment actions
663
+
├── claims.rs          # Redis deduplication claims
664
+
├── rate_limiter.rs    # Rate limit enforcement
665
+
└── helpers.rs         # Shared utilities
666
+
```
667
+
668
+
**Post Actions: post.rs**
669
+
670
+
```rust
671
+
pub async fn create_post_label(
672
+
agent: &Arc<Agent<MemoryCredentialSession>>,
673
+
labeler_did: &str,
674
+
post_uri: &str,
675
+
post_cid: &str,
676
+
label: &str,
677
+
comment: &str,
678
+
phash: &str,
679
+
distance: u32,
680
+
) -> Result<()>
681
+
// Emit mod event via Ozone:
682
+
// - $type: "tools.ozone.moderation.defs#modEventLabel"
683
+
// - subject: strongRef {uri, cid}
684
+
// - createLabelVals: [label]
685
+
// - comment: "{timestamp}: {comment} at {uri} with phash \"{phash}\" (distance={distance})"
686
+
687
+
pub async fn create_post_report(
688
+
agent: &Arc<Agent<MemoryCredentialSession>>,
689
+
labeler_did: &str,
690
+
post_uri: &str,
691
+
post_cid: &str,
692
+
label: &str,
693
+
comment: &str,
694
+
phash: &str,
695
+
) -> Result<()>
696
+
// Emit mod event via Ozone:
697
+
// - $type: "tools.ozone.moderation.defs#modEventReport"
698
+
// - subject: strongRef {uri, cid}
699
+
// - reportType: "com.atproto.moderation.defs#reasonOther"
700
+
// - comment: "{timestamp}: {comment} at {uri} with phash \"{phash}\""
701
+
```
702
+
703
+
**Account Actions: account.rs**
704
+
705
+
```rust
706
+
pub async fn create_account_label(
707
+
agent: &Arc<Agent<MemoryCredentialSession>>,
708
+
labeler_did: &str,
709
+
did: &str,
710
+
label: &str,
711
+
comment: &str,
712
+
) -> Result<()>
713
+
// Emit mod event:
714
+
// - $type: "tools.ozone.moderation.defs#modEventLabel"
715
+
// - subject: repoRef {did}
716
+
// - createLabelVals: [label]
717
+
// - comment: "{timestamp}: {comment} for account {did}"
718
+
719
+
pub async fn create_account_report(
720
+
agent: &Arc<Agent<MemoryCredentialSession>>,
721
+
labeler_did: &str,
722
+
did: &str,
723
+
label: &str,
724
+
comment: &str,
725
+
) -> Result<()>
726
+
// Emit mod event:
727
+
// - $type: "tools.ozone.moderation.defs#modEventReport"
728
+
// - subject: repoRef {did}
729
+
// - reportType: "com.atproto.moderation.defs#reasonOther"
730
+
// - comment: "{timestamp}: {comment} for account {did}"
731
+
732
+
pub async fn create_account_comment(
733
+
agent: &Arc<Agent<MemoryCredentialSession>>,
734
+
labeler_did: &str,
735
+
did: &str,
736
+
comment: &str,
737
+
) -> Result<()>
738
+
// Emit mod event:
739
+
// - $type: "tools.ozone.moderation.defs#modEventComment"
740
+
// - subject: repoRef {did}
741
+
// - comment: "{timestamp}: {comment}"
742
+
```
743
+
744
+
**Claims: claims.rs (Deduplication)**
745
+
746
+
```rust
747
+
pub async fn try_claim_post_label(
748
+
redis: &mut redis::aio::MultiplexedConnection,
749
+
uri: &str,
750
+
label: &str,
751
+
) -> Result<bool>
752
+
// Key: "claim:post:label:{uri}:{label}"
753
+
// SET {key} "1" NX EX 604800 (7 days)
754
+
// Return true if SET succeeded (claim acquired)
755
+
756
+
pub async fn try_claim_account_label(
757
+
redis: &mut redis::aio::MultiplexedConnection,
758
+
did: &str,
759
+
label: &str,
760
+
) -> Result<bool>
761
+
// Key: "claim:account:label:{did}:{label}"
762
+
// SET {key} "1" NX EX 604800
763
+
// Return true if SET succeeded
764
+
765
+
pub async fn has_been_claimed_recently(
766
+
redis: &mut redis::aio::MultiplexedConnection,
767
+
claim_key: &str,
768
+
) -> Result<bool>
769
+
// GET claim_key
770
+
// Return true if exists
771
+
```
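
A sketch of the atomic claim with the redis command builder; `SET ... NX EX` replies nil when the key already exists, so the `Option` doubles as the acquired flag:

```rust
pub async fn try_claim_post_label(
    redis: &mut redis::aio::MultiplexedConnection,
    uri: &str,
    label: &str,
) -> redis::RedisResult<bool> {
    let key = format!("claim:post:label:{uri}:{label}");
    let acquired: Option<String> = redis::cmd("SET")
        .arg(&key)
        .arg("1")
        .arg("NX")
        .arg("EX")
        .arg(604_800) // 7 days
        .query_async(redis)
        .await?;
    Ok(acquired.is_some()) // Some("OK") only if we created the key
}
```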
772
+
773
+
**Rate Limiter: rate_limiter.rs**
774
+
775
+
```rust
776
+
pub struct RateLimiter {
777
+
delay_ms: u64,
778
+
last_action: Arc<Mutex<Instant>>,
779
+
}
780
+
781
+
pub async fn limit<F, Fut>(&self, action: F) -> Result<Fut::Output>
782
+
where
783
+
F: FnOnce() -> Fut,
784
+
Fut: Future,
785
+
// Wait if needed to enforce rate limit
786
+
// Execute action
787
+
// Update last_action timestamp
788
+
```
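
A sketch of `limit()` with tokio's async `Mutex`; holding the lock across the sleep is what serializes actions (the `Result` wrapper from the signature above is dropped for brevity):

```rust
use std::future::Future;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::Mutex;

pub struct RateLimiter {
    delay_ms: u64,
    last_action: Arc<Mutex<Instant>>,
}

impl RateLimiter {
    pub async fn limit<F, Fut>(&self, action: F) -> Fut::Output
    where
        F: FnOnce() -> Fut,
        Fut: Future,
    {
        let mut last = self.last_action.lock().await;
        let min_gap = Duration::from_millis(self.delay_ms);
        if let Some(remaining) = min_gap.checked_sub(last.elapsed()) {
            tokio::time::sleep(remaining).await; // enforce the minimum gap
        }
        let output = action().await;
        *last = Instant::now();
        output
    }
}
```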
789
+
790
+
---
791
+
792
+
### 12. metrics/mod.rs (Observability, ~353 lines)
793
+
794
+
**Metrics Tracker:**
795
+
796
+
```rust
797
+
pub struct Metrics {
798
+
inner: Arc<MetricsInner>, // All fields are Arc<AtomicU64>
799
+
}
800
+
801
+
struct MetricsInner {
802
+
// Jobs
803
+
jobs_received: AtomicU64, // From Jetstream
804
+
jobs_processed: AtomicU64, // Completed (success or fail)
805
+
jobs_failed: AtomicU64, // Failed at all retries
806
+
jobs_retried: AtomicU64, // Retried (attempts > 0)
807
+
808
+
// Blobs
809
+
blobs_processed: AtomicU64, // Hashed (success)
810
+
blobs_downloaded: AtomicU64, // Downloaded from CDN/PDS
811
+
812
+
// Matches
813
+
matches_found: AtomicU64, // Phashes matched rules
814
+
815
+
// Cache
816
+
cache_hits: AtomicU64, // Cached phash used
817
+
cache_misses: AtomicU64, // Phash not cached, computed
818
+
819
+
// Moderation
820
+
posts_labeled: AtomicU64, // Post labels created
821
+
posts_reported: AtomicU64, // Post reports created
822
+
accounts_labeled: AtomicU64, // Account labels created
823
+
accounts_reported: AtomicU64, // Account reports created
824
+
825
+
// Skipped (deduplication)
826
+
posts_already_labeled: AtomicU64,
827
+
posts_already_reported: AtomicU64,
828
+
accounts_already_labeled: AtomicU64,
829
+
accounts_already_reported: AtomicU64,
830
+
}
831
+
```
832
+
833
+
**Key Methods:**
834
+
- `inc_*()`: Atomic increment
835
+
- Getters: Load current value
836
+
- `log_stats()`: Log all metrics (called every 60 seconds + on shutdown)
837
+
- `cache_hit_rate()`: Calculate percentage
838
+
- `snapshot()`: Immutable snapshot for reporting
839
+
840
+
**Lock-Free Design:**
841
+
- All operations use `AtomicU64` with `Ordering::Relaxed`
842
+
- No mutexes (no contention on increments)
843
+
- Multiple workers can update simultaneously
844
+
- Consistent snapshot possible via `.snapshot()`
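
The whole pattern in miniature, with a single counter standing in for the full `MetricsInner` struct:

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

#[derive(Clone, Default)]
pub struct Metrics {
    jobs_received: Arc<AtomicU64>,
}

impl Metrics {
    pub fn inc_jobs_received(&self) {
        // Relaxed is sufficient: counters only feed periodic log lines,
        // they never synchronize other memory.
        self.jobs_received.fetch_add(1, Ordering::Relaxed);
    }

    pub fn jobs_received(&self) -> u64 {
        self.jobs_received.load(Ordering::Relaxed)
    }
}
```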
845
+
846
+
**Logged Every 60 Seconds:**
847
+
```
848
+
Jobs: received=X, processed=Y, failed=Z, retried=W
849
+
Blobs: processed=X, downloaded=Y
850
+
Matches: found=X
851
+
Cache: hits=X, misses=Y, hit_rate=Z%
852
+
Moderation: posts_labeled=X, posts_reported=Y, accounts_labeled=Z, accounts_reported=W
853
+
Skipped (deduplication): posts_already_labeled=X, posts_already_reported=Y, accounts_already_labeled=Z, accounts_already_reported=W
854
+
```
855
+
856
+
---
857
+
858
+
### 13. agent/session.rs (Authentication)
859
+
860
+
**AgentSession Wrapper:**
861
+
862
+
```rust
863
+
pub struct AgentSession {
864
+
agent: Arc<Agent<MemoryCredentialSession>>, // Jacquard client
865
+
did: Arc<str>, // Authenticated DID
866
+
}
867
+
868
+
pub async fn new(config: &Config) -> Result<Self>
869
+
// Create MemoryCredentialSession::authenticated()
870
+
// Pass handle, password, ozone.pds
871
+
// Extract did and create Arc<Agent>
872
+
// Return AgentSession
873
+
874
+
pub fn agent(&self) -> &Arc<Agent<MemoryCredentialSession>>
875
+
// Get reference to Jacquard agent
876
+
877
+
pub fn did(&self) -> &str
878
+
// Get authenticated DID
879
+
```
880
+
881
+
**Session Management:**
882
+
- Jacquard handles token refresh internally
883
+
- MemoryCredentialSession stores tokens in memory (no file I/O)
884
+
- No manual token refresh needed (transparent)
885
+
- Credentials passed once at initialization
886
+
887
+
**Thread Safety:**
888
+
- Agent wrapped in Arc (shareable across threads)
889
+
- All internal types use 'static lifetime
890
+
- Clone is cheap (Arc clone)
891
+
892
+
---
893
+
894
+
### 14. plc/mod.rs (DID Resolution, ~130 lines)
895
+
896
+
**PLC Directory Client with Failover:**
897
+
898
+
```rust
899
+
pub struct PlcClient {
900
+
client: Client,
901
+
endpoints: Vec<String>, // Primary + fallbacks
902
+
}
903
+
904
+
pub fn new(client: Client, config: &PlcConfig) -> Self
905
+
// Combine primary + fallback endpoints
906
+
// Store as vector for round-robin
907
+
908
+
pub async fn resolve_did(&self, did: &str) -> Result<DidDocument>
909
+
// For each endpoint:
910
+
// - GET {endpoint}/{did}
911
+
// - Parse JSON to DidDocument
912
+
// - On success: return (log fallback usage if idx > 0)
913
+
// - On error: continue to next endpoint
914
+
// If all fail: return error with last error
915
+
916
+
pub async fn get_pds_endpoint(&self, did: &str) -> Result<String>
917
+
// Call resolve_did()
918
+
// Find service with type "AtprotoPersonalDataServer"
919
+
// Return serviceEndpoint URL
920
+
// Error if not found
921
+
```
922
+
923
+
**DidDocument Structure:**
924
+
925
+
```rust
926
+
pub struct DidDocument {
927
+
pub id: String,
928
+
pub also_known_as: Vec<String>,
929
+
pub service: Vec<ServiceEndpoint>,
930
+
}
931
+
932
+
pub struct ServiceEndpoint {
933
+
pub id: String,
934
+
pub service_type: String,
935
+
pub service_endpoint: String,
936
+
}
937
+
```
938
+
939
+
**Design Notes:**
940
+
- Automatic failover on network/parsing errors
941
+
- Logs when fallback succeeds (operational visibility)
942
+
- Intended for future DID -> PDS resolution
943
+
- Currently not used in main processing (PDS endpoint from config)
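
A sketch of the failover loop (free-function form, simplified error type; assumes the `DidDocument` above derives serde `Deserialize`):

```rust
use reqwest::Client;

pub async fn resolve_did(
    client: &Client,
    endpoints: &[String],
    did: &str,
) -> Result<DidDocument, String> {
    let mut last_err = String::from("no PLC endpoints configured");
    for (idx, endpoint) in endpoints.iter().enumerate() {
        // GET {endpoint}/{did}, e.g. https://plc.directory/did:plc:xyz
        match client.get(format!("{endpoint}/{did}")).send().await {
            Ok(resp) => match resp.json::<DidDocument>().await {
                Ok(doc) => {
                    if idx > 0 {
                        tracing::info!(%endpoint, "resolved DID via fallback PLC endpoint");
                    }
                    return Ok(doc);
                }
                Err(e) => last_err = e.to_string(),
            },
            Err(e) => last_err = e.to_string(),
        }
    }
    Err(last_err)
}
```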
944
+
945
+
---
946
+
947
+
### 15. rules/blobs.json (Rule Configuration)
948
+
949
+
**Current Rules (4 rules):**
950
+
951
+
```json
952
+
[
953
+
{
954
+
"phashes": ["07870707...", "d9794408...", ...],
955
+
"label": "troll",
956
+
"comment": "Image is used in harassment campaign",
957
+
"reportAcct": false,
958
+
"labelAcct": true,
959
+
"reportPost": false,
960
+
"toLabel": true,
961
+
"hammingThreshold": 1,
962
+
"description": "Will Stancil Harassment Memes",
963
+
"ignoreDID": ["did:plc:7umvpuxe2vbrc3zrzuquzniu"]
964
+
},
965
+
{
966
+
"phashes": ["00fffd7c...", "ffbf8f83...", ...],
967
+
"label": "maga-trump",
968
+
"comment": "Pro-trump imagery",
969
+
"reportAcct": true,
970
+
"labelAcct": false,
971
+
"reportPost": false,
972
+
"toLabel": true,
973
+
"hammingThreshold": 3,
974
+
"description": "Sample harassment image variants"
975
+
},
976
+
...
977
+
]
978
+
```
979
+
980
+
**Rule Fields:**
981
+
- `phashes`: Array of 16-char hex hashes to match
982
+
- `label`: Label to apply (e.g., "troll", "spam", "csam")
983
+
- `comment`: Description for audit trail
984
+
- `reportAcct`: Report the account
985
+
- `labelAcct`: Label the account
986
+
- `reportPost`: Report the post
987
+
- `toLabel`: Label the post
988
+
- `hammingThreshold`: Max hamming distance for match (overrides global default)
989
+
- `description`: Internal documentation (not used)
990
+
- `ignoreDID`: Optional array of DIDs to exempt from this rule
991
+
992
+
**Matching Logic:**
993
+
1. For each rule in order:
994
+
- Skip if post author DID in `ignoreDID`
995
+
- Get threshold (per-rule or global default)
996
+
- For each rule's phash:
997
+
- Compute hamming_distance with computed phash
998
+
- If distance <= threshold: Match found, execute actions
999
+
2. First match wins (no combining multiple rules)
1000
+
1001
+
---
1002
+
1003
+
## Data Flow & Workflow
1004
+
1005
+
### High-Level Process
1006
+
1007
+
```
1008
+
1. JETSTREAM INGEST (main.rs:98-158)
1009
+
Jetstream WebSocket -> posts with images
1010
+
↓ (extract blobs)
1011
+
ImageJob created with post_uri, post_cid, post_did, blobs, timestamp, attempts=0
1012
+
↓ (send)
1013
+
Job channel (mpsc, unbounded)
1014
+
1015
+
2. JOB QUEUING (main.rs:160-188)
1016
+
Job receiver task polls from channel
1017
+
↓ (serialize)
1018
+
Redis RPUSH to "jobs:pending"
1019
+
↓
1020
+
Metrics: inc_jobs_received()
1021
+
1022
+
3. WORKER PROCESSING (worker.rs:91+)
1023
+
N concurrent workers run independently
1024
+
Each worker:
1025
+
- BLPOP from "jobs:pending" with 1s timeout
1026
+
- For each blob in job:
1027
+
a. Check phash cache ("phash:{cid}")
1028
+
↓ (if miss)
1029
+
b. Download blob (CDN first, fall back to PDS)
1030
+
↓
1031
+
c. Compute phash (aHash 8x8 -> 16 hex)
1032
+
↓
1033
+
d. Cache phash in Redis (24h TTL)
1034
+
↓
1035
+
e. Match against rules (hamming_distance <= threshold)
1036
+
↓ (if match found)
1037
+
f. Execute moderation actions:
1038
+
- Check claims (deduplication, 7-day TTL)
1039
+
- Create post/account labels or reports
1040
+
- Rate-limited (config.moderation.rate_limit)
1041
+
↓
1042
+
- Update metrics (matches, labels, reports, etc)
1043
+
- On error: call retry() -> re-push to "jobs:pending"
1044
+
- On max retries: move to "jobs:dead"
1045
+
1046
+
4. METRICS LOGGING (main.rs:220-229)
1047
+
Every 60 seconds: Metrics::log_stats()
1048
+
Outputs: jobs, blobs, matches, cache, moderation, skipped
1049
+
1050
+
5. GRACEFUL SHUTDOWN (main.rs:233-284)
1051
+
Ctrl+C or SIGTERM
1052
+
↓
1053
+
Send shutdown signal to all tasks
1054
+
↓
1055
+
Workers finish current jobs and exit
1056
+
↓
1057
+
Jetstream client writes final cursor
1058
+
↓
1059
+
Log final metrics
1060
+
↓
1061
+
Exit
1062
+
```
1063
+
1064
+
### Error Handling Strategy
1065
+
1066
+
**Level 1: Blob Processing**
1067
+
- Download error: retry job (up to max_retries)
1068
+
- Phash computation error: retry job
1069
+
- Image decode error: log and continue (next blob)
1070
+
1071
+
**Level 2: Rule Matching**
1072
+
- Hamming distance error: log and continue
1073
+
- No error if no match found (normal case)
1074
+
1075
+
**Level 3: Moderation Actions**
1076
+
- Label/report API errors: log but don't retry job
1077
+
- Rate limit respected (wait before action)
1078
+
- Claim check may skip action (already done in 7 days)
1079
+
1080
+
**Level 4: Queue Management**
1081
+
- Job pushed successfully: tracked in metrics
1082
+
- Job failed after max retries: moved to dead-letter
1083
+
- Dead-letter jobs observable via Redis (debugging)
1084
+
1085
+
---
1086
+
1087
+
## Testing Structure
1088
+
1089
+
**Unit Tests Included:**
1090
+
1091
+
1. **config/mod.rs**
1092
+
- `test_get_env_bool()` - Boolean parsing
1093
+
- `test_get_env_u32()` - U32 parsing
1094
+
1095
+
2. **processor/phash.rs**
1096
+
- `test_hamming_distance_identical()` - Same hash = 0
1097
+
- `test_hamming_distance_different()` - Opposite hash = 64
1098
+
- `test_hamming_distance_one_bit()` - One bit diff = 1
1099
+
- `test_hamming_distance_invalid_length()` - Validation
1100
+
- `test_hamming_distance_invalid_hex()` - Hex validation
1101
+
- `test_phash_format()` - Output format (16 chars, valid hex)
1102
+
1103
+
3. **processor/matcher.rs**
1104
+
- `test_match_phash_exact()` - Exact match
1105
+
- `test_match_phash_within_threshold()` - Within threshold
1106
+
- `test_match_phash_exceeds_threshold()` - Exceeds threshold
1107
+
- `test_match_phash_ignored_did()` - DID exemption
1108
+
- `test_load_real_rules()` - Load rules/blobs.json
1109
+
1110
+
4. **metrics/mod.rs**
1111
+
- `test_metrics_increment()` - Atomic increment
1112
+
- `test_cache_hit_rate()` - Hit rate calculation
1113
+
- `test_metrics_snapshot()` - Snapshot consistency
1114
+
1115
+
5. **plc/mod.rs**
1116
+
- `test_plc_resolve()` - Real PLC resolution (requires network)
1117
+
1118
+
**Integration Tests (Require Redis):**
1119
+
- cache/mod.rs - Cache get/set/delete operations
1120
+
1121
+
**Run Tests:**
1122
+
```bash
1123
+
# All unit tests
1124
+
cargo test
1125
+
1126
+
# Specific test
1127
+
cargo test test_hamming_distance_identical
1128
+
1129
+
# Show output
1130
+
cargo test -- --nocapture
1131
+
1132
+
# Ignored tests (network-dependent)
1133
+
cargo test -- --ignored
1134
+
```
1135
+
1136
+
---
1137
+
1138
+
## Key Design Decisions & Rationale
1139
+
1140
+
### 1. Redis for Persistence
1141
+
- **Why:** Durability across process crashes, distributed state
1142
+
- **Trade-off:** Adds Redis dependency (no in-memory fallback)
1143
+
- **Mitigation:** Dead-letter queue preserves failed jobs for inspection
1144
+
1145
+
### 2. Job Queue Decouples Ingestion
1146
+
- **Why:** Jetstream can be very fast, workers may lag
1147
+
- **How:** mpsc channel -> Redis queue -> worker pool
1148
+
- **Benefit:** Backpressure natural (Redis queue grows if workers slow)
1149
+
1150
+
### 3. Phash Caching (Redis)
1151
+
- **Why:** Viral images processed multiple times, compute cost high
1152
+
- **TTL:** 24 hours (balance between freshness and hit rate)
1153
+
- **Hit Rate:** 20-40% typical (good ROI for cost)
1154
+
1155
+
### 4. Claims Deduplication
- **Why:** Prevent duplicate moderation actions within 7 days
- **How:** Redis SET ... NX (atomic acquire)
- **Check:** Verify the label still exists in Ozone (race-condition safety)
- **Trade-off:** May skip legitimate re-moderation within 7 days

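A minimal sketch of the atomic acquire, assuming a hypothetical key shape like `claim:{did}:{label}` (the real logic lives in `moderation/claims.rs`):

```rust
use redis::AsyncCommands;

/// Try to acquire a 7-day moderation claim. SET NX EX is a single atomic
/// command, so two concurrent workers cannot both win.
async fn try_claim(
    conn: &mut redis::aio::MultiplexedConnection,
    key: &str, // e.g. a hypothetical "claim:{did}:{label}"
) -> redis::RedisResult<bool> {
    const SEVEN_DAYS_SECS: u64 = 7 * 24 * 60 * 60;
    let opts = redis::SetOptions::default()
        .conditional_set(redis::ExistenceCheck::NX)
        .with_expiration(redis::SetExpiry::EX(SEVEN_DAYS_SECS));
    let reply: Option<String> = conn.set_options(key, "1", opts).await?;
    Ok(reply.is_some()) // Some("OK") = claim acquired; None = already claimed
}
```
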
### 5. Worker Pool Pattern
- **Design:** N independent workers, not tokio::spawn (HRTB issue)
- **Concurrency:** Multiplexed Redis connections (no lock contention)
- **Shutdown:** Broadcast receiver stops all workers together

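The shutdown mechanism in miniature (same pattern as `queue/worker.rs`): each worker selects on a broadcast receiver next to its job polling, so one `send(())` stops the whole pool:

```rust
use tokio::sync::broadcast;

async fn worker_loop(mut shutdown_rx: broadcast::Receiver<()>) {
    loop {
        tokio::select! {
            _ = shutdown_rx.recv() => {
                tracing::info!("Worker received shutdown signal");
                break;
            }
            _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
                // stand-in for queue.pop(...) + process_job(...)
            }
        }
    }
}
```
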
### 6. Per-Rule Hamming Threshold
- **Why:** Different rule types need different sensitivity
- **Example:** Exact harassment memes (threshold 1) vs looser CSAM detection (threshold 5)
- **Default:** 5, overridable via PHASH_HAMMING_THRESHOLD

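First-match-wins with per-rule thresholds and exemptions, sketched with illustrative field names (the real rule type is `BlobCheck` in `src/types.rs`):

```rust
use crate::processor::phash::hamming_distance;

// Illustrative rule shape; field names follow rules/blobs.json loosely.
struct Rule {
    label: String,
    phashes: Vec<String>,
    threshold: u32,
    ignore_dids: Vec<String>,
}

/// Rules are evaluated in order; the first hit wins and later rules never run.
fn first_match<'a>(rules: &'a [Rule], did: &str, phash: &str) -> Option<&'a Rule> {
    rules.iter().find(|rule| {
        // Per-rule DID exemption, not a global blocklist
        if rule.ignore_dids.iter().any(|d| d == did) {
            return false;
        }
        // Inclusive comparison: distance <= threshold
        rule.phashes.iter().any(|known| {
            matches!(hamming_distance(phash, known), Ok(d) if d <= rule.threshold)
        })
    })
}
```
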
### 7. Cursor Persistence
- **Why:** Resume from the exact position after restart
- **Format:** Microsecond timestamp (not millisecond)
- **Frequency:** Every 10 seconds + on shutdown
- **File:** firehose_cursor.db (working directory)

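A sketch of the flush policy, assuming the cursor is kept in a shared `AtomicU64` (the real implementation lives in the jetstream module):

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

/// Flush the latest Jetstream cursor (microseconds!) to disk every 10s.
/// Shutdown performs one final flush before exit.
async fn cursor_flush_loop(cursor_us: Arc<AtomicU64>) {
    let mut tick = tokio::time::interval(std::time::Duration::from_secs(10));
    loop {
        tick.tick().await;
        let cursor = cursor_us.load(Ordering::Relaxed);
        if let Err(e) = tokio::fs::write("firehose_cursor.db", cursor.to_string()).await {
            tracing::warn!("Failed to persist cursor: {}", e);
        }
    }
}
```
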
### 8. Jetstream Failover
- **Primary:** wss://jetstream.atproto.tools/subscribe
- **Fallbacks:** fire.hose.cam URLs (different provider)
- **Strategy:** Exponential backoff with a hard cap (see Gotcha 10: 5s doubling up to 300s)

### 9. Blob Download Fallback
- **Primary:** CDN (cdn.bsky.app/img/feed_fullsize)
- **Secondary:** PDS (com.atproto.sync.getBlob)
- **Why:** The CDN is faster and reduces PDS load

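The two URL shapes involved, taken from `processor/matcher.rs`:

```rust
/// The exact URL shapes from processor/matcher.rs: CDN first (per image
/// format), then the owning PDS via com.atproto.sync.getBlob.
fn blob_urls(pds_endpoint: &str, did: &str, cid: &str, format: &str) -> (String, String) {
    let cdn_url = format!(
        "https://cdn.bsky.app/img/feed_fullsize/plain/{}/{}@{}",
        did, cid, format // format is tried as "jpeg", "png", then "webp"
    );
    let pds_url = format!(
        "{}/xrpc/com.atproto.sync.getBlob?did={}&cid={}",
        pds_endpoint, did, cid
    );
    (cdn_url, pds_url)
}
```
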
### 10. Rate Limiting
- **Purpose:** Respect Ozone API quotas
- **Mechanism:** Delay before each moderation action
- **Config:** rate_limit_ms (default 100)
- **Future:** Track RateLimit headers from Ozone responses

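The delay is built on the `governor` crate, as in `moderation/rate_limiter.rs`: a quota of one request per `rate_limit_ms`, awaited before each action. Condensed:

```rust
use governor::{Quota, RateLimiter};
use std::time::Duration;

async fn rate_limited_action() {
    // rate_limit_ms = 100 => at most one moderation action per 100ms (10/s)
    let quota = Quota::with_period(Duration::from_millis(100)).expect("non-zero period");
    let limiter = RateLimiter::direct(quota);

    limiter.until_ready().await; // returns immediately while under quota
    // ... emit the moderation event here ...
}
```
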
---

## Performance Characteristics

### Throughput
- Jetstream: on the order of thousands of posts/second (ingestion itself is not throttled)
- Workers: 10 workers × 1-5 blobs/second = 10-50 images/second
- Bottleneck: Network I/O (blob download), not hashing

### Latency
- Jetstream event -> job enqueue: <100ms
- Job dequeue -> phash computed: 200-500ms (network dependent)
- Phash -> moderation action: ~100ms (rate-limited)
- Total end-to-end: 300-700ms per image

### Memory
- Minimal (no large buffers held)
- Arc cloning for data sharing
- Metrics: lock-free atomics
- Config loaded once at startup

### CPU
- Phash computation: ~1-5ms per image (image_hasher)
- Hamming distance: <1µs (bitwise operations)
- Not CPU-bound

### Disk
- Cursor file: <20 bytes (microsecond timestamp)
- Rules: JSON file (~10KB typical)
- Logs: optional (stdout/JSON logging)

---

## Known Limitations & Future Work

### Current Limitations
1. Single-process (no distributed workers)
2. In-memory PDS/PLC caches lost on restart
3. No metrics server (Prometheus endpoint)
4. Takedown actions not wired up (the infrastructure exists but is unused)
5. No image deduplication by CID before download
6. No batch operations to the Ozone API

### Future Enhancements (from README)
- Rate limit header parsing (adaptive backoff)
- Takedown post/account actions
- Distributed worker support
- Persistent moderation history
- Web UI for rule management
- Active monitoring/alerting

---

## Environment Setup

### Required Credentials
```bash
AUTOMOD_HANDLE=automod.bsky.social      # Labeler account handle
AUTOMOD_PASSWORD=xxxx-yyyy-zzzz-wwww    # App password (NOT user password)
LABELER_DID=did:plc:example             # Your labeler account DID
OZONE_URL=https://ozone.bsky.app        # Ozone service URL
OZONE_PDS=https://pds.bluesky.social    # Ozone PDS endpoint
```

### Docker Deployment
```bash
cp .env.example .env
# Edit .env with credentials
docker compose up --build
```

### Local Development
```bash
# Start Redis
docker run -d -p 6379:6379 redis

# Create .env
cp .env.example .env
# Edit with credentials

# Run
cargo run

# Tests
cargo test
```

### CLI Tool: phash-cli
```bash
cargo run --bin phash-cli path/to/image.jpg
# Output: e0e0e0e0e0fcfefe (16 hex chars)
```

---

## Critical Gotchas & Nuances

### 1. Cursor Is Microseconds, Not Milliseconds
- Jetstream provides `time_us` (microseconds since epoch)
- NOT milliseconds (1,000x larger)
- Used directly for resumption

### 2. Hamming Threshold Comparison
- `distance <= threshold` (inclusive)
- Threshold 0 = exact match only
- Threshold 5 = default (moderate sensitivity)

### 3. First Match Wins
- Rules are evaluated in order
- The first matching rule's actions are executed
- No combining of multiple rules

### 4. Ignore DIDs Are Per-Rule
- Each rule can have its own `ignoreDID` list
- Not a global blocklist
- Checked during matching

### 5. MIME Type Is Optional
- `BlobReference.mime_type` may be missing
- The code doesn't filter by MIME type
- SVG images might be processed (future: skip SVG)

### 6. Claims Are Deduplication Only
- A Redis claim prevents the action for 7 days
- Still checks the Ozone API (belt-and-suspenders)
- May skip legitimate re-moderation within 7 days

### 7. Rate Limit Delay
- Applied BEFORE the action (preventive)
- Not tied to Ozone response headers (yet)
- May result in artificial delay even at low load

### 8. Retry Logic
- A job is retried on ANY error (not just transient ones)
- Max retries from config (default 3)
- Dead-lettered after max retries (not discarded)

### 9. Cache TTL
- 24 hours default (very long)
- Can be tuned via CACHE_TTL_SECONDS
- An image may be edited/removed while its hash stays cached

### 10. Jetstream Failover
- URL rotation on failure
- Exponential backoff (5s -> 10s -> 20s ... -> 300s)
- Max 5 connection attempts total before giving up

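The schedule above is plain doubling with a hard cap; an illustrative helper (the Redis pool ships an analogous `calculate_backoff`):

```rust
use std::time::Duration;

/// 5s, 10s, 20s, ... capped at 300s (matches the schedule above).
fn jetstream_backoff(attempt: u32) -> Duration {
    let secs = 5u64.saturating_mul(1u64 << attempt.min(10));
    Duration::from_secs(secs.min(300))
}

// jetstream_backoff(0) == 5s; jetstream_backoff(6) == 300s (capped from 320s)
```
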
---

## Code Quality & Testing

### Error Handling Approach
- **miette for diagnostics:** Rich error context and pretty printing
- **thiserror for custom types:** Derive the Error trait
- **Result<T> pervasive:** No panics in business logic
- **Graceful degradation:** Errors logged, processing continues

### Concurrency Patterns
- **Arc for sharing:** Cheap clones across workers
- **Atomic types for metrics:** Lock-free increments
- **tokio::select! for orchestration:** Clean multi-task coordination
- **Multiplexed Redis connections:** One connection, concurrent operations

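The counter pattern in isolation: a shared `Arc` and `fetch_add` with `Relaxed` ordering, which is sufficient because each counter is independent (same pattern as `metrics/mod.rs`):

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

struct Counters {
    jobs_received: AtomicU64,
}

fn main() {
    let counters = Arc::new(Counters { jobs_received: AtomicU64::new(0) });
    let c = counters.clone();
    // Any number of workers can increment concurrently without a mutex
    std::thread::spawn(move || {
        c.jobs_received.fetch_add(1, Ordering::Relaxed);
    })
    .join()
    .unwrap();
    assert_eq!(counters.jobs_received.load(Ordering::Relaxed), 1);
}
```
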
### Code Style
- Consistent module structure (mod.rs organization)
- Clear separation of concerns (processor, queue, moderation)
- Use of Jacquard types (CowStr, Did, AtUri, Cid)
- Comprehensive logging via tracing

---

## Summary of Key Components & Responsibilities

| Component | File | Purpose | Key Types |
|-----------|------|---------|-----------|
| Entry Point | main.rs | Orchestrate startup, shutdown, task coordination | - |
| Configuration | config/mod.rs | Load env vars, provide config to all modules | Config, JetstreamConfig, etc |
| Jetstream | jetstream/ | Subscribe to firehose, extract blobs, handle cursor | JetstreamClient, ImageJob |
| Image Processing | processor/ | Hash computation, rule matching, blob download | Phash, hamming_distance, MatchResult |
| Job Queue | queue/ | Redis persistence, retry logic, dead-letter | JobQueue, WorkerPool, ImageJob |
| Cache | cache/mod.rs | Redis phash cache with TTL | PhashCache |
| Moderation | moderation/ | Execute label/report actions, deduplication | Claims, RateLimiter |
| Metrics | metrics/mod.rs | Track statistics with lock-free atomics | Metrics, MetricsSnapshot |
| Authentication | agent/ | Jacquard session wrapper | AgentSession |
| DID Resolution | plc/mod.rs | Resolve DIDs to PDS endpoints | PlcClient |

---

## Quick Reference: Important File Paths

**Configuration:**
- Environment loading: `/Users/scarndp/dev/skywatch/skywatch-phash-rs/src/config/mod.rs:80-144`
- Jetstream config: `config/mod.rs:87-103`
- Processing config: `config/mod.rs:107-111`
- Cache config: `config/mod.rs:112-114`

**Core Algorithm:**
- Phash computation: `/Users/scarndp/dev/skywatch/skywatch-phash-rs/src/processor/phash.rs:26-44`
- Hamming distance: `processor/phash.rs:72-98`
- Rule matching: `processor/matcher.rs:72-113`

**Job Processing:**
- Worker loop: `/Users/scarndp/dev/skywatch/skywatch-phash-rs/src/queue/worker.rs:91-99` (main select loop)
- Job retry: `queue/redis_queue.rs:78-97`
- Queue push/pop: `queue/redis_queue.rs:40-76`

**Moderation:**
- Post actions: `/Users/scarndp/dev/skywatch/skywatch-phash-rs/src/moderation/post.rs`
- Account actions: `moderation/account.rs`
- Claims checking: `moderation/claims.rs`

**Metrics:**
- Metric types: `/Users/scarndp/dev/skywatch/skywatch-phash-rs/src/metrics/mod.rs:1-66`
- Log stats: `metrics/mod.rs:212-244`

**Rules:**
- Current rules: `/Users/scarndp/dev/skywatch/skywatch-phash-rs/rules/blobs.json`

---

## Debugging Tips

### Enable Debug Logging
```bash
RUST_LOG=debug cargo run
RUST_LOG=skywatch_phash_rs=debug,info cargo run
```

### Monitor Redis
```bash
redis-cli
> KEYS "*"
> LLEN jobs:pending
> LLEN jobs:dead
> GET phash:{cid}
```

### Check Metrics in Real-Time
```bash
# Logs printed every 60 seconds, watch with:
tail -f logs.txt | grep "=== Metrics ==="
```

### Cursor Position
```bash
cat firehose_cursor.db
# Shows microsecond timestamp
```

### Test Phash CLI
```bash
cargo run --bin phash-cli path/to/image.jpg
# Output: 16-char hex string
```

### Inspect Rules
```bash
jq '.' rules/blobs.json
jq '.[] | {label, phashes: (.phashes | length)}' rules/blobs.json
```

---

## Deployment Notes

### Docker Compose
- Service: `app` (main binary)
- Dependencies: `redis` (persistent queue/cache)
- Environment: Sourced from `.env`
- Logs: Streamed to stdout/logs volume

### Graceful Shutdown
- Jetstream writes the final cursor
- Workers finish their active jobs (no kill -9)
- Metrics logged at shutdown
- Redis connections closed

### Monitoring
- Metrics every 60 seconds (INFO level)
- Final metrics on shutdown
- Structured JSON logging (if enabled)
- No built-in Prometheus endpoint (yet)

---

## References & Related Projects

**Parent Repository:** `tangled.sh:skywatch.blue/skywatch-phash-rs`
**Jacquard Dependency:** `../jacquard/crates/jacquard` (local path)
**Bluesky/ATProto:** https://github.com/bluesky-social/atproto
**Image Hasher:** https://github.com/Ed-von-Schleck/image-hasher

---

**Document Version:** 1.0
**Last Updated:** 2025-10-26
**Codebase Version:** 0.2.0
**Rust Edition:** 2024

**src/cache/mod.rs** (+19, -25)

```diff
···
 use redis::AsyncCommands;
 use tracing::{debug, info};

-use crate::config::Config;
+use crate::redis_pool::RedisPool;

 /// Redis key prefix for phash cache
 const PHASH_CACHE_PREFIX: &str = "phash";
···
 /// Phash cache for storing computed image hashes
 #[derive(Clone)]
 pub struct PhashCache {
-    redis: redis::aio::MultiplexedConnection,
+    pool: RedisPool,
     ttl: u64,
     enabled: bool,
 }

 impl PhashCache {
     /// Create a new phash cache
-    pub async fn new(config: &Config) -> Result<Self> {
-        info!("Connecting to Redis: {}", config.redis.url);
-
-        let client = redis::Client::open(config.redis.url.as_str()).into_diagnostic()?;
-        let redis = client
-            .get_multiplexed_async_connection()
-            .await
-            .into_diagnostic()?;
-
-        info!("Connected to Redis, cache enabled: {}", config.cache.enabled);
+    pub fn new(pool: RedisPool, ttl: u64, enabled: bool) -> Self {
+        info!("Phash cache initialized (enabled: {})", enabled);

-        Ok(Self {
-            redis,
-            ttl: config.cache.ttl,
-            enabled: config.cache.enabled,
-        })
+        Self {
+            pool,
+            ttl,
+            enabled,
+        }
     }

     /// Get cached phash for a blob CID
-    pub async fn get(&mut self, cid: &str) -> Result<Option<String>> {
+    pub async fn get(&self, cid: &str) -> Result<Option<String>> {
         if !self.enabled {
             return Ok(None);
         }

         let key = format!("{}:{}", PHASH_CACHE_PREFIX, cid);

-        let result: Option<String> = self.redis.get(&key).await.into_diagnostic()?;
+        let mut conn = self.pool.get_connection().await?;
+        let result: Option<String> = conn.get(&key).await.into_diagnostic()?;

         if result.is_some() {
             debug!("Cache hit for CID: {}", cid);
···
     }

     /// Set cached phash for a blob CID
-    pub async fn set(&mut self, cid: &str, phash: &str) -> Result<()> {
+    pub async fn set(&self, cid: &str, phash: &str) -> Result<()> {
         if !self.enabled {
             return Ok(());
         }

         let key = format!("{}:{}", PHASH_CACHE_PREFIX, cid);

-        let _: () = self
-            .redis
+        let mut conn = self.pool.get_connection().await?;
+        let _: () = conn
             .set_ex(&key, phash, self.ttl)
             .await
             .into_diagnostic()?;
···
     }

     /// Delete cached phash for a blob CID
-    pub async fn delete(&mut self, cid: &str) -> Result<()> {
+    pub async fn delete(&self, cid: &str) -> Result<()> {
         if !self.enabled {
             return Ok(());
         }

         let key = format!("{}:{}", PHASH_CACHE_PREFIX, cid);

-        let _: () = self.redis.del(&key).await.into_diagnostic()?;
+        let mut conn = self.pool.get_connection().await?;
+        let _: () = conn.del(&key).await.into_diagnostic()?;

         debug!("Deleted cached phash for CID: {}", cid);
···
     }

     /// Get or compute phash with caching
-    pub async fn get_or_compute<F, Fut>(&mut self, cid: &str, compute_fn: F) -> Result<String>
+    pub async fn get_or_compute<F, Fut>(&self, cid: &str, compute_fn: F) -> Result<String>
     where
         F: FnOnce() -> Fut,
         Fut: std::future::Future<Output = Result<String>>,
```

**src/config/mod.rs** (+10)

```diff
···
 #[derive(Debug, Clone)]
 pub struct RedisConfig {
     pub url: String,
+    pub health_check_interval_secs: u64,
+    pub max_backoff_secs: u64,
 }

 #[derive(Debug, Clone)]
···
     pub concurrency: usize,
     pub retry_attempts: u32,
     pub retry_delay: u64,
+    /// Timeout per blob download attempt in seconds (per format/endpoint)
+    pub blob_download_timeout_secs: u64,
+    /// Total timeout for all blob download fallback attempts in seconds
+    pub blob_total_timeout_secs: u64,
 }

 #[derive(Debug, Clone)]
···
             },
             redis: RedisConfig {
                 url: get_env("REDIS_URL", Some("redis://localhost:6379"))?,
+                health_check_interval_secs: get_env_u64("REDIS_HEALTH_CHECK_INTERVAL_SECS", 30),
+                max_backoff_secs: get_env_u64("REDIS_MAX_BACKOFF_SECS", 10),
             },
             processing: ProcessingConfig {
                 concurrency: get_env_usize("PROCESSING_CONCURRENCY", 10),
                 retry_attempts: get_env_u32("RETRY_ATTEMPTS", 3),
                 retry_delay: get_env_u64("RETRY_DELAY_MS", 1000),
+                blob_download_timeout_secs: get_env_u64("BLOB_DOWNLOAD_TIMEOUT_SECS", 10),
+                blob_total_timeout_secs: get_env_u64("BLOB_TOTAL_TIMEOUT_SECS", 30),
             },
             cache: CacheConfig {
                 enabled: get_env_bool("CACHE_ENABLED", true),
```

**src/lib.rs** (+6)

```diff
···
 // PLC Directory client
 pub mod plc;

+// Resilience patterns
+pub mod resilience;
+
+// Redis connection pool
+pub mod redis_pool;
+
 // Re-export commonly used types
 pub use config::Config;
 pub use types::{BlobCheck, BlobReference, ImageJob, MatchResult};
```

**src/main.rs** (+53, -14)

```diff
···
     metrics::Metrics,
     processor::matcher,
     queue::{JobQueue, WorkerPool},
+    redis_pool::RedisPool,
+    resilience::CircuitBreaker,
 };

 #[tokio::main]
···
     let metrics = Metrics::new();
     info!("Metrics tracker initialized");

+    // Create Redis connection pool
+    let redis_pool = RedisPool::new(config.redis.clone(), metrics.clone()).await?;
+    info!("Redis connection pool initialized");
+
+    // Start Redis health check loop
+    let health_check_pool = redis_pool.clone();
+    tokio::spawn(async move {
+        health_check_pool.start_health_check_loop().await;
+    });
+
     // Create cache
-    let cache = PhashCache::new(&config).await?;
+    let cache = PhashCache::new(
+        redis_pool.clone(),
+        config.cache.ttl,
+        config.cache.enabled,
+    );
     info!("Cache initialized (enabled: {})", cache.is_enabled());

     // Create job queue
-    let queue = JobQueue::new(&config).await?;
+    let queue = JobQueue::new(redis_pool.clone(), config.processing.retry_attempts);
     info!("Job queue initialized");

+    // Create circuit breakers with metrics
+    let ozone_circuit_breaker = CircuitBreaker::with_metrics(
+        "ozone-api",
+        5,   // 5 consecutive failures
+        60,  // 60s timeout
+        1,   // 1 success to close
+        metrics.clone(),
+    );
+    info!("Ozone API circuit breaker initialized");
+
+    let pds_circuit_breaker = CircuitBreaker::with_metrics(
+        "pds-blob",
+        3,    // 3 consecutive failures
+        300,  // 5m timeout
+        1,    // 1 success to close
+        metrics.clone(),
+    );
+    info!("PDS blob circuit breaker initialized");
+
+    let _plc_circuit_breaker = CircuitBreaker::with_metrics(
+        "plc-resolution",
+        3,    // 3 consecutive failures
+        300,  // 5m timeout
+        1,    // 1 success to close
+        metrics.clone(),
+    );
+    // TODO: Integrate PLC circuit breaker when PLC resolution is added to the processing flow
+    // Currently, PlcClient::with_circuit_breaker() exists but is not used
+    info!("PLC resolution circuit breaker initialized (deferred: PLC not in current processing path)");
+
     // Create worker pool
     let worker_pool = WorkerPool::new(
         config.clone(),
···
         agent.clone(),
         blob_checks.clone(),
         metrics.clone(),
-    );
+        ozone_circuit_breaker,
+        pds_circuit_breaker,
+    )?;
     info!(
         "Worker pool created with {} workers",
         config.processing.concurrency
···
     // Start job receiver (receives from jetstream, pushes to queue)
     info!("Starting job receiver...");
     let receiver_metrics = metrics.clone();
-    let receiver_config = config.clone();
+    let receiver_queue = queue.clone();
     let receiver_handle = tokio::spawn(async move {
-        // Create fresh queue connection for receiver
-        let mut queue_for_receiver = match JobQueue::new(&receiver_config).await {
-            Ok(q) => q,
-            Err(e) => {
-                error!("Failed to create queue for receiver: {}", e);
-                return;
-            }
-        };
-
         while let Some(job) = job_rx.recv().await {
             debug!("Job receiver got job: {}", job.post_uri);
             receiver_metrics.inc_jobs_received();
-            match queue_for_receiver.push(&job).await {
+            match receiver_queue.push(&job).await {
                 Ok(_) => {
                     debug!("Pushed job to Redis queue: {}", job.post_uri);
                 }
```

**src/metrics/mod.rs** (+75)

```diff
···
     posts_already_reported: AtomicU64,
     accounts_already_labeled: AtomicU64,
     accounts_already_reported: AtomicU64,
+
+    // Redis connection metrics
+    redis_connection_failures: AtomicU64,
+    redis_reconnect_attempts: AtomicU64,
+    redis_health_status: AtomicU64,
+
+    // Circuit breaker metrics
+    circuit_breaker_transitions: AtomicU64,
+    circuit_breaker_rejections: AtomicU64,
 }

 impl Metrics {
···
                 posts_already_reported: AtomicU64::new(0),
                 accounts_already_labeled: AtomicU64::new(0),
                 accounts_already_reported: AtomicU64::new(0),
+                redis_connection_failures: AtomicU64::new(0),
+                redis_reconnect_attempts: AtomicU64::new(0),
+                redis_health_status: AtomicU64::new(1),
+                circuit_breaker_transitions: AtomicU64::new(0),
+                circuit_breaker_rejections: AtomicU64::new(0),
             }),
         }
     }
···
         self.inner.accounts_already_reported.fetch_add(1, Ordering::Relaxed);
     }

+    // Redis connection metrics
+    pub fn inc_redis_connection_failures(&self) {
+        self.inner.redis_connection_failures.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn inc_redis_reconnect_attempts(&self) {
+        self.inner.redis_reconnect_attempts.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn set_redis_health_status(&self, healthy: bool) {
+        self.inner.redis_health_status.store(if healthy { 1 } else { 0 }, Ordering::Relaxed);
+    }
+
+    // Circuit breaker metrics
+    pub fn inc_circuit_breaker_transitions(&self) {
+        self.inner.circuit_breaker_transitions.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn inc_circuit_breaker_rejections(&self) {
+        self.inner.circuit_breaker_rejections.fetch_add(1, Ordering::Relaxed);
+    }
+
     // Getters
     pub fn jobs_received(&self) -> u64 {
         self.inner.jobs_received.load(Ordering::Relaxed)
···
         self.inner.accounts_already_reported.load(Ordering::Relaxed)
     }

+    pub fn redis_connection_failures(&self) -> u64 {
+        self.inner.redis_connection_failures.load(Ordering::Relaxed)
+    }
+
+    pub fn redis_reconnect_attempts(&self) -> u64 {
+        self.inner.redis_reconnect_attempts.load(Ordering::Relaxed)
+    }
+
+    pub fn redis_health_status(&self) -> bool {
+        self.inner.redis_health_status.load(Ordering::Relaxed) == 1
+    }
+
+    pub fn circuit_breaker_transitions(&self) -> u64 {
+        self.inner.circuit_breaker_transitions.load(Ordering::Relaxed)
+    }
+
+    pub fn circuit_breaker_rejections(&self) -> u64 {
+        self.inner.circuit_breaker_rejections.load(Ordering::Relaxed)
+    }
+
     /// Log current metrics
     pub fn log_stats(&self) {
         info!("=== Metrics ===");
···
             self.accounts_already_labeled(),
             self.accounts_already_reported()
         );
+        info!("Redis: connection_failures={}, reconnect_attempts={}, health_status={}",
+            self.redis_connection_failures(),
+            self.redis_reconnect_attempts(),
+            if self.redis_health_status() { "healthy" } else { "degraded" }
+        );
+        info!("Circuit breakers: transitions={}, rejections={}",
+            self.circuit_breaker_transitions(),
+            self.circuit_breaker_rejections()
+        );
     }

     /// Calculate cache hit rate
···
             posts_already_reported: self.posts_already_reported(),
             accounts_already_labeled: self.accounts_already_labeled(),
             accounts_already_reported: self.accounts_already_reported(),
+            redis_connection_failures: self.redis_connection_failures(),
+            redis_reconnect_attempts: self.redis_reconnect_attempts(),
+            redis_health_status: if self.redis_health_status() { 1 } else { 0 },
+            circuit_breaker_transitions: self.circuit_breaker_transitions(),
+            circuit_breaker_rejections: self.circuit_breaker_rejections(),
         }
     }
 }
···
     pub posts_already_reported: u64,
     pub accounts_already_labeled: u64,
     pub accounts_already_reported: u64,
+    pub redis_connection_failures: u64,
+    pub redis_reconnect_attempts: u64,
+    pub redis_health_status: u64,
+    pub circuit_breaker_transitions: u64,
+    pub circuit_breaker_rejections: u64,
 }

 #[cfg(test)]
```

**src/moderation/account.rs** (+7, -3)

```diff
···
 use crate::moderation::{
     build_mod_tool_meta, build_timestamped_comment, send_moderation_event,
 };
+use crate::resilience::CircuitBreaker;

 /// Label an account with a specific label via Ozone moderation API
 pub async fn label_account<'a>(
     agent: &Agent<MemoryCredentialSession>,
     config: &Config,
     rate_limiter: &RateLimiter,
+    circuit_breaker: Option<&CircuitBreaker>,
     did: &Did<'a>,
     label_val: &str,
     check_comment: &str,
···
         })
         .build();

-    send_moderation_event(agent, config, rate_limiter, event).await?;
+    send_moderation_event(agent, config, rate_limiter, circuit_breaker, event).await?;

     debug!("Successfully labeled account: {}", did);
···
     agent: &Agent<MemoryCredentialSession>,
     config: &Config,
     rate_limiter: &RateLimiter,
+    circuit_breaker: Option<&CircuitBreaker>,
     did: &Did<'a>,
     reason: ReasonType<'static>,
     check_comment: &str,
···
         })
         .build();

-    send_moderation_event(agent, config, rate_limiter, event).await?;
+    send_moderation_event(agent, config, rate_limiter, circuit_breaker, event).await?;

     debug!("Successfully reported account: {}", did);
···
     agent: &Agent<MemoryCredentialSession>,
     config: &Config,
     rate_limiter: &RateLimiter,
+    circuit_breaker: Option<&CircuitBreaker>,
     did: &Did<'a>,
     comment: &str,
     created_by: &Did<'a>,
···
         )))
         .build();

-    send_moderation_event(agent, config, rate_limiter, event).await?;
+    send_moderation_event(agent, config, rate_limiter, circuit_breaker, event).await?;

     debug!("Successfully took down account: {}", did);
```

**src/moderation/helpers.rs** (+62, -4)

```diff
···
 use jacquard_common::IntoStatic;
 use miette::{IntoDiagnostic, Result};
 use std::collections::BTreeMap;
+use std::time::Duration;
+use tracing::{debug, warn};

 use crate::config::Config;
 use crate::moderation::rate_limiter::RateLimiter;
+use crate::resilience::CircuitBreaker;

 pub fn build_timestamped_comment(check_comment: &str, uri: &str, phash: &str) -> String {
     let timestamp = chrono::Utc::now().to_rfc3339();
···
     agent: &Agent<MemoryCredentialSession>,
     config: &Config,
     rate_limiter: &RateLimiter,
+    circuit_breaker: Option<&CircuitBreaker>,
     event: EmitEvent<'a>,
 ) -> Result<()> {
-    rate_limiter.wait().await;
+    const MAX_RETRIES: u32 = 3;
+    let mut retry_count = 0;
+    let mut backoff = Duration::from_millis(100);

-    let opts = build_moderation_call_opts(config);
-    agent.send_with_opts(event, opts).await.into_diagnostic()?;
+    loop {
+        if let Some(cb) = circuit_breaker {
+            if !cb.is_available().await {
+                warn!("Circuit breaker '{}' is open, rejecting Ozone API call", cb.name());
+                return Err(miette::miette!("Circuit breaker open for Ozone API"));
+            }
+        }

-    Ok(())
+        rate_limiter.wait().await;
+
+        let opts = build_moderation_call_opts(config);
+        match agent.send_with_opts(event.clone(), opts).await.into_diagnostic() {
+            Ok(_) => {
+                debug!("Moderation event sent successfully");
+                if let Some(cb) = circuit_breaker {
+                    cb.record_success().await;
+                }
+                return Ok(());
+            }
+            Err(e) => {
+                retry_count += 1;
+                let error_msg = format!("{}", e);
+
+                // Check if error is potentially transient
+                let is_transient = error_msg.contains("500")
+                    || error_msg.contains("502")
+                    || error_msg.contains("503")
+                    || error_msg.contains("504")
+                    || error_msg.contains("timeout")
+                    || error_msg.contains("connection");
+
+                if retry_count > MAX_RETRIES || !is_transient {
+                    warn!(
+                        "Moderation API call failed (attempt {}/{}): {} (transient: {})",
+                        retry_count, MAX_RETRIES, error_msg, is_transient
+                    );
+                    if let Some(cb) = circuit_breaker {
+                        cb.record_failure().await;
+                    }
+                    return Err(e);
+                }
+
+                warn!(
+                    "Moderation API call failed (attempt {}/{}), retrying in {:.0}ms: {}",
+                    retry_count,
+                    MAX_RETRIES,
+                    backoff.as_secs_f64() * 1000.0,
+                    error_msg
+                );
+
+                tokio::time::sleep(backoff).await;
+                let next_backoff_ms = (backoff.as_millis() as u64 * 2).min(5000); // Cap at 5s
+                backoff = Duration::from_millis(next_backoff_ms);
+            }
+        }
+    }
 }
```

**src/moderation/post.rs** (+7, -3)

```diff
···
 use crate::moderation::{
     build_mod_tool_meta, build_timestamped_comment, send_moderation_event,
 };
+use crate::resilience::CircuitBreaker;

 /// Label a post with a specific label via Ozone moderation API
 pub async fn label_post<'a>(
     agent: &Agent<MemoryCredentialSession>,
     config: &Config,
     rate_limiter: &RateLimiter,
+    circuit_breaker: Option<&CircuitBreaker>,
     post_uri: &AtUri<'a>,
     post_cid: &Cid<'a>,
     label_val: &str,
···
         })
         .build();

-    send_moderation_event(agent, config, rate_limiter, event).await?;
+    send_moderation_event(agent, config, rate_limiter, circuit_breaker, event).await?;

     debug!("Successfully labeled post: {}", post_uri);
···
     agent: &Agent<MemoryCredentialSession>,
     config: &Config,
     rate_limiter: &RateLimiter,
+    circuit_breaker: Option<&CircuitBreaker>,
     post_uri: &AtUri<'a>,
     _post_cid: &Cid<'a>,
     post_did: &Did<'a>,
···
         })
         .build();

-    send_moderation_event(agent, config, rate_limiter, event).await?;
+    send_moderation_event(agent, config, rate_limiter, circuit_breaker, event).await?;

     debug!("Successfully reported post: {}", post_uri);
···
     agent: &Agent<MemoryCredentialSession>,
     config: &Config,
     rate_limiter: &RateLimiter,
+    circuit_breaker: Option<&CircuitBreaker>,
     post_uri: &AtUri<'a>,
     post_cid: &Cid<'a>,
     comment: &str,
···
         )))
         .build();

-    send_moderation_event(agent, config, rate_limiter, event).await?;
+    send_moderation_event(agent, config, rate_limiter, circuit_breaker, event).await?;

     debug!("Successfully took down post: {}", post_uri);
```

**src/moderation/rate_limiter.rs** (+8, -7)

```diff
···
     state::{InMemoryState, NotKeyed},
     Quota, RateLimiter as GovernorRateLimiter,
 };
+use miette::{miette, Result};
 use std::sync::Arc;
 use std::time::Duration;
···
 impl RateLimiter {
     /// Create a new rate limiter with the given rate limit in milliseconds
     /// For example, rate_limit_ms = 100 means 100ms minimum between requests (10 requests per second)
-    pub fn new(rate_limit_ms: u64) -> Self {
+    pub fn new(rate_limit_ms: u64) -> Result<Self> {
         let duration = if rate_limit_ms == 0 {
             Duration::from_millis(1)
         } else {
             Duration::from_millis(rate_limit_ms)
         };

-        // 1 request per rate_limit_ms duration
-        let quota = Quota::with_period(duration).unwrap();
+        let quota = Quota::with_period(duration)
+            .ok_or_else(|| miette!("Invalid rate limit duration: {}ms", rate_limit_ms))?;
         let limiter = GovernorRateLimiter::direct(quota);

-        Self {
+        Ok(Self {
             limiter: Arc::new(limiter),
-        }
+        })
     }

     /// Wait until a request can be made according to the rate limit
···
     #[tokio::test]
     async fn test_rate_limiter() {
         // 100ms between requests = 10 requests per second
-        let limiter = RateLimiter::new(100);
+        let limiter = RateLimiter::new(100).unwrap();

         let start = Instant::now();
···
     #[tokio::test]
     async fn test_rate_limiter_concurrent() {
         // 100ms between requests = 10 requests per second
-        let limiter = RateLimiter::new(100);
+        let limiter = RateLimiter::new(100).unwrap();

         let start = Instant::now();
```

**src/plc/mod.rs** (+37, -1)

```diff
···
 use tracing::{debug, error, info, warn};

 use crate::config::PlcConfig;
+use crate::resilience::CircuitBreaker;

 #[derive(Debug, Deserialize)]
 pub struct DidDocument {
···
 pub struct PlcClient {
     client: Client,
     endpoints: Vec<String>,
+    circuit_breaker: Option<CircuitBreaker>,
 }

 impl PlcClient {
···
         let mut endpoints = vec![config.endpoint.clone()];
         endpoints.extend(config.fallback_endpoints.clone());

-        Self { client, endpoints }
+        Self {
+            client,
+            endpoints,
+            circuit_breaker: None,
+        }
+    }
+
+    /// Create a new PLC client with circuit breaker protection
+    pub fn with_circuit_breaker(
+        client: Client,
+        config: &PlcConfig,
+        circuit_breaker: CircuitBreaker,
+    ) -> Self {
+        let mut endpoints = vec![config.endpoint.clone()];
+        endpoints.extend(config.fallback_endpoints.clone());
+
+        Self {
+            client,
+            endpoints,
+            circuit_breaker: Some(circuit_breaker),
+        }
     }

     /// Resolve a DID to its DID document with automatic failover
     pub async fn resolve_did(&self, did: &str) -> Result<DidDocument> {
+        if let Some(cb) = &self.circuit_breaker {
+            if !cb.is_available().await {
+                warn!("Circuit breaker '{}' is open, rejecting PLC resolution", cb.name());
+                return Err(miette::miette!("Circuit breaker open for PLC resolution"));
+            }
+        }
+
         let mut last_error = None;

         for (idx, endpoint) in self.endpoints.iter().enumerate() {
···
                         idx,
                         did
                     );
+                }
+                if let Some(cb) = &self.circuit_breaker {
+                    cb.record_success().await;
                 }
                 return Ok(doc);
             }
···
             did,
             self.endpoints.len()
         );
+
+        if let Some(cb) = &self.circuit_breaker {
+            cb.record_failure().await;
+        }

         Err(last_error.unwrap_or_else(|| {
             miette::miette!("All PLC endpoints failed for DID: {}", did)
```

**src/processor/matcher.rs** (+72, -14)

```diff
···
 use miette::{IntoDiagnostic, Result};
 use reqwest::Client;
 use std::path::Path;
+use std::time::{Duration, Instant};
 use tracing::{debug, info, warn};

 use crate::config::Config;
 use crate::processor::phash;
+use crate::resilience::CircuitBreaker;
 use crate::types::{BlobCheck, BlobReference, ImageJob, MatchResult};

 /// Load blob checks from a JSON file
···
 pub async fn download_blob(
     client: &Client,
     config: &Config,
+    circuit_breaker: &CircuitBreaker,
     did: &str,
     cid: &str,
 ) -> Result<Vec<u8>> {
+    let start = Instant::now();
+    let per_attempt_timeout = Duration::from_secs(config.processing.blob_download_timeout_secs);
+    let total_timeout = Duration::from_secs(config.processing.blob_total_timeout_secs);
+
     // Try CDN first - attempt common image formats
     for format in ["jpeg", "png", "webp"] {
+        // Check if we've exceeded total timeout
+        if start.elapsed() > total_timeout {
+            warn!(
+                "Blob download total timeout exceeded for did={}, cid={}",
+                did, cid
+            );
+            return Err(miette::miette!(
+                "Blob download timeout after {:.1}s",
+                start.elapsed().as_secs_f64()
+            ));
+        }
+
         let cdn_url = format!(
             "https://cdn.bsky.app/img/feed_fullsize/plain/{}/{}@{}",
             did, cid, format
         );

-        debug!("Trying CDN download: {}", cdn_url);
+        debug!("Trying CDN download: {} (timeout: {}s)", cdn_url, config.processing.blob_download_timeout_secs);

-        match client.get(&cdn_url).send().await {
+        match client
+            .get(&cdn_url)
+            .timeout(per_attempt_timeout)
+            .send()
+            .await
+        {
             Ok(response) if response.status().is_success() => {
                 debug!("Successfully downloaded from CDN: did={}, cid={}", did, cid);
                 let bytes = response.bytes().await.into_diagnostic()?;
···
                 debug!("CDN returned status {}, trying next format", response.status());
             }
             Err(e) => {
-                debug!("CDN request failed: {}, trying next format", e);
+                debug!(
+                    "CDN request failed: {} (elapsed: {:.1}s), trying next format",
+                    e,
+                    start.elapsed().as_secs_f64()
+                );
             }
         }
     }

     // Fall back to PDS if CDN fails
-    warn!("CDN failed for did={}, cid={}, falling back to PDS", did, cid);
+    warn!(
+        "CDN failed for did={}, cid={}, falling back to PDS (elapsed: {:.1}s)",
+        did,
+        cid,
+        start.elapsed().as_secs_f64()
+    );
+
+    // Check circuit breaker before attempting PDS
+    if !circuit_breaker.is_available().await {
+        warn!("Circuit breaker '{}' is open, rejecting PDS blob download", circuit_breaker.name());
+        return Err(miette::miette!("Circuit breaker open for PDS blob download"));
+    }
+
+    // Check if we've exceeded total timeout before PDS attempt
+    if start.elapsed() > total_timeout {
+        warn!("Blob download total timeout exceeded before PDS fallback");
+        return Err(miette::miette!(
+            "Blob download timeout after {:.1}s",
+            start.elapsed().as_secs_f64()
+        ));
+    }

     let pds_url = format!(
         "{}/xrpc/com.atproto.sync.getBlob?did={}&cid={}",
         config.pds.endpoint, did, cid
     );

-    debug!("Downloading from PDS: {}", pds_url);
+    debug!("Downloading from PDS: {} (timeout: {}s)", pds_url, config.processing.blob_download_timeout_secs);

-    let response = client
+    match client
         .get(&pds_url)
+        .timeout(per_attempt_timeout)
         .send()
         .await
-        .into_diagnostic()?
-        .error_for_status()
-        .into_diagnostic()?;
-
-    let bytes = response.bytes().await.into_diagnostic()?;
-    Ok(bytes.to_vec())
+        .into_diagnostic()
+        .and_then(|resp| resp.error_for_status().into_diagnostic())
+    {
+        Ok(response) => {
+            let bytes = response.bytes().await.into_diagnostic()?;
+            circuit_breaker.record_success().await;
+            Ok(bytes.to_vec())
+        }
+        Err(e) => {
+            circuit_breaker.record_failure().await;
+            Err(miette::miette!("PDS blob download failed: {}", e))
+        }
+    }
 }

 /// Match a computed phash against blob checks
···
 pub async fn process_blob(
     client: &Client,
     config: &Config,
+    circuit_breaker: &CircuitBreaker,
     blob_checks: &[BlobCheck],
     did: &str,
     blob: &BlobReference,
 ) -> Result<Option<MatchResult>> {
-    let image_bytes = download_blob(client, config, did, &blob.cid).await?;
+    let image_bytes = download_blob(client, config, circuit_breaker, did, &blob.cid).await?;
     let phash = phash::compute_phash(&image_bytes)?;
     debug!("Computed phash for blob {}: {}", blob.cid, phash);
···
 pub async fn process_image_job(
     client: &Client,
     config: &Config,
+    circuit_breaker: &CircuitBreaker,
     blob_checks: &[BlobCheck],
     job: &ImageJob,
 ) -> Result<Vec<MatchResult>> {
···
     let mut matches = Vec::new();

     for blob in &job.blobs {
-        match process_blob(client, config, blob_checks, &job.post_did, blob).await {
+        match process_blob(client, config, circuit_breaker, blob_checks, &job.post_did, blob).await {
             Ok(Some(result)) => {
                 matches.push(result);
             }
```
-36
src/queue/redis_queue.rs
+28
-36
src/queue/redis_queue.rs
···
2
2
use redis::AsyncCommands;
3
3
use tracing::{debug, info, warn};
4
4
5
-
use crate::config::Config;
5
+
use crate::redis_pool::RedisPool;
6
6
use crate::types::ImageJob;
7
7
8
8
/// Redis queue names
···
13
13
/// Redis-based job queue for ImageJob processing
14
14
#[derive(Clone)]
15
15
pub struct JobQueue {
16
-
redis: redis::aio::MultiplexedConnection,
16
+
pool: RedisPool,
17
17
max_retries: u32,
18
18
}
19
19
20
20
impl JobQueue {
21
21
/// Create a new job queue
22
-
pub async fn new(config: &Config) -> Result<Self> {
23
-
info!("Connecting to Redis for job queue: {}", config.redis.url);
24
-
25
-
let client = redis::Client::open(config.redis.url.as_str()).into_diagnostic()?;
26
-
let redis = client
27
-
.get_multiplexed_async_connection()
28
-
.await
29
-
.into_diagnostic()?;
30
-
31
-
info!("Job queue connected to Redis");
22
+
pub fn new(pool: RedisPool, max_retries: u32) -> Self {
23
+
info!("Job queue initialized with Redis pool");
32
24
33
-
Ok(Self {
34
-
redis,
35
-
max_retries: config.processing.retry_attempts,
36
-
})
25
+
Self {
26
+
pool,
27
+
max_retries,
28
+
}
37
29
}
38
30
39
31
/// Push a job to the pending queue
40
-
pub async fn push(&mut self, job: &ImageJob) -> Result<()> {
32
+
pub async fn push(&self, job: &ImageJob) -> Result<()> {
41
33
let job_json = serde_json::to_string(job).into_diagnostic()?;
42
34
43
-
let _: () = self
44
-
.redis
35
+
let mut conn = self.pool.get_connection().await?;
36
+
let _: () = conn
45
37
.rpush(PENDING_QUEUE, &job_json)
46
38
.await
47
39
.into_diagnostic()?;
···
52
44
}
53
45
54
46
/// Pop a job from the pending queue (blocking with timeout)
55
-
pub async fn pop(&mut self, timeout_secs: usize) -> Result<Option<ImageJob>> {
56
-
let result: Option<Vec<String>> = self
57
-
.redis
47
+
pub async fn pop(&self, timeout_secs: usize) -> Result<Option<ImageJob>> {
48
+
let mut conn = self.pool.get_connection().await?;
49
+
let result: Option<Vec<String>> = conn
58
50
.blpop(PENDING_QUEUE, timeout_secs as f64)
59
51
.await
60
52
.into_diagnostic()?;
···
76
68
}
77
69
78
70
/// Retry a failed job (increment attempts and re-queue)
79
-
pub async fn retry(&mut self, mut job: ImageJob) -> Result<()> {
71
+
pub async fn retry(&self, mut job: ImageJob) -> Result<()> {
80
72
job.attempts += 1;
81
73
82
74
if job.attempts >= self.max_retries {
···
97
89
}
98
90
99
91
/// Move a job to the dead letter queue
100
-
async fn move_to_dead_letter(&mut self, job: &ImageJob) -> Result<()> {
92
+
async fn move_to_dead_letter(&self, job: &ImageJob) -> Result<()> {
101
93
let job_json = serde_json::to_string(job).into_diagnostic()?;
102
94
103
-
let _: () = self
104
-
.redis
95
+
let mut conn = self.pool.get_connection().await?;
96
+
let _: () = conn
105
97
.rpush(DEAD_LETTER_QUEUE, &job_json)
106
98
.await
107
99
.into_diagnostic()?;
···
112
104
}
113
105
114
106
/// Get queue statistics
115
-
pub async fn stats(&mut self) -> Result<QueueStats> {
116
-
let pending: usize = self.redis.llen(PENDING_QUEUE).await.into_diagnostic()?;
117
-
let processing: usize = self
118
-
.redis
107
+
pub async fn stats(&self) -> Result<QueueStats> {
108
+
let mut conn = self.pool.get_connection().await?;
109
+
let pending: usize = conn.llen(PENDING_QUEUE).await.into_diagnostic()?;
110
+
let processing: usize = conn
119
111
.llen(PROCESSING_QUEUE)
120
112
.await
121
113
.into_diagnostic()?;
122
-
let dead: usize = self.redis.llen(DEAD_LETTER_QUEUE).await.into_diagnostic()?;
114
+
let dead: usize = conn.llen(DEAD_LETTER_QUEUE).await.into_diagnostic()?;
123
115
124
116
Ok(QueueStats {
125
117
pending,
···
129
121
}
130
122
131
123
/// Clear all queues (for testing/maintenance)
132
-
pub async fn clear_all(&mut self) -> Result<()> {
133
-
let _: () = self.redis.del(PENDING_QUEUE).await.into_diagnostic()?;
134
-
let _: () = self.redis.del(PROCESSING_QUEUE).await.into_diagnostic()?;
135
-
let _: () = self
136
-
.redis
124
+
pub async fn clear_all(&self) -> Result<()> {
125
+
let mut conn = self.pool.get_connection().await?;
126
+
let _: () = conn.del(PENDING_QUEUE).await.into_diagnostic()?;
127
+
let _: () = conn.del(PROCESSING_QUEUE).await.into_diagnostic()?;
128
+
let _: () = conn
137
129
.del(DEAD_LETTER_QUEUE)
138
130
.await
139
131
.into_diagnostic()?;

**src/queue/worker.rs** (+36, -22)

```diff
···
 use crate::moderation::{account, claims, post, rate_limiter::RateLimiter};
 use crate::processor::matcher;
 use crate::queue::redis_queue::JobQueue;
+use crate::resilience::CircuitBreaker;
 use crate::types::{BlobCheck, ImageJob, MatchResult};

 /// Macro to handle moderation actions with claim checking
···
     blob_checks: Vec<BlobCheck>,
     metrics: Metrics,
     rate_limiter: RateLimiter,
+    ozone_circuit_breaker: CircuitBreaker,
+    pds_circuit_breaker: CircuitBreaker,
 }

 impl WorkerPool {
···
         agent: AgentSession,
         blob_checks: Vec<BlobCheck>,
         metrics: Metrics,
-    ) -> Self {
-        let rate_limiter = RateLimiter::new(config.moderation.rate_limit);
+        ozone_circuit_breaker: CircuitBreaker,
+        pds_circuit_breaker: CircuitBreaker,
+    ) -> Result<Self> {
+        let rate_limiter = RateLimiter::new(config.moderation.rate_limit)?;

-        Self {
+        Ok(Self {
             config,
             client,
             agent,
             blob_checks,
             metrics,
             rate_limiter,
-        }
+            ozone_circuit_breaker,
+            pds_circuit_breaker,
+        })
     }

     /// Start the worker pool - processes jobs sequentially
     /// Concurrency is achieved by running multiple instances of this concurrently
     pub async fn start(
         &self,
-        mut queue: JobQueue,
+        queue: JobQueue,
         mut cache: PhashCache,
         mut shutdown_rx: tokio::sync::broadcast::Receiver<()>,
     ) -> Result<()> {
+        // Create Redis connection once at worker startup, reuse for all jobs
+        let redis_client = redis::Client::open(self.config.redis.url.as_str())
+            .into_diagnostic()?;
+        let mut redis_conn = redis_client
+            .get_multiplexed_async_connection()
+            .await
+            .into_diagnostic()
+            .map_err(|e| miette::miette!("Failed to establish Redis connection: {}", e))?;
+
         loop {
             tokio::select! {
                 _ = shutdown_rx.recv() => {
···
                     match job_result {
                         Ok(Some(job)) => {
                             debug!("Worker popped job from queue: {}", job.post_uri);
-                            let redis_client = match redis::Client::open(self.config.redis.url.as_str()) {
-                                Ok(c) => c,
-                                Err(e) => {
-                                    error!("Failed to create Redis client: {}", e);
-                                    continue;
-                                }
-                            };
-
-                            let mut redis_conn = match redis_client.get_multiplexed_async_connection().await {
-                                Ok(conn) => conn,
-                                Err(e) => {
-                                    error!("Failed to connect to Redis: {}", e);
-                                    continue;
-                                }
-                            };

                             let job_clone = job.clone();
                             if let Err(e) = Self::process_job(
···
                                 &self.blob_checks,
                                 &self.metrics,
                                 &self.rate_limiter,
+                                &self.ozone_circuit_breaker,
+                                &self.pds_circuit_breaker,
                                 &mut cache,
                                 &mut redis_conn,
                                 job,
···
         blob_checks: &[BlobCheck],
         metrics: &Metrics,
         rate_limiter: &RateLimiter,
+        ozone_circuit_breaker: &CircuitBreaker,
+        pds_circuit_breaker: &CircuitBreaker,
         cache: &mut PhashCache,
         redis_conn: &mut redis::aio::MultiplexedConnection,
         job: ImageJob,
···
         debug!("Processing job: {}", job.post_uri);

         let matches =
-            Self::process_job_blobs(config, client, blob_checks, metrics, cache, &job).await?;
+            Self::process_job_blobs(config, client, blob_checks, metrics, pds_circuit_breaker, cache, &job).await?;

         if matches.is_empty() {
             debug!("No matches found for job: {}", job.post_uri);
···
                 agent,
                 metrics,
                 rate_limiter,
+                ozone_circuit_breaker,
                 redis_conn,
                 &job,
                 &match_result,
···
         client: &Client,
         blob_checks: &[BlobCheck],
         metrics: &Metrics,
+        pds_circuit_breaker: &CircuitBreaker,
         cache: &mut PhashCache,
         job: &ImageJob,
     ) -> Result<Vec<MatchResult>> {
···
         // Download and compute
         let image_bytes =
-            matcher::download_blob(client, config, &job.post_did, &blob.cid).await?;
+            matcher::download_blob(client, config, pds_circuit_breaker, &job.post_did, &blob.cid).await?;
         let computed_phash = crate::processor::phash::compute_phash(&image_bytes)?;

         // Store in cache
···
         agent: &Arc<Agent<MemoryCredentialSession>>,
         metrics: &Metrics,
         rate_limiter: &RateLimiter,
+        ozone_circuit_breaker: &CircuitBreaker,
         redis_conn: &mut redis::aio::MultiplexedConnection,
         job: &ImageJob,
         match_result: &MatchResult,
···
             agent.as_ref(),
             config,
             rate_limiter,
+            Some(ozone_circuit_breaker),
             &job.post_uri,
             &job.post_cid,
             &job.post_did,
···
             agent.as_ref(),
             config,
             rate_limiter,
+            Some(ozone_circuit_breaker),
             &job.post_uri,
             &job.post_cid,
             &check.label,
···
             agent.as_ref(),
             config,
             rate_limiter,
+            Some(ozone_circuit_breaker),
             &job.post_did,
             ReasonType::ComAtprotoModerationDefsReasonSpam,
             &check.comment,
···
             agent.as_ref(),
             config,
             rate_limiter,
+            Some(ozone_circuit_breaker),
             &job.post_did,
             &check.label,
             &check.comment,
···
             blob_checks: self.blob_checks.clone(),
             metrics: self.metrics.clone(),
             rate_limiter: self.rate_limiter.clone(),
+            ozone_circuit_breaker: self.ozone_circuit_breaker.clone(),
+            pds_circuit_breaker: self.pds_circuit_breaker.clone(),
         }
     }
 }
```
+206
src/redis_pool.rs
+206
src/redis_pool.rs
···
1
+
use miette::{IntoDiagnostic, Result};
2
+
use redis::aio::ConnectionManager;
3
+
use redis::Client;
4
+
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
5
+
use std::sync::Arc;
6
+
use std::time::Duration;
7
+
use tokio::sync::RwLock;
8
+
use tokio::time::sleep;
9
+
use tracing::{debug, error, info, warn};
10
+
11
+
use crate::config::RedisConfig;
12
+
use crate::metrics::Metrics;
13
+
14
+
const INITIAL_BACKOFF_MS: u64 = 100;
15
+
const MAX_CONSECUTIVE_FAILURES: u64 = 5;
16
+
17
+
#[derive(Clone)]
18
+
pub struct RedisPool {
19
+
inner: Arc<RedisPoolInner>,
20
+
}
21
+
22
+
struct RedisPoolInner {
23
+
config: RedisConfig,
24
+
manager: RwLock<Option<ConnectionManager>>,
25
+
metrics: Metrics,
26
+
is_healthy: AtomicBool,
27
+
consecutive_failures: AtomicU64,
28
+
}
29
+
30
+
impl RedisPool {
31
+
pub async fn new(config: RedisConfig, metrics: Metrics) -> Result<Self> {
32
+
info!("Initializing Redis connection pool: {}", config.url);
33
+
34
+
let client = Client::open(config.url.as_str()).into_diagnostic()?;
35
+
let manager = ConnectionManager::new(client.clone())
36
+
.await
37
+
.into_diagnostic()?;
38
+
39
+
info!("Redis connection pool initialized successfully");
40
+
41
+
let pool = Self {
42
+
inner: Arc::new(RedisPoolInner {
43
+
config,
44
+
manager: RwLock::new(Some(manager)),
45
+
metrics,
46
+
is_healthy: AtomicBool::new(true),
47
+
consecutive_failures: AtomicU64::new(0),
48
+
}),
49
+
};
50
+
51
+
Ok(pool)
52
+
}
53
+
54
+
pub async fn get_connection(&self) -> Result<ConnectionManager> {
55
+
let manager_lock = self.inner.manager.read().await;
56
+
57
+
if let Some(manager) = manager_lock.as_ref() {
58
+
return Ok(manager.clone());
59
+
}
60
+
61
+
drop(manager_lock);
62
+
63
+
self.reconnect_with_backoff().await
64
+
}
65
+
66
+
async fn reconnect_with_backoff(&self) -> Result<ConnectionManager> {
67
+
let mut backoff_ms = INITIAL_BACKOFF_MS;
68
+
let max_backoff_ms = self.inner.config.max_backoff_secs * 1000;
69
+
70
+
loop {
71
+
let failures = self.inner.consecutive_failures.load(Ordering::Relaxed);
72
+
73
+
if failures >= MAX_CONSECUTIVE_FAILURES {
74
+
error!(
75
+
"Redis connection failed {} times, entering degraded state",
76
+
failures
77
+
);
78
+
self.inner.is_healthy.store(false, Ordering::Relaxed);
79
+
self.inner.metrics.set_redis_health_status(false);
80
+
}
81
+
82
+
self.inner.metrics.inc_redis_reconnect_attempts();
83
+
84
+
info!(
85
+
"Attempting Redis reconnection (backoff: {}ms, failures: {})",
86
+
backoff_ms, failures
87
+
);
88
+
89
+
match Client::open(self.inner.config.url.as_str()) {
90
+
Ok(client) => match ConnectionManager::new(client).await {
91
+
Ok(manager) => {
92
+
info!("Redis reconnection successful");
93
+
self.inner.consecutive_failures.store(0, Ordering::Relaxed);
94
+
self.inner.is_healthy.store(true, Ordering::Relaxed);
95
+
self.inner.metrics.set_redis_health_status(true);
96
+
97
+
let mut manager_lock = self.inner.manager.write().await;
98
+
                        *manager_lock = Some(manager.clone());

                        return Ok(manager);
                    }
                    Err(e) => {
                        error!("Failed to create Redis connection manager: {}", e);
                        self.inner.consecutive_failures.fetch_add(1, Ordering::Relaxed);
                        self.inner.metrics.inc_redis_connection_failures();
                    }
                },
                Err(e) => {
                    error!("Failed to create Redis client: {}", e);
                    self.inner.consecutive_failures.fetch_add(1, Ordering::Relaxed);
                    self.inner.metrics.inc_redis_connection_failures();
                }
            }

            sleep(Duration::from_millis(backoff_ms)).await;
            backoff_ms = (backoff_ms * 2).min(max_backoff_ms);
        }
    }

    pub async fn health_check(&self) -> bool {
        let manager_lock = self.inner.manager.read().await;

        if let Some(manager) = manager_lock.as_ref() {
            let mut conn = manager.clone();
            match redis::cmd("PING").query_async::<String>(&mut conn).await {
                Ok(response) if response == "PONG" => {
                    debug!("Redis health check: OK");
                    self.inner.consecutive_failures.store(0, Ordering::Relaxed);
                    self.inner.is_healthy.store(true, Ordering::Relaxed);
                    self.inner.metrics.set_redis_health_status(true);
                    return true;
                }
                Ok(response) => {
                    warn!("Redis health check: unexpected response '{}'", response);
                }
                Err(e) => {
                    warn!("Redis health check failed: {}", e);
                    self.inner.consecutive_failures.fetch_add(1, Ordering::Relaxed);
                    self.inner.metrics.inc_redis_connection_failures();
                }
            }
        } else {
            warn!("Redis health check: no connection available");
        }

        self.inner.is_healthy.store(false, Ordering::Relaxed);
        self.inner.metrics.set_redis_health_status(false);
        false
    }

    pub fn is_healthy(&self) -> bool {
        self.inner.is_healthy.load(Ordering::Relaxed)
    }

    pub async fn start_health_check_loop(self) {
        let interval_secs = self.inner.config.health_check_interval_secs;
        info!(
            "Starting Redis health check loop (interval: {}s)",
            interval_secs
        );

        let mut interval = tokio::time::interval(Duration::from_secs(interval_secs));
        loop {
            interval.tick().await;
            self.health_check().await;
        }
    }
}

pub fn calculate_backoff(attempt: u64, max_backoff_secs: u64) -> Duration {
    let backoff_ms = INITIAL_BACKOFF_MS * 2u64.pow(attempt.min(10) as u32);
    let max_backoff_ms = max_backoff_secs * 1000;
    Duration::from_millis(backoff_ms.min(max_backoff_ms))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_calculate_backoff() {
        assert_eq!(calculate_backoff(0, 10), Duration::from_millis(100));
        assert_eq!(calculate_backoff(1, 10), Duration::from_millis(200));
        assert_eq!(calculate_backoff(2, 10), Duration::from_millis(400));
        assert_eq!(calculate_backoff(3, 10), Duration::from_millis(800));
        assert_eq!(calculate_backoff(4, 10), Duration::from_millis(1600));
        assert_eq!(calculate_backoff(5, 10), Duration::from_millis(3200));
        assert_eq!(calculate_backoff(6, 10), Duration::from_millis(6400));
        assert_eq!(calculate_backoff(7, 10), Duration::from_millis(10000));
        assert_eq!(calculate_backoff(8, 10), Duration::from_millis(10000));
        assert_eq!(calculate_backoff(100, 10), Duration::from_millis(10000));
    }

    #[test]
    fn test_calculate_backoff_different_max() {
        assert_eq!(calculate_backoff(0, 5), Duration::from_millis(100));
        assert_eq!(calculate_backoff(1, 5), Duration::from_millis(200));
        assert_eq!(calculate_backoff(2, 5), Duration::from_millis(400));
        assert_eq!(calculate_backoff(3, 5), Duration::from_millis(800));
        assert_eq!(calculate_backoff(4, 5), Duration::from_millis(1600));
        assert_eq!(calculate_backoff(5, 5), Duration::from_millis(3200));
        assert_eq!(calculate_backoff(6, 5), Duration::from_millis(5000));
        assert_eq!(calculate_backoff(7, 5), Duration::from_millis(5000));
        assert_eq!(calculate_backoff(100, 5), Duration::from_millis(5000));
    }
}
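For orientation, this is roughly how `calculate_backoff` is meant to be driven; a minimal sketch, assuming the values the tests above pin down (`INITIAL_BACKOFF_MS` = 100) and a hypothetical `do_connect` helper that is not part of this diff:

    // Sketch only: `do_connect` is a placeholder for any fallible async connect.
    async fn connect_with_backoff(max_backoff_secs: u64) {
        let mut attempt: u64 = 0;
        loop {
            if do_connect().await.is_ok() {
                return;
            }
            // 100ms, 200ms, 400ms, ... doubling per attempt (exponent capped at 10),
            // never exceeding max_backoff_secs * 1000 milliseconds overall.
            tokio::time::sleep(calculate_backoff(attempt, max_backoff_secs)).await;
            attempt += 1;
        }
    }

`start_health_check_loop` takes `self` by value for a similar reason: it never returns, so it is presumably handed straight to `tokio::spawn` on a clone of the pool, where it pings Redis every `health_check_interval_secs` seconds.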
+332  src/resilience/circuit_breaker.rs
···
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::RwLock;
use tracing::{debug, warn};

use crate::metrics::Metrics;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CircuitState {
    Closed,
    Open,
    HalfOpen,
}

#[derive(Debug, Clone)]
struct CircuitBreakerState {
    state: CircuitState,
    failure_count: u32,
    success_count: u32,
    last_failure_time: Option<Instant>,
    half_open_attempts: u32,
}

#[derive(Clone)]
pub struct CircuitBreaker {
    name: String,
    failure_threshold: u32,
    timeout: Duration,
    half_open_max_calls: u32,
    state: Arc<RwLock<CircuitBreakerState>>,
    metrics: Option<Metrics>,
}

impl CircuitBreaker {
    pub fn new(
        name: impl Into<String>,
        failure_threshold: u32,
        timeout_secs: u64,
        half_open_max_calls: u32,
    ) -> Self {
        Self {
            name: name.into(),
            failure_threshold,
            timeout: Duration::from_secs(timeout_secs),
            half_open_max_calls,
            state: Arc::new(RwLock::new(CircuitBreakerState {
                state: CircuitState::Closed,
                failure_count: 0,
                success_count: 0,
                last_failure_time: None,
                half_open_attempts: 0,
            })),
            metrics: None,
        }
    }

    pub fn with_metrics(
        name: impl Into<String>,
        failure_threshold: u32,
        timeout_secs: u64,
        half_open_max_calls: u32,
        metrics: Metrics,
    ) -> Self {
        Self {
            name: name.into(),
            failure_threshold,
            timeout: Duration::from_secs(timeout_secs),
            half_open_max_calls,
            state: Arc::new(RwLock::new(CircuitBreakerState {
                state: CircuitState::Closed,
                failure_count: 0,
                success_count: 0,
                last_failure_time: None,
                half_open_attempts: 0,
            })),
            metrics: Some(metrics),
        }
    }

    pub async fn is_available(&self) -> bool {
        let mut state = self.state.write().await;

        match state.state {
            CircuitState::Closed => true,
            CircuitState::Open => {
                if let Some(last_failure) = state.last_failure_time {
                    if last_failure.elapsed() >= self.timeout {
                        debug!(
                            "Circuit breaker '{}' transitioning from Open to HalfOpen (timeout elapsed)",
                            self.name
                        );
                        state.state = CircuitState::HalfOpen;
                        state.half_open_attempts = 1;
                        state.success_count = 0;
                        if let Some(metrics) = &self.metrics {
                            metrics.inc_circuit_breaker_transitions();
                        }
                        true
                    } else {
                        if let Some(metrics) = &self.metrics {
                            metrics.inc_circuit_breaker_rejections();
                        }
                        false
                    }
                } else {
                    if let Some(metrics) = &self.metrics {
                        metrics.inc_circuit_breaker_rejections();
                    }
                    false
                }
            }
            CircuitState::HalfOpen => {
                if state.half_open_attempts < self.half_open_max_calls {
                    state.half_open_attempts += 1;
                    true
                } else {
                    if let Some(metrics) = &self.metrics {
                        metrics.inc_circuit_breaker_rejections();
                    }
                    false
                }
            }
        }
    }

    pub async fn record_success(&self) {
        let mut state = self.state.write().await;

        match state.state {
            CircuitState::Closed => {
                state.failure_count = 0;
            }
            CircuitState::HalfOpen => {
                state.success_count += 1;
                if state.success_count >= 1 {
                    debug!(
                        "Circuit breaker '{}' transitioning from HalfOpen to Closed (success threshold met)",
                        self.name
                    );
                    state.state = CircuitState::Closed;
                    state.failure_count = 0;
                    state.success_count = 0;
                    state.half_open_attempts = 0;
                    state.last_failure_time = None;
                    if let Some(metrics) = &self.metrics {
                        metrics.inc_circuit_breaker_transitions();
                    }
                }
            }
            CircuitState::Open => {}
        }
    }

    pub async fn record_failure(&self) {
        let mut state = self.state.write().await;

        match state.state {
            CircuitState::Closed => {
                state.failure_count += 1;
                state.last_failure_time = Some(Instant::now());

                if state.failure_count >= self.failure_threshold {
                    warn!(
                        "Circuit breaker '{}' transitioning from Closed to Open (failure threshold {} reached)",
                        self.name, self.failure_threshold
                    );
                    state.state = CircuitState::Open;
                    if let Some(metrics) = &self.metrics {
                        metrics.inc_circuit_breaker_transitions();
                    }
                }
            }
            CircuitState::HalfOpen => {
                warn!(
                    "Circuit breaker '{}' transitioning from HalfOpen to Open (failure during half-open)",
                    self.name
                );
                state.state = CircuitState::Open;
                state.failure_count = self.failure_threshold;
                state.success_count = 0;
                state.half_open_attempts = 0;
                state.last_failure_time = Some(Instant::now());
                if let Some(metrics) = &self.metrics {
                    metrics.inc_circuit_breaker_transitions();
                }
            }
            CircuitState::Open => {
                state.last_failure_time = Some(Instant::now());
            }
        }
    }

    pub async fn get_state(&self) -> CircuitState {
        self.state.read().await.state
    }

    pub fn name(&self) -> &str {
        &self.name
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_circuit_breaker_starts_closed() {
        let cb = CircuitBreaker::new("test", 3, 60, 1);
        assert_eq!(cb.get_state().await, CircuitState::Closed);
        assert!(cb.is_available().await);
    }

    #[tokio::test]
    async fn test_circuit_breaker_opens_after_threshold() {
        let cb = CircuitBreaker::new("test", 3, 60, 1);

        assert!(cb.is_available().await);
        cb.record_failure().await;
        assert_eq!(cb.get_state().await, CircuitState::Closed);

        assert!(cb.is_available().await);
        cb.record_failure().await;
        assert_eq!(cb.get_state().await, CircuitState::Closed);

        assert!(cb.is_available().await);
        cb.record_failure().await;
        assert_eq!(cb.get_state().await, CircuitState::Open);

        assert!(!cb.is_available().await);
    }

    #[tokio::test]
    async fn test_circuit_breaker_half_open_after_timeout() {
        let cb = CircuitBreaker::new("test", 3, 1, 1);

        for _ in 0..3 {
            assert!(cb.is_available().await);
            cb.record_failure().await;
        }

        assert_eq!(cb.get_state().await, CircuitState::Open);
        assert!(!cb.is_available().await);

        tokio::time::sleep(Duration::from_secs(2)).await;

        assert!(cb.is_available().await);
        assert_eq!(cb.get_state().await, CircuitState::HalfOpen);
    }

    #[tokio::test]
    async fn test_circuit_breaker_closes_on_success_during_half_open() {
        let cb = CircuitBreaker::new("test", 3, 1, 1);

        for _ in 0..3 {
            assert!(cb.is_available().await);
            cb.record_failure().await;
        }

        assert_eq!(cb.get_state().await, CircuitState::Open);

        tokio::time::sleep(Duration::from_secs(2)).await;

        assert!(cb.is_available().await);
        assert_eq!(cb.get_state().await, CircuitState::HalfOpen);

        cb.record_success().await;
        assert_eq!(cb.get_state().await, CircuitState::Closed);
        assert!(cb.is_available().await);
    }

    #[tokio::test]
    async fn test_circuit_breaker_reopens_on_failure_during_half_open() {
        let cb = CircuitBreaker::new("test", 3, 1, 1);

        for _ in 0..3 {
            assert!(cb.is_available().await);
            cb.record_failure().await;
        }

        assert_eq!(cb.get_state().await, CircuitState::Open);

        tokio::time::sleep(Duration::from_secs(2)).await;

        assert!(cb.is_available().await);
        assert_eq!(cb.get_state().await, CircuitState::HalfOpen);

        cb.record_failure().await;
        assert_eq!(cb.get_state().await, CircuitState::Open);
        assert!(!cb.is_available().await);
    }

    #[tokio::test]
    async fn test_circuit_breaker_half_open_max_calls() {
        let cb = CircuitBreaker::new("test", 3, 1, 2);

        for _ in 0..3 {
            assert!(cb.is_available().await);
            cb.record_failure().await;
        }

        tokio::time::sleep(Duration::from_secs(2)).await;

        assert!(cb.is_available().await);
        assert_eq!(cb.get_state().await, CircuitState::HalfOpen);

        assert!(cb.is_available().await);
        assert_eq!(cb.get_state().await, CircuitState::HalfOpen);

        assert!(!cb.is_available().await);
    }

    #[tokio::test]
    async fn test_circuit_breaker_success_resets_failures_when_closed() {
        let cb = CircuitBreaker::new("test", 3, 60, 1);

        assert!(cb.is_available().await);
        cb.record_failure().await;
        assert!(cb.is_available().await);
        cb.record_failure().await;

        assert_eq!(cb.get_state().await, CircuitState::Closed);

        cb.record_success().await;

        assert!(cb.is_available().await);
        cb.record_failure().await;
        assert!(cb.is_available().await);
        cb.record_failure().await;

        assert_eq!(cb.get_state().await, CircuitState::Closed);
    }
}
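The intended call pattern around an outbound request is the usual check/record protocol; a minimal sketch, where `call_upstream` stands in for any fallible async operation (it is not part of this diff):

    // Sketch only: `call_upstream` is hypothetical.
    async fn guarded_call(cb: &CircuitBreaker) -> Result<(), &'static str> {
        if !cb.is_available().await {
            // Circuit is open (or the half-open probe budget is spent):
            // fail fast without touching the upstream.
            return Err("rejected by circuit breaker");
        }
        match call_upstream().await {
            Ok(_) => {
                cb.record_success().await; // closes the circuit from HalfOpen
                Ok(())
            }
            Err(_) => {
                cb.record_failure().await; // counts toward the threshold, or reopens
                Err("upstream failure")
            }
        }
    }

Note that `is_available` takes the write lock and may mutate state (the Open → HalfOpen transition and half-open attempt accounting happen inside it), so in the half-open state a `true` result consumes one probe slot.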
+3  src/resilience/mod.rs
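The three added lines of `src/resilience/mod.rs` are collapsed in this view. Given that the tests import `skywatch_phash_rs::resilience::CircuitBreaker`, they are presumably just the module declaration plus re-exports, along these lines (an assumption, not shown in the diff):

    pub mod circuit_breaker;

    pub use circuit_breaker::{CircuitBreaker, CircuitState};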
+192  tests/integration/blob_download_test.rs
···
use mockito::Server;
use reqwest::Client;
use std::time::Duration;

use super::helpers::{create_test_config, create_test_image_bytes};

/// Test successful CDN download (first attempt)
#[tokio::test]
async fn test_cdn_download_success_jpeg() {
    let mut server = Server::new_async().await;
    let image_bytes = create_test_image_bytes();

    // Mock CDN endpoint for the JPEG format
    let mock = server
        .mock("GET", "/img/feed_fullsize/plain/did:plc:test123/bafytest@jpeg")
        .with_status(200)
        .with_body(image_bytes.clone())
        .create_async()
        .await;

    let _config = create_test_config();
    let client = Client::new();

    // Override the CDN URL to point at the mock server
    let cdn_url = format!(
        "{}/img/feed_fullsize/plain/did:plc:test123/bafytest@jpeg",
        server.url()
    );

    let response = client.get(&cdn_url).send().await.unwrap();
    assert!(response.status().is_success());

    let downloaded = response.bytes().await.unwrap();
    assert_eq!(downloaded.to_vec(), image_bytes);

    mock.assert_async().await;
}

/// Test CDN failure falls back to PDS
#[tokio::test]
async fn test_cdn_failure_pds_fallback() {
    let mut cdn_server = Server::new_async().await;
    let mut pds_server = Server::new_async().await;

    let image_bytes = create_test_image_bytes();

    // CDN returns 404 for all formats
    let _cdn_jpeg = cdn_server
        .mock("GET", mockito::Matcher::Any)
        .with_status(404)
        .create_async()
        .await;

    // PDS succeeds
    let pds_mock = pds_server
        .mock(
            "GET",
            "/xrpc/com.atproto.sync.getBlob?did=did:plc:test123&cid=bafytest",
        )
        .with_status(200)
        .with_body(image_bytes.clone())
        .create_async()
        .await;

    let mut config = create_test_config();
    config.pds.endpoint = pds_server.url();

    let client = Client::builder()
        .timeout(Duration::from_secs(5))
        .build()
        .unwrap();

    // Simulate the fallback logic: try the CDN (fails), then the PDS (succeeds)
    let cdn_url = format!(
        "{}/img/feed_fullsize/plain/did:plc:test123/bafytest@jpeg",
        cdn_server.url()
    );
    let cdn_result = client.get(&cdn_url).send().await;
    assert!(cdn_result.is_err() || !cdn_result.unwrap().status().is_success());

    // Now try the PDS
    let pds_url = format!(
        "{}/xrpc/com.atproto.sync.getBlob?did=did:plc:test123&cid=bafytest",
        pds_server.url()
    );
    let pds_response = client.get(&pds_url).send().await.unwrap();
    assert!(pds_response.status().is_success());

    let downloaded = pds_response.bytes().await.unwrap();
    assert_eq!(downloaded.to_vec(), image_bytes);

    pds_mock.assert_async().await;
}

/// Test blob download timeout
#[tokio::test]
async fn test_blob_download_timeout() {
    let mut server = Server::new_async().await;

    // Note: mockito v1 doesn't easily support delay simulation.
    // This test would exercise the timeout against a genuinely slow server;
    // here we simplify and only verify the timeout mechanism is wired up,
    // using a fast mock response.
    let image_bytes = create_test_image_bytes();
    let _mock = server
        .mock("GET", "/img/feed_fullsize/plain/did:plc:test123/bafytest@jpeg")
        .with_status(200)
        .with_body(image_bytes)
        .create_async()
        .await;

    let client = Client::builder()
        .timeout(Duration::from_millis(500))
        .build()
        .unwrap();

    let url = format!(
        "{}/img/feed_fullsize/plain/did:plc:test123/bafytest@jpeg",
        server.url()
    );

    // Since mockito can't delay the response, we just verify the timeout is configured
    let result = client.get(&url).send().await;
    // Against a real slow server this would time out; with the fast mock it
    // should simply complete.
    assert!(result.is_ok() || result.unwrap_err().is_timeout());
}

/// Test CDN tries multiple formats before falling back to PDS
#[tokio::test]
async fn test_cdn_tries_all_formats_before_pds() {
    let mut cdn_server = Server::new_async().await;
    let mut pds_server = Server::new_async().await;

    let image_bytes = create_test_image_bytes();

    // CDN 404s for jpeg and png, succeeds for webp
    let _jpeg_mock = cdn_server
        .mock("GET", "/img/feed_fullsize/plain/did:plc:test123/bafytest@jpeg")
        .with_status(404)
        .create_async()
        .await;

    let _png_mock = cdn_server
        .mock("GET", "/img/feed_fullsize/plain/did:plc:test123/bafytest@png")
        .with_status(404)
        .create_async()
        .await;

    let webp_mock = cdn_server
        .mock("GET", "/img/feed_fullsize/plain/did:plc:test123/bafytest@webp")
        .with_status(200)
        .with_body(image_bytes.clone())
        .create_async()
        .await;

    let mut config = create_test_config();
    config.pds.endpoint = pds_server.url();

    let client = Client::builder()
        .timeout(Duration::from_secs(5))
        .build()
        .unwrap();

    // Try JPEG (fails)
    let jpeg_url = format!(
        "{}/img/feed_fullsize/plain/did:plc:test123/bafytest@jpeg",
        cdn_server.url()
    );
    let jpeg_result = client.get(&jpeg_url).send().await.unwrap();
    assert!(!jpeg_result.status().is_success());

    // Try PNG (fails)
    let png_url = format!(
        "{}/img/feed_fullsize/plain/did:plc:test123/bafytest@png",
        cdn_server.url()
    );
    let png_result = client.get(&png_url).send().await.unwrap();
    assert!(!png_result.status().is_success());

    // Try WebP (succeeds)
    let webp_url = format!(
        "{}/img/feed_fullsize/plain/did:plc:test123/bafytest@webp",
        cdn_server.url()
    );
    let webp_response = client.get(&webp_url).send().await.unwrap();
    assert!(webp_response.status().is_success());

    let downloaded = webp_response.bytes().await.unwrap();
    assert_eq!(downloaded.to_vec(), image_bytes);

    webp_mock.assert_async().await;
}
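These tests exercise each leg of the fallback separately. Pieced together, the download path they model looks roughly like this; a sketch only — the CDN host matches the one the worker tests say is hardcoded, but `FORMATS` and the function itself are illustrative:

    // Illustrative fallback order: CDN per format (jpeg, png, webp), then PDS.
    const FORMATS: [&str; 3] = ["jpeg", "png", "webp"];

    async fn fetch_blob(
        client: &reqwest::Client,
        did: &str,
        cid: &str,
        pds_endpoint: &str,
    ) -> Option<Vec<u8>> {
        for fmt in FORMATS {
            let url =
                format!("https://cdn.bsky.app/img/feed_fullsize/plain/{did}/{cid}@{fmt}");
            if let Ok(resp) = client.get(&url).send().await {
                if resp.status().is_success() {
                    return resp.bytes().await.ok().map(|b| b.to_vec());
                }
            }
        }
        // Last resort: fetch the raw blob from the owning PDS.
        let url = format!("{pds_endpoint}/xrpc/com.atproto.sync.getBlob?did={did}&cid={cid}");
        match client.get(&url).send().await {
            Ok(resp) if resp.status().is_success() => resp.bytes().await.ok().map(|b| b.to_vec()),
            _ => None,
        }
    }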
+214  tests/integration/cache_test.rs
···
use skywatch_phash_rs::cache::PhashCache;
use skywatch_phash_rs::config::RedisConfig;
use skywatch_phash_rs::metrics::Metrics;
use skywatch_phash_rs::redis_pool::RedisPool;

/// Helper to check if Redis is available
async fn redis_available() -> bool {
    let url = std::env::var("REDIS_URL").unwrap_or_else(|_| "redis://localhost:6379".to_string());
    match redis::Client::open(url.as_str()) {
        Ok(client) => client.get_multiplexed_async_connection().await.is_ok(),
        Err(_) => false,
    }
}

/// Helper to create a test Redis pool
async fn create_test_redis_pool() -> Option<RedisPool> {
    if !redis_available().await {
        eprintln!("Skipping test: Redis not available");
        return None;
    }

    let config = RedisConfig {
        url: std::env::var("REDIS_URL")
            .unwrap_or_else(|_| "redis://localhost:6379".to_string()),
        health_check_interval_secs: 30,
        max_backoff_secs: 10,
    };

    let metrics = Metrics::new();
    RedisPool::new(config, metrics).await.ok()
}

/// Test cache miss scenario
#[tokio::test]
async fn test_cache_miss() {
    let Some(pool) = create_test_redis_pool().await else {
        return;
    };

    let cache = PhashCache::new(pool, 60, true);

    // Clear any existing value
    let _ = cache.delete("test-cid-miss").await;

    let result = cache.get("test-cid-miss").await.unwrap();
    assert!(result.is_none());
}

/// Test cache hit scenario
#[tokio::test]
async fn test_cache_hit() {
    let Some(pool) = create_test_redis_pool().await else {
        return;
    };

    let cache = PhashCache::new(pool, 60, true);

    let cid = "test-cid-hit";
    let phash = "deadbeefdeadbeef";

    // Set value
    cache.set(cid, phash).await.unwrap();

    // Get value
    let result = cache.get(cid).await.unwrap();
    assert_eq!(result, Some(phash.to_string()));

    // Cleanup
    let _ = cache.delete(cid).await;
}

/// Test cache set and delete
#[tokio::test]
async fn test_cache_set_delete() {
    let Some(pool) = create_test_redis_pool().await else {
        return;
    };

    let cache = PhashCache::new(pool, 60, true);

    let cid = "test-cid-delete";
    let phash = "cafebabecafebabe";

    // Set value
    cache.set(cid, phash).await.unwrap();

    // Verify it exists
    let result = cache.get(cid).await.unwrap();
    assert_eq!(result, Some(phash.to_string()));

    // Delete it
    cache.delete(cid).await.unwrap();

    // Verify it's gone
    let result = cache.get(cid).await.unwrap();
    assert!(result.is_none());
}

/// Test get_or_compute with cache miss
#[tokio::test]
async fn test_get_or_compute_miss() {
    let Some(pool) = create_test_redis_pool().await else {
        return;
    };

    let cache = PhashCache::new(pool, 60, true);

    let cid = "test-cid-compute-miss";
    let expected_phash = "1234567812345678";

    // Clear any existing value
    let _ = cache.delete(cid).await;

    let mut compute_called = false;
    let result = cache
        .get_or_compute(cid, || async {
            compute_called = true;
            Ok::<String, miette::Report>(expected_phash.to_string())
        })
        .await
        .unwrap();

    assert_eq!(result, expected_phash);
    assert!(compute_called, "Compute function should have been called");

    // Verify it was cached
    let cached = cache.get(cid).await.unwrap();
    assert_eq!(cached, Some(expected_phash.to_string()));

    // Cleanup
    let _ = cache.delete(cid).await;
}

/// Test get_or_compute with cache hit
#[tokio::test]
async fn test_get_or_compute_hit() {
    let Some(pool) = create_test_redis_pool().await else {
        return;
    };

    let cache = PhashCache::new(pool, 60, true);

    let cid = "test-cid-compute-hit";
    let cached_phash = "abcdef0123456789";

    // Pre-populate cache
    cache.set(cid, cached_phash).await.unwrap();

    let mut compute_called = false;
    let result = cache
        .get_or_compute(cid, || async {
            compute_called = true;
            Ok::<String, miette::Report>("should-not-be-returned".to_string())
        })
        .await
        .unwrap();

    assert_eq!(result, cached_phash);
    assert!(!compute_called, "Compute function should NOT have been called");

    // Cleanup
    let _ = cache.delete(cid).await;
}

/// Test cache disabled behavior
#[tokio::test]
async fn test_cache_disabled() {
    let Some(pool) = create_test_redis_pool().await else {
        return;
    };

    let cache = PhashCache::new(pool, 60, false); // disabled

    let cid = "test-cid-disabled";
    let phash = "0000111100001111";

    // Set should be a no-op when disabled
    cache.set(cid, phash).await.unwrap();

    // Get should return None when disabled
    let result = cache.get(cid).await.unwrap();
    assert!(result.is_none());

    // is_enabled should return false
    assert!(!cache.is_enabled());
}

/// Test cache TTL expiration (requires waiting)
#[tokio::test]
#[ignore] // Ignored by default as it takes time
async fn test_cache_ttl_expiration() {
    let Some(pool) = create_test_redis_pool().await else {
        return;
    };

    let cache = PhashCache::new(pool, 2, true); // 2 second TTL

    let cid = "test-cid-ttl";
    let phash = "fedcbafedcba9876";

    // Set value
    cache.set(cid, phash).await.unwrap();

    // Verify it exists
    let result = cache.get(cid).await.unwrap();
    assert_eq!(result, Some(phash.to_string()));

    // Wait for the TTL to expire
    tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;

    // Verify it's gone
    let result = cache.get(cid).await.unwrap();
    assert!(result.is_none());
}
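The contract these tests pin down for `get_or_compute` amounts to read-through caching; a sketch of the semantics (not the actual implementation), assuming the `miette::Report` error type the closures above use:

    // Sketch of the read-through contract verified by the tests above.
    async fn get_or_compute_sketch<F, Fut>(
        cache: &PhashCache,
        cid: &str,
        compute: F,
    ) -> Result<String, miette::Report>
    where
        F: FnOnce() -> Fut,
        Fut: std::future::Future<Output = Result<String, miette::Report>>,
    {
        if let Some(hit) = cache.get(cid).await? {
            return Ok(hit); // hit: the compute closure is never invoked
        }
        let value = compute().await?; // miss: compute, then backfill the cache
        cache.set(cid, &value).await?;
        Ok(value)
    }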
+123  tests/integration/helpers.rs
···
use jacquard_common::types::string::{AtUri, Cid, Did};
use jacquard_common::CowStr;
use jacquard_common::IntoStatic;
use skywatch_phash_rs::types::{BlobCheck, BlobReference, ImageJob};

/// Create a test blob check
pub fn create_test_blob_check(
    phashes: Vec<&str>,
    label: &str,
    report_post: bool,
    to_label: bool,
    hamming_threshold: Option<u32>,
) -> BlobCheck {
    BlobCheck {
        phashes: phashes
            .into_iter()
            .map(|p| CowStr::from(p.to_string()).into_static())
            .collect(),
        label: CowStr::from(label.to_string()).into_static(),
        comment: CowStr::from("Test comment".to_string()).into_static(),
        report_acct: false,
        label_acct: false,
        report_post,
        to_label,
        takedown_post: false,
        takedown_acct: false,
        hamming_threshold,
        description: None,
        ignore_did: None,
    }
}

/// Create a test image job
pub fn create_test_image_job(
    post_uri: &str,
    post_cid: &str,
    post_did: &str,
    blob_cids: Vec<&str>,
) -> ImageJob {
    ImageJob {
        post_uri: AtUri::new(post_uri).unwrap().into_static(),
        post_cid: Cid::str(post_cid).into_static(),
        post_did: Did::new(post_did).unwrap().into_static(),
        blobs: blob_cids
            .into_iter()
            .map(|cid| BlobReference {
                cid: Cid::str(cid).into_static(),
                mime_type: Some(CowStr::from("image/jpeg").into_static()),
            })
            .collect(),
        timestamp: chrono::Utc::now().timestamp(),
        attempts: 0,
    }
}

/// Generate a valid phash string (16 hex characters)
pub fn generate_phash(seed: u64) -> String {
    format!("{:016x}", seed)
}

/// Create the bytes of a 1x1 black PNG image (a valid PNG)
pub fn create_test_image_bytes() -> Vec<u8> {
    // Valid 1x1 black-pixel PNG, encoded with the image crate
    use image::{ImageBuffer, Rgb};

    let img: ImageBuffer<Rgb<u8>, Vec<u8>> = ImageBuffer::from_pixel(1, 1, Rgb([0, 0, 0]));
    let mut bytes: Vec<u8> = Vec::new();
    img.write_to(&mut std::io::Cursor::new(&mut bytes), image::ImageFormat::Png)
        .expect("Failed to encode test image");
    bytes
}

/// Create a test config for integration tests
pub fn create_test_config() -> skywatch_phash_rs::config::Config {
    skywatch_phash_rs::config::Config {
        moderation: skywatch_phash_rs::config::ModerationConfig {
            labeler_did: "did:plc:test".to_string(),
            rate_limit: 100,
        },
        ozone: skywatch_phash_rs::config::OzoneConfig {
            url: "http://localhost:8080".to_string(),
            pds: "http://localhost:8081".to_string(),
        },
        automod: skywatch_phash_rs::config::AutomodConfig {
            handle: "test.bsky.social".to_string(),
            password: "test".to_string(),
        },
        pds: skywatch_phash_rs::config::PdsConfig {
            endpoint: "http://localhost:8081".to_string(),
        },
        phash: skywatch_phash_rs::config::PhashConfig {
            default_hamming_threshold: 3,
        },
        processing: skywatch_phash_rs::config::ProcessingConfig {
            concurrency: 10,
            retry_attempts: 3,
            retry_delay: 100,
            blob_download_timeout_secs: 5,
            blob_total_timeout_secs: 15,
        },
        cache: skywatch_phash_rs::config::CacheConfig {
            enabled: true,
            ttl: 3600,
        },
        redis: skywatch_phash_rs::config::RedisConfig {
            url: "redis://localhost:6379".to_string(),
            health_check_interval_secs: 30,
            max_backoff_secs: 10,
        },
        jetstream: skywatch_phash_rs::config::JetstreamConfig {
            url: "ws://localhost:6008".to_string(),
            fallback_urls: vec![],
            wanted_collections: vec!["app.bsky.feed.post".to_string()],
            cursor_update_interval: 10000,
            retry_delay_secs: 5,
            max_retry_delay_secs: 300,
        },
        plc: skywatch_phash_rs::config::PlcConfig {
            endpoint: "http://localhost:8082".to_string(),
            fallback_endpoints: vec![],
        },
    }
}
+5  tests/integration/mod.rs
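The five added lines of `tests/integration/mod.rs` are collapsed here. Matching the five files added under `tests/integration/`, they are presumably one module declaration per file (an assumption, not expanded in the diff):

    mod blob_download_test;
    mod cache_test;
    mod helpers;
    mod moderation_test;
    mod worker_test;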
+270  tests/integration/moderation_test.rs
···
use mockito::Server;
use skywatch_phash_rs::moderation::rate_limiter::RateLimiter;
use skywatch_phash_rs::resilience::CircuitBreaker;
use std::sync::Arc;
use std::time::Duration;

/// Test rate limiter allows requests within limit
#[tokio::test]
async fn test_rate_limiter_allows_requests() {
    let rate_limiter = RateLimiter::new(100).unwrap(); // 100ms between requests = 10/s

    // Should complete quickly
    let start = std::time::Instant::now();
    for _ in 0..5 {
        rate_limiter.wait().await;
    }
    let elapsed = start.elapsed();

    // 5 requests at 10/s should take < 1s
    assert!(
        elapsed < Duration::from_secs(1),
        "Rate limiter blocked unnecessarily: {:?}",
        elapsed
    );
}

/// Test rate limiter enforces rate limit
#[tokio::test]
async fn test_rate_limiter_enforces_limit() {
    // Very low rate: 2 per second = 500ms between requests
    let rate_limiter = RateLimiter::new(500).unwrap();

    let start = std::time::Instant::now();

    // First 2 should be immediate
    rate_limiter.wait().await;
    rate_limiter.wait().await;

    // 3rd request should be delayed
    rate_limiter.wait().await;

    let elapsed = start.elapsed();

    // Should take at least 500ms for the 3rd request
    assert!(
        elapsed >= Duration::from_millis(400),
        "Rate limiter didn't enforce limit: {:?}",
        elapsed
    );
}

/// Test circuit breaker integration with retry logic
#[tokio::test]
async fn test_circuit_breaker_blocks_after_failures() {
    let cb = CircuitBreaker::new("test-ozone", 3, 60, 1);

    // First 3 failures should open the circuit
    for i in 0..3 {
        assert!(
            cb.is_available().await,
            "Circuit should be available for failure {}",
            i + 1
        );
        cb.record_failure().await;
    }

    // Circuit should now be open
    assert!(
        !cb.is_available().await,
        "Circuit should be open after threshold"
    );
}

/// Test circuit breaker allows requests when closed
#[tokio::test]
async fn test_circuit_breaker_allows_when_closed() {
    let cb = CircuitBreaker::new("test-ozone", 5, 60, 1);

    // Should allow requests when closed
    for _ in 0..10 {
        assert!(cb.is_available().await);
        cb.record_success().await;
    }
}

/// Test circuit breaker transitions to half-open
#[tokio::test]
async fn test_circuit_breaker_half_open_transition() {
    let cb = CircuitBreaker::new("test-ozone", 3, 1, 1);

    // Open the circuit
    for _ in 0..3 {
        assert!(cb.is_available().await);
        cb.record_failure().await;
    }

    assert!(!cb.is_available().await);

    // Wait for the timeout
    tokio::time::sleep(Duration::from_secs(2)).await;

    // Should transition to half-open and allow one request
    assert!(
        cb.is_available().await,
        "Circuit should allow request in half-open state"
    );
}

/// Test moderation retry logic with transient errors
#[tokio::test]
async fn test_moderation_retry_on_transient_error() {
    let mut server = Server::new_async().await;

    // Mock: the first request fails, the second succeeds
    let _mock1 = server
        .mock("POST", "/xrpc/tools.ozone.moderation.emitEvent")
        .with_status(503)
        .expect(1)
        .create_async()
        .await;

    let _mock2 = server
        .mock("POST", "/xrpc/tools.ozone.moderation.emitEvent")
        .with_status(200)
        .expect(1)
        .create_async()
        .await;

    // This tests that retry logic would work.
    // In practice, the retry happens in send_moderation_event.
    let client = reqwest::Client::new();

    // First attempt fails
    let response1 = client
        .post(format!(
            "{}/xrpc/tools.ozone.moderation.emitEvent",
            server.url()
        ))
        .send()
        .await
        .unwrap();
    assert_eq!(response1.status(), 503);

    // Second attempt succeeds
    let response2 = client
        .post(format!(
            "{}/xrpc/tools.ozone.moderation.emitEvent",
            server.url()
        ))
        .send()
        .await
        .unwrap();
    assert_eq!(response2.status(), 200);
}

/// Test moderation gives up after max retries
#[tokio::test]
async fn test_moderation_exhausts_retries() {
    let mut server = Server::new_async().await;

    // Always return 503
    let _mock = server
        .mock("POST", "/xrpc/tools.ozone.moderation.emitEvent")
        .with_status(503)
        .expect(3) // MAX_RETRIES in helpers.rs is 3
        .create_async()
        .await;

    let client = reqwest::Client::new();

    // Simulate 3 retry attempts
    for i in 0..3 {
        let response = client
            .post(format!(
                "{}/xrpc/tools.ozone.moderation.emitEvent",
                server.url()
            ))
            .send()
            .await
            .unwrap();
        assert_eq!(
            response.status(),
            503,
            "Attempt {} should fail with 503",
            i + 1
        );
    }
}

/// Test circuit breaker prevents cascading failures
#[tokio::test]
async fn test_circuit_breaker_prevents_cascade() {
    let cb = Arc::new(CircuitBreaker::new("test-cascade", 2, 1, 1));
    let mut server = Server::new_async().await;

    // Server always fails
    let _mock = server
        .mock("POST", "/xrpc/tools.ozone.moderation.emitEvent")
        .with_status(500)
        .expect(2) // Only 2 requests should make it through before the circuit opens
        .create_async()
        .await;

    let client = reqwest::Client::new();

    // The first 2 requests go through and fail
    for i in 0..2 {
        if cb.is_available().await {
            let response = client
                .post(format!(
                    "{}/xrpc/tools.ozone.moderation.emitEvent",
                    server.url()
                ))
                .send()
                .await
                .unwrap();
            assert_eq!(response.status(), 500, "Request {} should fail", i + 1);
            cb.record_failure().await;
        }
    }

    // Circuit should now be open
    assert!(
        !cb.is_available().await,
        "Circuit should be open after 2 failures"
    );

    // Subsequent requests should be blocked without hitting the server
    for _ in 0..5 {
        assert!(!cb.is_available().await);
    }
}

/// Test exponential backoff increases correctly
#[tokio::test]
async fn test_exponential_backoff_timing() {
    let backoffs = vec![
        Duration::from_millis(100),
        Duration::from_millis(200),
        Duration::from_millis(400),
    ];

    let start = std::time::Instant::now();

    for (i, expected_backoff) in backoffs.iter().enumerate() {
        let iteration_start = std::time::Instant::now();
        tokio::time::sleep(*expected_backoff).await;
        let iteration_elapsed = iteration_start.elapsed();

        // Allow a 50ms margin for timing variance
        assert!(
            iteration_elapsed >= *expected_backoff
                && iteration_elapsed < *expected_backoff + Duration::from_millis(50),
            "Backoff {} should be ~{:?}, was {:?}",
            i + 1,
            expected_backoff,
            iteration_elapsed
        );
    }

    let total_elapsed = start.elapsed();
    let expected_total = Duration::from_millis(700); // 100 + 200 + 400

    assert!(
        total_elapsed >= expected_total
            && total_elapsed < expected_total + Duration::from_millis(100),
        "Total backoff should be ~{:?}, was {:?}",
        expected_total,
        total_elapsed
    );
}
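Composed, the pieces these tests cover give the request path a shape like the following; a minimal sketch, with `emit_event` standing in for the real Ozone call (it is not part of this diff):

    // Sketch only: rate-limit first, then consult the breaker, then record the outcome.
    async fn send_with_resilience(
        rate_limiter: &RateLimiter,
        cb: &CircuitBreaker,
    ) -> Result<(), &'static str> {
        rate_limiter.wait().await; // spaces requests out (e.g. 100ms apart)
        if !cb.is_available().await {
            return Err("circuit open"); // fail fast, don't hit the upstream
        }
        match emit_event().await {
            Ok(_) => {
                cb.record_success().await;
                Ok(())
            }
            Err(_) => {
                cb.record_failure().await;
                Err("upstream failure")
            }
        }
    }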
+304  tests/integration/worker_test.rs
···
use mockito::Server;
use skywatch_phash_rs::processor::matcher;
use skywatch_phash_rs::processor::phash;

use super::helpers::{
    create_test_blob_check, create_test_image_bytes, create_test_image_job, generate_phash,
};

/// Test match_phash finds exact match
#[tokio::test]
async fn test_match_phash_exact_match() {
    let phash = generate_phash(0xdeadbeefdeadbeef);
    let checks = vec![create_test_blob_check(
        vec![&phash],
        "test-label",
        true,
        true,
        Some(3),
    )];

    let result = matcher::match_phash(&phash, &checks, "did:plc:test", 3);

    assert!(result.is_some());
    let match_result = result.unwrap();
    assert_eq!(match_result.hamming_distance, 0);
    assert_eq!(match_result.matched_check.label.as_str(), "test-label");
}

/// Test match_phash finds match within threshold
#[tokio::test]
async fn test_match_phash_within_threshold() {
    let target_phash = generate_phash(0xdeadbeefdeadbeef);
    let similar_phash = generate_phash(0xdeadbeefdeadbeee); // 1 bit different

    let checks = vec![create_test_blob_check(
        vec![&target_phash],
        "test-label",
        true,
        true,
        Some(3),
    )];

    let result = matcher::match_phash(&similar_phash, &checks, "did:plc:test", 3);

    assert!(result.is_some());
    let match_result = result.unwrap();
    assert!(match_result.hamming_distance <= 3);
}

/// Test match_phash rejects match exceeding threshold
#[tokio::test]
async fn test_match_phash_exceeds_threshold() {
    let target_phash = generate_phash(0xdeadbeefdeadbeef);
    let different_phash = generate_phash(0x0000000000000000);

    let checks = vec![create_test_blob_check(
        vec![&target_phash],
        "test-label",
        true,
        true,
        Some(3),
    )];

    let result = matcher::match_phash(&different_phash, &checks, "did:plc:test", 3);

    assert!(result.is_none());
}

/// Test match_phash respects ignore_did list
#[tokio::test]
async fn test_match_phash_ignores_did() {
    use jacquard_common::types::string::Did;
    use jacquard_common::IntoStatic;

    let phash = generate_phash(0xdeadbeefdeadbeef);
    let ignored_did = "did:plc:ignored";

    let mut check = create_test_blob_check(vec![&phash], "test-label", true, true, Some(3));
    check.ignore_did = Some(vec![Did::new(ignored_did).unwrap().into_static()]);

    let checks = vec![check];

    let result = matcher::match_phash(&phash, &checks, ignored_did, 3);

    assert!(result.is_none());
}

/// Test blob download succeeds from CDN
#[tokio::test]
async fn test_download_blob_cdn_success() {
    let mut server = Server::new_async().await;
    let image_bytes = create_test_image_bytes();

    let _mock = server
        .mock("GET", "/img/feed_fullsize/plain/did:plc:test/bafytest@jpeg")
        .with_status(200)
        .with_body(image_bytes.clone())
        .create_async()
        .await;

    // Note: This test demonstrates the download pattern, but doesn't actually test
    // download_blob directly, since it hardcodes cdn.bsky.app.
    // In a real implementation, we'd need dependency injection for the CDN URL.
}

/// Test phash computation produces a valid hash
#[tokio::test]
async fn test_compute_phash_valid() {
    let image_bytes = create_test_image_bytes();
    let phash = phash::compute_phash(&image_bytes).unwrap();

    // Should be 16 hex characters
    assert_eq!(phash.len(), 16);

    // Should be valid hex
    assert!(u64::from_str_radix(&phash, 16).is_ok());
}

/// Test phash computation is deterministic
#[tokio::test]
async fn test_compute_phash_deterministic() {
    let image_bytes = create_test_image_bytes();

    let phash1 = phash::compute_phash(&image_bytes).unwrap();
    let phash2 = phash::compute_phash(&image_bytes).unwrap();

    assert_eq!(phash1, phash2);
}

/// Test hamming distance calculation
#[tokio::test]
async fn test_hamming_distance() {
    let hash1 = "deadbeefdeadbeef";
    let hash2 = "deadbeefdeadbeef";
    let distance = phash::hamming_distance(hash1, hash2).unwrap();
    assert_eq!(distance, 0);

    let hash3 = "deadbeefdeadbeee";
    let distance2 = phash::hamming_distance(hash1, hash3).unwrap();
    assert!(distance2 > 0 && distance2 <= 4);
}

/// Test job processing flow with cache miss
#[tokio::test]
async fn test_job_processing_cache_miss_flow() {
    // This test demonstrates the worker flow:
    // 1. Receive job
    // 2. Check cache (miss)
    // 3. Download blob
    // 4. Compute phash
    // 5. Store in cache
    // 6. Check for matches
    // 7. Take moderation actions

    let job = create_test_image_job(
        "at://did:plc:test/app.bsky.feed.post/123",
        "bafytest",
        "did:plc:test",
        vec!["bafyblob123"],
    );

    assert_eq!(job.blobs.len(), 1);
    assert_eq!(job.attempts, 0);
}

/// Test multiple blobs in a single job
#[tokio::test]
async fn test_job_processing_multiple_blobs() {
    let job = create_test_image_job(
        "at://did:plc:test/app.bsky.feed.post/456",
        "bafytest",
        "did:plc:test",
        vec!["bafyblob1", "bafyblob2", "bafyblob3"],
    );

    assert_eq!(job.blobs.len(), 3);

    // Each blob would be processed independently
    for blob in &job.blobs {
        assert!(!blob.cid.is_empty());
    }
}

/// Test job retry increment
#[tokio::test]
async fn test_job_retry_increment() {
    let mut job = create_test_image_job(
        "at://did:plc:test/app.bsky.feed.post/789",
        "bafytest",
        "did:plc:test",
        vec!["bafyblob"],
    );

    assert_eq!(job.attempts, 0);

    // Simulate retry
    job.attempts += 1;
    assert_eq!(job.attempts, 1);

    job.attempts += 1;
    assert_eq!(job.attempts, 2);
}

/// Test moderation action selection based on check flags
#[tokio::test]
async fn test_moderation_action_selection() {
    // Check with report_post=true, to_label=false
    let check1 = create_test_blob_check(vec!["abc123"], "spam", true, false, Some(3));
    assert!(check1.report_post);
    assert!(!check1.to_label);

    // Check with report_post=false, to_label=true
    let check2 = create_test_blob_check(vec!["def456"], "nsfw", false, true, Some(3));
    assert!(!check2.report_post);
    assert!(check2.to_label);

    // Check with both enabled
    let check3 = create_test_blob_check(vec!["ghi789"], "csam", true, true, Some(1));
    assert!(check3.report_post);
    assert!(check3.to_label);
}

/// Test worker handles an empty blob list gracefully
#[tokio::test]
async fn test_job_with_no_blobs() {
    let job = create_test_image_job(
        "at://did:plc:test/app.bsky.feed.post/empty",
        "bafytest",
        "did:plc:test",
        vec![],
    );

    assert_eq!(job.blobs.len(), 0);
}

/// Test blob check uses the default threshold when not specified
#[tokio::test]
async fn test_blob_check_default_threshold() {
    let check = create_test_blob_check(vec!["test"], "label", true, true, None);
    assert!(check.hamming_threshold.is_none());

    // In actual matching, config.phash.default_hamming_threshold would be used
    let default_threshold = 3;
    let effective_threshold = check.hamming_threshold.unwrap_or(default_threshold);
    assert_eq!(effective_threshold, 3);
}

/// Test blob check with a custom threshold
#[tokio::test]
async fn test_blob_check_custom_threshold() {
    let check = create_test_blob_check(vec!["test"], "label", true, true, Some(5));
    assert_eq!(check.hamming_threshold, Some(5));
}

/// Test worker metrics tracking
#[tokio::test]
async fn test_worker_metrics_tracking() {
    use skywatch_phash_rs::metrics::Metrics;

    let metrics = Metrics::new();

    // Simulate processing
    metrics.inc_jobs_processed();
    metrics.inc_blobs_processed();
    metrics.inc_blobs_downloaded();

    // Cache operations
    metrics.inc_cache_hits();
    metrics.inc_cache_misses();

    // Matches
    metrics.inc_matches_found();

    // Moderation actions
    metrics.inc_posts_reported();
    metrics.inc_posts_labeled();

    // All metrics should increment without panicking
}

/// Test match result structure
#[tokio::test]
async fn test_match_result_structure() {
    let phash = generate_phash(0xdeadbeef);
    let checks = vec![create_test_blob_check(
        vec![&phash],
        "test-label",
        true,
        true,
        Some(3),
    )];

    let result = matcher::match_phash(&phash, &checks, "did:plc:test", 3);

    assert!(result.is_some());
    let match_result = result.unwrap();

    // Verify all fields are populated
    assert_eq!(match_result.phash.as_str(), &phash);
    assert_eq!(match_result.matched_check.label.as_str(), "test-label");
    assert_eq!(match_result.matched_phash.as_str(), &phash);
    assert_eq!(match_result.hamming_distance, 0);
}
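For these 16-hex-character phashes, `hamming_distance` is equivalent to decoding both sides as `u64` and counting differing bits; a sketch of the expected semantics (not the crate's implementation):

    // Equivalent computation for 64-bit phashes encoded as 16 hex characters.
    fn hamming_distance_sketch(a: &str, b: &str) -> Option<u32> {
        let x = u64::from_str_radix(a, 16).ok()?;
        let y = u64::from_str_radix(b, 16).ok()?;
        Some((x ^ y).count_ones()) // number of bit positions where the hashes differ
    }

    // e.g. "deadbeefdeadbeef" vs "deadbeefdeadbeee" -> 1

This is also why the matchers' thresholds are small integers: a hamming threshold of 3 tolerates at most 3 differing bits out of 64.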
+1  tests/integration_tests.rs
···
mod integration;