fix: flag copyright matches by presence, not score (#384)

The AuDD enterprise API doesn't return confidence scores (every score is 0),
so we now flag a track whenever any match is found instead of comparing the
highest score against a threshold.

Also adds a --max-duration flag to the scan script to skip long DJ sets (duration is estimated from file size).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude <noreply@anthropic.com>

authored by zzstoatzz.io Claude and committed by GitHub c72fceba 434fa34f

Changed files
+94 -13
moderation
src
scripts
+2 -8
moderation/src/main.rs
··· 21 21 auth_token: Option<String>, 22 22 audd_api_token: String, 23 23 audd_api_url: String, 24 - score_threshold: i32, 25 24 } 26 25 27 26 impl Config { ··· 37 36 .map_err(|_| anyhow!("MODERATION_AUDD_API_TOKEN is required"))?, 38 37 audd_api_url: env::var("MODERATION_AUDD_API_URL") 39 38 .unwrap_or_else(|_| "https://enterprise.audd.io/".to_string()), 40 - score_threshold: env::var("MODERATION_SCORE_THRESHOLD") 41 - .ok() 42 - .and_then(|v| v.parse().ok()) 43 - .unwrap_or(70), 44 39 }) 45 40 } 46 41 } ··· 136 131 .with_state(AppState { 137 132 audd_api_token: config.audd_api_token, 138 133 audd_api_url: config.audd_api_url, 139 - score_threshold: config.score_threshold, 140 134 }); 141 135 142 136 let addr: SocketAddr = format!("{}:{}", config.host, config.port) ··· 155 149 struct AppState { 156 150 audd_api_token: String, 157 151 audd_api_url: String, 158 - score_threshold: i32, 159 152 } 160 153 161 154 // --- middleware --- ··· 233 226 234 227 let matches = extract_matches(&audd_response); 235 228 let highest_score = matches.iter().map(|m| m.score).max().unwrap_or(0); 236 - let is_flagged = highest_score >= state.score_threshold; 229 + // flag if any matches are found - audd enterprise doesn't return confidence scores 230 + let is_flagged = !matches.is_empty(); 237 231 238 232 info!( 239 233 match_count = matches.len(),
+92 -5
scripts/scan_tracks_copyright.py
··· 5 5 # "httpx", 6 6 # "pydantic-settings", 7 7 # "sqlalchemy[asyncio]", 8 - # "psycopg[binary]", 8 + # "asyncpg", 9 9 # "logfire[sqlalchemy]", 10 10 # ] 11 11 # /// ··· 15 15 uv run scripts/scan_tracks_copyright.py --env staging 16 16 uv run scripts/scan_tracks_copyright.py --env prod --dry-run 17 17 uv run scripts/scan_tracks_copyright.py --env staging --limit 10 18 + uv run scripts/scan_tracks_copyright.py --env prod --max-duration 5 18 19 19 20 this will: 20 21 - fetch all tracks that haven't been scanned yet ··· 91 92 os.environ["DATABASE_URL"] = settings.get_database_url(env) 92 93 93 94 95 + async def get_file_size(client: httpx.AsyncClient, url: str) -> int | None: 96 + """get file size from HTTP HEAD request.""" 97 + try: 98 + response = await client.head(url, timeout=10.0) 99 + content_length = response.headers.get("content-length") 100 + if content_length: 101 + return int(content_length) 102 + except Exception: 103 + pass 104 + return None 105 + 106 + 107 + def estimate_duration_minutes(file_size_bytes: int, file_type: str) -> float: 108 + """estimate audio duration from file size. 
109 + 110 + uses high bitrate estimates to avoid OVERestimating duration: 111 + - mp3: ~320 kbps (2.4 MB ≈ 1 minute) 112 + - m4a/aac: ~256 kbps (1.9 MB ≈ 1 minute) 113 + - wav: ~1411 kbps for 16-bit 44.1kHz stereo (10 MB ≈ 1 minute) 114 + - flac: ~1000 kbps high quality (7.5 MB ≈ 1 minute) 115 + """ 116 + mb = file_size_bytes / (1024 * 1024) 117 + 118 + if file_type == "mp3": 119 + return mb / 2.4 # ~2.4 MB per minute at 320kbps 120 + elif file_type in ("m4a", "aac"): 121 + return mb / 1.9 # ~1.9 MB per minute at 256kbps 122 + elif file_type == "wav": 123 + return mb / 10 # ~10 MB per minute for CD quality 124 + elif file_type == "flac": 125 + return mb / 7.5 # ~7.5 MB per minute high quality 126 + else: 127 + return mb / 2.4 # default to mp3-like estimate 128 + 129 + 94 130 async def scan_track( 95 131 client: httpx.AsyncClient, 96 132 settings: ScanSettings, ··· 111 147 env: Environment, 112 148 dry_run: bool = False, 113 149 limit: int | None = None, 150 + max_duration: float | None = None, 114 151 ) -> None: 115 152 """scan all tracks for copyright.""" 116 153 # load settings ··· 165 202 return 166 203 167 204 print(f"\n📋 found {len(tracks)} tracks to scan") 205 + if max_duration: 206 + print(f"⏱️ skipping tracks > {max_duration} minutes") 168 207 169 208 if dry_run: 170 - print("\n[DRY RUN] would scan:") 171 - for track in tracks: 172 - print(f" - {track.id}: {track.title} by @{track.artist.handle}") 209 + print("\n[DRY RUN] checking tracks...") 210 + async with httpx.AsyncClient() as client: 211 + would_scan = [] 212 + would_skip = [] 213 + for track in tracks: 214 + if max_duration and track.r2_url: 215 + file_size = await get_file_size(client, track.r2_url) 216 + if file_size: 217 + est_duration = estimate_duration_minutes( 218 + file_size, track.file_type 219 + ) 220 + if est_duration > max_duration: 221 + would_skip.append((track, file_size, est_duration)) 222 + continue 223 + would_scan.append(track) 224 + 225 + print(f"\nwould scan 
({len(would_scan)}):") 226 + for track in would_scan: 227 + print(f" - {track.id}: {track.title} by @{track.artist.handle}") 228 + 229 + if would_skip: 230 + print(f"\nwould skip ({len(would_skip)}):") 231 + for track, size, duration in would_skip: 232 + print( 233 + f" - {track.id}: {track.title} " 234 + f"({size / (1024 * 1024):.1f} MB, ~{duration:.1f} min)" 235 + ) 173 236 return 174 237 175 238 # scan tracks 176 239 async with httpx.AsyncClient() as client: 177 240 scanned = 0 241 + skipped = 0 178 242 failed = 0 179 243 flagged = 0 180 244 ··· 183 247 print(f" artist: @{track.artist.handle}") 184 248 print(f" url: {track.r2_url}") 185 249 250 + # check duration if max_duration is set 251 + if max_duration and track.r2_url: 252 + file_size = await get_file_size(client, track.r2_url) 253 + if file_size: 254 + est_duration = estimate_duration_minutes( 255 + file_size, track.file_type 256 + ) 257 + print( 258 + f" size: {file_size / (1024 * 1024):.1f} MB, " 259 + f"est. duration: {est_duration:.1f} min" 260 + ) 261 + if est_duration > max_duration: 262 + print(f" ⏭️ skipped (>{max_duration} min)") 263 + skipped += 1 264 + continue 265 + 186 266 try: 187 267 result = await scan_track(client, settings, track.r2_url) 188 268 ··· 224 304 print("✅ scan complete") 225 305 print(f" scanned: {scanned}") 226 306 print(f" flagged: {flagged}") 307 + print(f" skipped: {skipped}") 227 308 print(f" failed: {failed}") 228 309 229 310 ··· 250 331 default=None, 251 332 help="limit number of tracks to scan", 252 333 ) 334 + parser.add_argument( 335 + "--max-duration", 336 + type=float, 337 + default=None, 338 + help="skip tracks longer than this many minutes (estimated from file size)", 339 + ) 253 340 254 341 args = parser.parse_args() 255 342 256 343 print(f"🔍 copyright scan - {args.env}") 257 344 print("=" * 50) 258 345 259 - asyncio.run(run_scan(args.env, args.dry_run, args.limit)) 346 + asyncio.run(run_scan(args.env, args.dry_run, args.limit, args.max_duration)) 260 347 261 
348 262 349 if __name__ == "__main__":