+2
-8
moderation/src/main.rs
+2
-8
moderation/src/main.rs
···
21
21
auth_token: Option<String>,
22
22
audd_api_token: String,
23
23
audd_api_url: String,
24
-
score_threshold: i32,
25
24
}
26
25
27
26
impl Config {
···
37
36
.map_err(|_| anyhow!("MODERATION_AUDD_API_TOKEN is required"))?,
38
37
audd_api_url: env::var("MODERATION_AUDD_API_URL")
39
38
.unwrap_or_else(|_| "https://enterprise.audd.io/".to_string()),
40
-
score_threshold: env::var("MODERATION_SCORE_THRESHOLD")
41
-
.ok()
42
-
.and_then(|v| v.parse().ok())
43
-
.unwrap_or(70),
44
39
})
45
40
}
46
41
}
···
136
131
.with_state(AppState {
137
132
audd_api_token: config.audd_api_token,
138
133
audd_api_url: config.audd_api_url,
139
-
score_threshold: config.score_threshold,
140
134
});
141
135
142
136
let addr: SocketAddr = format!("{}:{}", config.host, config.port)
···
155
149
struct AppState {
156
150
audd_api_token: String,
157
151
audd_api_url: String,
158
-
score_threshold: i32,
159
152
}
160
153
161
154
// --- middleware ---
···
233
226
234
227
let matches = extract_matches(&audd_response);
235
228
let highest_score = matches.iter().map(|m| m.score).max().unwrap_or(0);
236
-
let is_flagged = highest_score >= state.score_threshold;
229
+
// flag if any matches are found - audd enterprise doesn't return confidence scores
230
+
let is_flagged = !matches.is_empty();
237
231
238
232
info!(
239
233
match_count = matches.len(),
+92
-5
scripts/scan_tracks_copyright.py
+92
-5
scripts/scan_tracks_copyright.py
···
5
5
# "httpx",
6
6
# "pydantic-settings",
7
7
# "sqlalchemy[asyncio]",
8
-
# "psycopg[binary]",
8
+
# "asyncpg",
9
9
# "logfire[sqlalchemy]",
10
10
# ]
11
11
# ///
···
15
15
uv run scripts/scan_tracks_copyright.py --env staging
16
16
uv run scripts/scan_tracks_copyright.py --env prod --dry-run
17
17
uv run scripts/scan_tracks_copyright.py --env staging --limit 10
18
+
uv run scripts/scan_tracks_copyright.py --env prod --max-duration 5
18
19
19
20
this will:
20
21
- fetch all tracks that haven't been scanned yet
···
91
92
os.environ["DATABASE_URL"] = settings.get_database_url(env)
92
93
93
94
95
+
async def get_file_size(client: httpx.AsyncClient, url: str) -> int | None:
    """get file size in bytes from an HTTP HEAD request.

    best-effort: returns None whenever the size cannot be determined.

    args:
        client: httpx client used to issue the HEAD request
        url: location of the file to measure

    returns:
        the content-length reported by the server, or None if the request
        fails, the response is not a 2xx, or the header is missing or is
        not an integer.
    """
    try:
        response = await client.head(url, timeout=10.0)
    except (httpx.HTTPError, httpx.InvalidURL):
        # network failure, timeout, or malformed url - size is unknown
        return None

    # on an error response (404, 403, ...) content-length describes the
    # error body, not the file, so it must not be reported as the file size
    if not 200 <= response.status_code < 300:
        return None

    content_length = response.headers.get("content-length")
    if not content_length:
        return None

    try:
        return int(content_length)
    except ValueError:
        # malformed header value - treat as unknown rather than crash
        return None
105
+
106
+
107
+
def estimate_duration_minutes(file_size_bytes: int, file_type: str) -> float:
    """estimate audio duration in minutes from file size.

    uses high bitrate estimates to avoid OVERestimating duration:
    - mp3: ~320 kbps (2.4 MB ≈ 1 minute)
    - m4a/aac: ~256 kbps (1.9 MB ≈ 1 minute)
    - wav: ~1411 kbps for 16-bit 44.1kHz stereo (10 MB ≈ 1 minute)
    - flac: ~1000 kbps high quality (7.5 MB ≈ 1 minute)

    file_type is matched case-insensitively and a leading "." is ignored,
    so "WAV" and ".flac" use their own rates instead of the default.
    unknown or missing types fall back to the mp3-like estimate.

    args:
        file_size_bytes: size of the audio file in bytes
        file_type: file extension / format name, e.g. "mp3"

    returns:
        estimated duration in minutes
    """
    # approximate megabytes consumed per minute of audio, by format
    mb_per_minute = {
        "mp3": 2.4,  # ~320kbps
        "m4a": 1.9,  # ~256kbps
        "aac": 1.9,  # ~256kbps
        "wav": 10,  # CD quality
        "flac": 7.5,  # high quality
    }
    default_mb_per_minute = 2.4  # mp3-like estimate

    mb = file_size_bytes / (1024 * 1024)

    # normalize so "MP3", " wav " and ".flac" hit the right table entry
    normalized_type = (file_type or "").strip().lower().lstrip(".")

    return mb / mb_per_minute.get(normalized_type, default_mb_per_minute)
128
+
129
+
94
130
async def scan_track(
95
131
client: httpx.AsyncClient,
96
132
settings: ScanSettings,
···
111
147
env: Environment,
112
148
dry_run: bool = False,
113
149
limit: int | None = None,
150
+
max_duration: float | None = None,
114
151
) -> None:
115
152
"""scan all tracks for copyright."""
116
153
# load settings
···
165
202
return
166
203
167
204
print(f"\n📋 found {len(tracks)} tracks to scan")
205
+
if max_duration:
206
+
print(f"⏱️ skipping tracks > {max_duration} minutes")
168
207
169
208
if dry_run:
170
-
print("\n[DRY RUN] would scan:")
171
-
for track in tracks:
172
-
print(f" - {track.id}: {track.title} by @{track.artist.handle}")
209
+
print("\n[DRY RUN] checking tracks...")
210
+
async with httpx.AsyncClient() as client:
211
+
would_scan = []
212
+
would_skip = []
213
+
for track in tracks:
214
+
if max_duration and track.r2_url:
215
+
file_size = await get_file_size(client, track.r2_url)
216
+
if file_size:
217
+
est_duration = estimate_duration_minutes(
218
+
file_size, track.file_type
219
+
)
220
+
if est_duration > max_duration:
221
+
would_skip.append((track, file_size, est_duration))
222
+
continue
223
+
would_scan.append(track)
224
+
225
+
print(f"\nwould scan ({len(would_scan)}):")
226
+
for track in would_scan:
227
+
print(f" - {track.id}: {track.title} by @{track.artist.handle}")
228
+
229
+
if would_skip:
230
+
print(f"\nwould skip ({len(would_skip)}):")
231
+
for track, size, duration in would_skip:
232
+
print(
233
+
f" - {track.id}: {track.title} "
234
+
f"({size / (1024 * 1024):.1f} MB, ~{duration:.1f} min)"
235
+
)
173
236
return
174
237
175
238
# scan tracks
176
239
async with httpx.AsyncClient() as client:
177
240
scanned = 0
241
+
skipped = 0
178
242
failed = 0
179
243
flagged = 0
180
244
···
183
247
print(f" artist: @{track.artist.handle}")
184
248
print(f" url: {track.r2_url}")
185
249
250
+
# check duration if max_duration is set
251
+
if max_duration and track.r2_url:
252
+
file_size = await get_file_size(client, track.r2_url)
253
+
if file_size:
254
+
est_duration = estimate_duration_minutes(
255
+
file_size, track.file_type
256
+
)
257
+
print(
258
+
f" size: {file_size / (1024 * 1024):.1f} MB, "
259
+
f"est. duration: {est_duration:.1f} min"
260
+
)
261
+
if est_duration > max_duration:
262
+
print(f" ⏭️ skipped (>{max_duration} min)")
263
+
skipped += 1
264
+
continue
265
+
186
266
try:
187
267
result = await scan_track(client, settings, track.r2_url)
188
268
···
224
304
print("✅ scan complete")
225
305
print(f" scanned: {scanned}")
226
306
print(f" flagged: {flagged}")
307
+
print(f" skipped: {skipped}")
227
308
print(f" failed: {failed}")
228
309
229
310
···
250
331
default=None,
251
332
help="limit number of tracks to scan",
252
333
)
334
+
parser.add_argument(
335
+
"--max-duration",
336
+
type=float,
337
+
default=None,
338
+
help="skip tracks longer than this many minutes (estimated from file size)",
339
+
)
253
340
254
341
args = parser.parse_args()
255
342
256
343
print(f"🔍 copyright scan - {args.env}")
257
344
print("=" * 50)
258
345
259
-
asyncio.run(run_scan(args.env, args.dry_run, args.limit))
346
+
asyncio.run(run_scan(args.env, args.dry_run, args.limit, args.max_duration))
260
347
261
348
262
349
if __name__ == "__main__":