music on atproto
plyr.fm
1#!/usr/bin/env -S uv run --script --quiet
2# /// script
3# requires-python = ">=3.12"
4# dependencies = [
5# "httpx",
6# "pydantic-settings",
7# "sqlalchemy[asyncio]",
8# "asyncpg",
9# "logfire[sqlalchemy]",
10# ]
11# ///
12"""scan all tracks for copyright using the moderation service.
13
14usage:
15 uv run scripts/scan_tracks_copyright.py --env staging
16 uv run scripts/scan_tracks_copyright.py --env prod --dry-run
17 uv run scripts/scan_tracks_copyright.py --env staging --limit 10
18 uv run scripts/scan_tracks_copyright.py --env prod --max-duration 5
19
20this will:
21- fetch all tracks that haven't been scanned yet
22- call the moderation service for each track
23- store results in copyright_scans table
24
25environment variables (set in .env or export):
26 # database URLs per environment
27 DEV_DATABASE_URL - dev database connection string
28 STAGING_DATABASE_URL - staging database connection string
29 PROD_DATABASE_URL - production database connection string
30
31 # moderation service
32 MODERATION_SERVICE_URL - URL of moderation service (default: https://plyr-moderation.fly.dev)
33 MODERATION_AUTH_TOKEN - auth token for moderation service
34"""
35
36import asyncio
37import os
38import sys
39from datetime import UTC, datetime
40from pathlib import Path
41from typing import Literal
42
43import httpx
44from pydantic import Field
45from pydantic_settings import BaseSettings, SettingsConfigDict
46
# add backend/src (under the parent of this script's directory) to the import
# path so the `backend` package imported later inside run_scan() can be found
sys.path.insert(0, str(Path(__file__).parent.parent / "backend" / "src"))


# names of the deployment environments this script can target
Environment = Literal["dev", "staging", "prod"]
52
53
class ScanSettings(BaseSettings):
    """configuration for the copyright scan script.

    every field has a default, so the object can always be constructed;
    missing values are detected later by the code that needs them.
    """

    model_config = SettingsConfigDict(
        extra="ignore",
        case_sensitive=False,
        env_file=".env",
    )

    # one connection string per environment; empty means "not configured"
    dev_database_url: str = Field(validation_alias="DEV_DATABASE_URL", default="")
    staging_database_url: str = Field(
        validation_alias="STAGING_DATABASE_URL",
        default="",
    )
    prod_database_url: str = Field(validation_alias="PROD_DATABASE_URL", default="")

    # moderation service endpoint and the token sent with each request
    moderation_service_url: str = Field(
        validation_alias="MODERATION_SERVICE_URL",
        default="https://plyr-moderation.fly.dev",
    )
    moderation_auth_token: str = Field(
        validation_alias="MODERATION_AUTH_TOKEN",
        default="",
    )

    def get_database_url(self, env: Environment) -> str:
        """return the connection string configured for ``env``.

        raises:
            ValueError: when ``env`` is unknown or its URL is empty.
        """
        configured_by_env = {
            "dev": self.dev_database_url,
            "staging": self.staging_database_url,
            "prod": self.prod_database_url,
        }
        if configured := configured_by_env.get(env, ""):
            return configured
        raise ValueError(f"no database URL configured for {env}")
88
89
def setup_env(settings: ScanSettings, env: Environment) -> None:
    """export DATABASE_URL for ``env`` ahead of the backend imports.

    raises:
        ValueError: propagated from ``settings.get_database_url`` when no URL
            is configured for ``env``.
    """
    selected_url = settings.get_database_url(env)
    os.environ.update(DATABASE_URL=selected_url)
93
94
async def get_file_size(client: httpx.AsyncClient, url: str) -> int | None:
    """get file size in bytes from an HTTP HEAD request.

    best-effort probe: a failed request, a non-2xx status, or a missing or
    malformed content-length header all yield ``None`` instead of raising.

    args:
        client: http client used to issue the HEAD request.
        url: location of the file to measure.

    returns:
        the content-length reported by a successful response, or ``None``
        when the size could not be determined.
    """
    try:
        response = await client.head(url, timeout=10.0)
        # an error response carries the length of its own body, which says
        # nothing about the audio file - only trust successful responses
        if not 200 <= response.status_code < 300:
            return None
        content_length = response.headers.get("content-length")
        if content_length:
            size = int(content_length)
            # a negative length is invalid; report it as unknown
            return size if size >= 0 else None
    except Exception:
        # deliberately broad: the size is only a hint used to skip long
        # tracks, and callers invoke this outside their own error handling,
        # so a failed probe must never abort the run
        return None
    return None
105
106
# approximate megabytes of audio per minute, keyed by lower-case file type.
# the bitrates are deliberately on the high side (see the function docstring).
_MB_PER_MINUTE = {
    "mp3": 2.4,  # ~320 kbps
    "m4a": 1.9,  # ~256 kbps
    "aac": 1.9,  # ~256 kbps
    "wav": 10,  # ~1411 kbps, 16-bit 44.1kHz stereo
    "flac": 7.5,  # ~1000 kbps high quality
}

# unknown types are treated like mp3
_DEFAULT_MB_PER_MINUTE = 2.4


def estimate_duration_minutes(file_size_bytes: int, file_type: str) -> float:
    """estimate audio duration from file size.

    uses high bitrate estimates to avoid OVERestimating duration: a higher
    assumed bitrate means more megabytes per minute and therefore a shorter
    estimate for the same file size.

    the file type is matched case-insensitively and a leading dot is
    ignored, so "MP3", ".mp3" and "mp3" are equivalent. unknown or missing
    types fall back to the mp3 ratio.

    args:
        file_size_bytes: size of the audio file in bytes.
        file_type: file extension such as "mp3", "wav" or "flac".

    returns:
        estimated duration in minutes.
    """
    mb = file_size_bytes / (1024 * 1024)
    normalized = (file_type or "").strip().lower().lstrip(".")
    return mb / _MB_PER_MINUTE.get(normalized, _DEFAULT_MB_PER_MINUTE)
128
129
async def scan_track(
    client: httpx.AsyncClient,
    settings: ScanSettings,
    audio_url: str,
) -> dict:
    """call moderation service to scan a track.

    args:
        client: http client used to reach the moderation service.
        settings: supplies the service base URL and the auth token.
        audio_url: URL of the audio file the service should scan.

    returns:
        the decoded JSON body of the service response.

    raises:
        whatever ``response.raise_for_status()`` raises for an error status
        (run_scan handles ``httpx.HTTPStatusError``), plus any error raised
        by the request itself.
    """
    # tolerate a configured base URL that ends with "/" so the request never
    # goes to ".../​/scan"
    endpoint = f"{settings.moderation_service_url.rstrip('/')}/scan"
    response = await client.post(
        endpoint,
        json={"audio_url": audio_url},
        headers={"X-Moderation-Key": settings.moderation_auth_token},
        timeout=120.0,  # scans can take a while
    )
    response.raise_for_status()
    return response.json()
144
145
def _database_host(db_url: str) -> str:
    """return the host part of a database URL for display, never credentials."""
    if "@" not in db_url:
        return "configured"
    # credentials may themselves contain "@"; only the text after the LAST
    # one is the host, so a password fragment can never be printed
    return db_url.rpartition("@")[2].split("/")[0]


async def _estimate_track_duration(
    client: httpx.AsyncClient,
    url: str,
    file_type: str,
) -> tuple[int, float] | None:
    """return (size in bytes, estimated minutes), or None if the size is unknown."""
    file_size = await get_file_size(client, url)
    if not file_size:
        return None
    return file_size, estimate_duration_minutes(file_size, file_type)


async def run_scan(
    env: Environment,
    dry_run: bool = False,
    limit: int | None = None,
    max_duration: float | None = None,
) -> None:
    """scan all tracks for copyright.

    selects tracks that have an r2_url and no copyright scan row (newest
    first), sends each one to the moderation service and stores the result
    as a CopyrightScan row. progress and a final summary are printed.

    args:
        env: environment whose database is scanned.
        dry_run: only list what would be scanned or skipped; the moderation
            service is not called and nothing is written.
        limit: maximum number of tracks to select; a falsy value means no
            limit.
        max_duration: skip tracks whose estimated duration (from file size)
            exceeds this many minutes; a falsy value disables the check.

    exits the process with status 1 when the database URL or the moderation
    auth token is not configured.
    """
    settings = ScanSettings()

    # validate settings before touching the database
    try:
        db_url = settings.get_database_url(env)
    except ValueError as e:
        print(f"❌ {e}")
        print(f"\nset {env.upper()}_DATABASE_URL in .env")
        sys.exit(1)
    print(f"✓ database: {_database_host(db_url)}")

    if not settings.moderation_auth_token:
        print("❌ MODERATION_AUTH_TOKEN not set")
        sys.exit(1)

    print(f"✓ moderation service: {settings.moderation_service_url}")

    # DATABASE_URL must be exported before the backend modules are imported
    setup_env(settings, env)

    # import backend after env setup
    from sqlalchemy import select
    from sqlalchemy.orm import joinedload

    from backend.models import CopyrightScan, Track
    from backend.utilities.database import db_session

    async with db_session() as db:
        # find tracks without scans
        scanned_subq = select(CopyrightScan.track_id)
        stmt = (
            select(Track)
            .options(joinedload(Track.artist))
            .where(Track.id.notin_(scanned_subq))
            .where(Track.r2_url.isnot(None))
            .order_by(Track.created_at.desc())
        )

        if limit:
            stmt = stmt.limit(limit)

        query_result = await db.execute(stmt)
        tracks = query_result.scalars().unique().all()

        if not tracks:
            print("\n✅ all tracks have been scanned")
            return

        print(f"\n📋 found {len(tracks)} tracks to scan")
        if max_duration:
            print(f"⏱️ skipping tracks > {max_duration} minutes")

        if dry_run:
            print("\n[DRY RUN] checking tracks...")
            async with httpx.AsyncClient() as client:
                would_scan = []
                would_skip = []
                for track in tracks:
                    estimate = None
                    if max_duration and track.r2_url:
                        estimate = await _estimate_track_duration(
                            client, track.r2_url, track.file_type
                        )
                    # tracks of unknown size are never skipped
                    if estimate and estimate[1] > max_duration:
                        would_skip.append((track, *estimate))
                    else:
                        would_scan.append(track)

            print(f"\nwould scan ({len(would_scan)}):")
            for track in would_scan:
                print(f" - {track.id}: {track.title} by @{track.artist.handle}")

            if would_skip:
                print(f"\nwould skip ({len(would_skip)}):")
                for track, size, duration in would_skip:
                    print(
                        f" - {track.id}: {track.title} "
                        f"({size / (1024 * 1024):.1f} MB, ~{duration:.1f} min)"
                    )
            return

        # scan tracks
        async with httpx.AsyncClient() as client:
            scanned = 0
            skipped = 0
            failed = 0
            flagged = 0

            for i, track in enumerate(tracks, 1):
                print(f"\n[{i}/{len(tracks)}] scanning: {track.title}")
                print(f" artist: @{track.artist.handle}")
                print(f" url: {track.r2_url}")

                # check duration if max_duration is set
                if max_duration and track.r2_url:
                    estimate = await _estimate_track_duration(
                        client, track.r2_url, track.file_type
                    )
                    if estimate:
                        file_size, est_duration = estimate
                        print(
                            f" size: {file_size / (1024 * 1024):.1f} MB, "
                            f"est. duration: {est_duration:.1f} min"
                        )
                        if est_duration > max_duration:
                            print(f" ⏭️ skipped (>{max_duration} min)")
                            skipped += 1
                            continue

                try:
                    scan_result = await scan_track(client, settings, track.r2_url)

                    # create scan record; committing per track keeps finished
                    # scans even if a later track fails
                    scan = CopyrightScan(
                        track_id=track.id,
                        scanned_at=datetime.now(UTC),
                        is_flagged=scan_result["is_flagged"],
                        highest_score=scan_result["highest_score"],
                        matches=scan_result["matches"],
                        raw_response=scan_result["raw_response"],
                    )
                    db.add(scan)
                    await db.commit()

                    scanned += 1
                    if scan_result["is_flagged"]:
                        flagged += 1
                        print(
                            f" ⚠️ FLAGGED (score: {scan_result['highest_score']})"
                        )
                        # show at most the first three matches
                        for match in scan_result["matches"][:3]:
                            print(
                                f" - {match['artist']} - {match['title']} ({match['score']})"
                            )
                    else:
                        print(f" ✓ clear (score: {scan_result['highest_score']})")

                except httpx.HTTPStatusError as e:
                    failed += 1
                    print(f" ❌ HTTP error: {e.response.status_code}")
                    try:
                        print(f" {e.response.json()}")
                    except Exception:
                        # body was not JSON; show the start of the raw text
                        print(f" {e.response.text[:200]}")
                except Exception as e:
                    # NOTE(review): if the failure came from db.commit(), the
                    # session is not rolled back here - confirm db_session()
                    # copes with that, otherwise later commits on this
                    # session may fail as well
                    failed += 1
                    print(f" ❌ error: {e}")

        print(f"\n{'=' * 50}")
        print("✅ scan complete")
        print(f" scanned: {scanned}")
        print(f" flagged: {flagged}")
        print(f" skipped: {skipped}")
        print(f" failed: {failed}")
309
310
def main() -> None:
    """parse the command line, print a banner and run the scan."""
    import argparse

    cli = argparse.ArgumentParser(description="scan tracks for copyright")
    cli.add_argument(
        "--env",
        choices=["dev", "staging", "prod"],
        required=True,
        type=str,
        help="environment to scan",
    )
    cli.add_argument(
        "--dry-run",
        help="show what would be scanned without making changes",
        action="store_true",
    )
    cli.add_argument(
        "--limit",
        help="limit number of tracks to scan",
        default=None,
        type=int,
    )
    cli.add_argument(
        "--max-duration",
        help="skip tracks longer than this many minutes (estimated from file size)",
        default=None,
        type=float,
    )

    options = cli.parse_args()

    banner = f"🔍 copyright scan - {options.env}"
    print(banner)
    print("=" * 50)

    scan = run_scan(
        options.env,
        options.dry_run,
        options.limit,
        options.max_duration,
    )
    asyncio.run(scan)
347
348
# run the CLI only when this file is executed as a script, not when imported
if __name__ == "__main__":
    main()