#!/usr/bin/env -S uv run --script --quiet
"""backfill image_url for tracks with image_id but missing image_url.

## Context

PR #184 added image_url column to tracks table to eliminate N+1 R2 API calls.
New uploads automatically populate image_url at creation time, but 15 legacy
tracks uploaded before the PR still have image_url = NULL.

This causes slow /tracks/liked endpoint performance because we fall back to
calling track.get_image_url() which hits R2 API on every request.

## What This Script Does

1. Finds all tracks with image_id but NULL image_url
2. Computes image_url by calling storage.get_url(image_id)
3. Updates database with computed URLs
4. Runs concurrently for performance

## Usage

```bash
# dry run (show what would be updated)
uv run scripts/backfill_image_urls.py --dry-run

# actually update the database
uv run scripts/backfill_image_urls.py

# target specific environment
DATABASE_URL=postgresql://... uv run scripts/backfill_image_urls.py
```
"""

import asyncio
import logging
import sys
from pathlib import Path

# add src to path so we can import backend modules
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from sqlalchemy import select

from backend.config import settings
from backend.models import Track
from backend.storage import storage
from backend.storage.r2 import R2Storage
from backend.utilities.database import db_session

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


async def backfill_image_urls(dry_run: bool = False) -> None:
    """backfill image_url for tracks with image_id but missing image_url.

    Args:
        dry_run: when True, only log the tracks that would be updated;
            no URLs are computed and no rows are written.
    """
    # get_url() semantics below assume the R2 backend; refuse to run
    # against anything else rather than persist bogus URLs.
    if not isinstance(storage, R2Storage):
        logger.error("storage backend is not R2, cannot compute image URLs")
        return

    logger.info(f"storage backend: {settings.storage.backend}")

    async with db_session() as db:
        # find tracks with image_id but no image_url
        stmt = select(Track).where(
            Track.image_id.isnot(None), Track.image_url.is_(None)
        )
        result = await db.execute(stmt)
        tracks = result.scalars().all()

        if not tracks:
            logger.info("no tracks need backfilling")
            return

        logger.info(f"found {len(tracks)} tracks needing image_url backfill")

        if dry_run:
            logger.info("dry run mode - showing tracks that would be updated:")
            for track in tracks:
                logger.info(
                    f"  track {track.id}: {track.title} (image_id: {track.image_id})"
                )
            return

        # compute image URLs concurrently
        logger.info("computing image URLs from R2...")

        async def compute_url(
            track: Track,
        ) -> tuple[int, str | None, Exception | None]:
            """compute image_url for a track and return (track_id, url, error).

            Errors are captured per-track so one R2 failure does not abort
            the whole gather().
            """
            try:
                url = await storage.get_url(track.image_id, file_type="image")
                return (track.id, url, None)
            except Exception as e:
                return (track.id, None, e)

        results = await asyncio.gather(*[compute_url(t) for t in tracks])

        # index tracks by id once — avoids an O(n^2) linear scan per result
        tracks_by_id = {t.id: t for t in tracks}

        # update database with computed URLs
        updated = 0
        failed = 0

        for track_id, image_url, error in results:
            track = tracks_by_id[track_id]
            if image_url:
                track.image_url = image_url
                updated += 1
                logger.info(f"updated track {track_id}: {track.title}")
            else:
                failed += 1
                logger.error(
                    f"failed to compute URL for track {track_id} ({track.title}): {error}"
                )

        await db.commit()

        logger.info(f"backfill complete: {updated} updated, {failed} failed")


async def main() -> None:
    """main entry point."""
    dry_run = "--dry-run" in sys.argv

    if dry_run:
        logger.info("running in DRY RUN mode - no changes will be made")

    await backfill_image_urls(dry_run=dry_run)


if __name__ == "__main__":
    asyncio.run(main())