[WIP] music platform user data scraper
teal-fm atproto
at main 10 kB view raw
package musicbrainz

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"os"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/teal-fm/piper/db"
	"github.com/teal-fm/piper/models"
	"golang.org/x/time/rate"
)

// MusicBrainz API Types

// MusicBrainzArtistCredit is one element of a recording's "artist-credit"
// JSON array. The nested Artist object carries the artist's own id, name and
// sort-name; the outer Name and Joinphrase are decoded from the credit entry
// itself.
// NOTE(review): presumably the outer Name is the name as credited on the
// recording and Joinphrase the text joining it to the next credit — confirm
// against the MusicBrainz API documentation.
type MusicBrainzArtistCredit struct {
	Artist struct {
		ID       string `json:"id"`
		Name     string `json:"name"`
		SortName string `json:"sort-name,omitempty"`
	} `json:"artist"`
	Joinphrase string `json:"joinphrase,omitempty"`
	Name       string `json:"name"`
}

// MusicBrainzRelease is one element of a recording's "releases" JSON array.
// Every field except ID and Title is optional in the JSON and is left at its
// zero value when absent.
type MusicBrainzRelease struct {
	ID             string `json:"id"`
	Title          string `json:"title"`
	Status         string `json:"status,omitempty"`
	Date           string `json:"date,omitempty"` // YYYY-MM-DD, YYYY-MM, or YYYY
	Country        string `json:"country,omitempty"`
	Disambiguation string `json:"disambiguation,omitempty"`
	TrackCount     int    `json:"track-count,omitempty"`
}

// MusicBrainzRecording is one element of the "recordings" array in a search
// response, together with its artist credits, releases and ISRCs when the
// response includes them.
type MusicBrainzRecording struct {
	ID           string                    `json:"id"`
	Title        string                    `json:"title"`
	Length       int                       `json:"length,omitempty"` // milliseconds
	ISRCs        []string                  `json:"isrcs,omitempty"`
	ArtistCredit []MusicBrainzArtistCredit `json:"artist-credit,omitempty"`
	Releases     []MusicBrainzRelease      `json:"releases,omitempty"`
}

// MusicBrainzSearchResponse is the top-level JSON document decoded from the
// recording search endpoint.
type MusicBrainzSearchResponse struct {
	Created    time.Time              `json:"created"`
	Count      int                    `json:"count"`
	Offset     int                    `json:"offset"`
	Recordings []MusicBrainzRecording `json:"recordings"`
}

// SearchParams holds the terms of a recording search. Any field may be left
// empty, but SearchMusicBrainz rejects a value in which all three are empty.
type SearchParams struct {
	Track   string
	Artist  string
	Release string
}

// cacheEntry holds the cached data and its expiration time.
66type cacheEntry struct { 67 recordings []MusicBrainzRecording 68 expiresAt time.Time 69} 70 71type MusicBrainzService struct { 72 db *db.DB 73 httpClient *http.Client 74 limiter *rate.Limiter 75 searchCache map[string]cacheEntry // In-memory cache for search results 76 cacheMutex sync.RWMutex // Mutex to protect the cache 77 cacheTTL time.Duration // Time-to-live for cache entries 78 cleaner MetadataCleaner // Cleaner for cleaning up expired cache entries 79 logger *log.Logger // Logger for logging 80} 81 82// NewMusicBrainzService creates a new service instance with rate limiting and caching. 83func NewMusicBrainzService(db *db.DB) *MusicBrainzService { 84 // MusicBrainz allows 1 request per second 85 limiter := rate.NewLimiter(rate.Every(time.Second), 1) 86 // Set a default cache TTL (e.g., 1 hour) 87 defaultCacheTTL := 1 * time.Hour 88 logger := log.New(os.Stdout, "musicbrainz: ", log.LstdFlags|log.Lmsgprefix) 89 return &MusicBrainzService{ 90 db: db, 91 httpClient: &http.Client{ 92 Timeout: 10 * time.Second, 93 }, 94 limiter: limiter, 95 searchCache: make(map[string]cacheEntry), // Initialize the cache map 96 cacheTTL: defaultCacheTTL, // Set the cache TTL 97 cleaner: *NewMetadataCleaner("Latin"), // Initialize the cleaner 98 // cacheMutex is zero-value ready 99 logger: logger, 100 } 101} 102 103// generateCacheKey creates a unique string key for caching based on search parameters. 104func generateCacheKey(params SearchParams) string { 105 // Use a structured format to avoid collisions and ensure order doesn't matter implicitly 106 // url.QueryEscape handles potential special characters in parameters 107 return fmt.Sprintf("track=%s&artist=%s&release=%s", 108 url.QueryEscape(params.Track), 109 url.QueryEscape(params.Artist), 110 url.QueryEscape(params.Release)) 111} 112 113// SearchMusicBrainz searches the MusicBrainz API for recordings, using an in-memory cache. 
114func (s *MusicBrainzService) SearchMusicBrainz(ctx context.Context, params SearchParams) ([]MusicBrainzRecording, error) { 115 // Validate parameters first 116 if params.Track == "" && params.Artist == "" && params.Release == "" { 117 return nil, fmt.Errorf("at least one search parameter (Track, Artist, Release) must be provided") 118 } 119 120 // clean params 121 params.Track, _ = s.cleaner.CleanRecording(params.Track) 122 params.Artist, _ = s.cleaner.CleanArtist(params.Artist) 123 124 cacheKey := generateCacheKey(params) 125 now := time.Now().UTC() 126 127 // --- Check Cache (Read Lock) --- 128 s.cacheMutex.RLock() 129 entry, found := s.searchCache[cacheKey] 130 s.cacheMutex.RUnlock() 131 132 if found && now.Before(entry.expiresAt) { 133 s.logger.Printf("Cache hit for MusicBrainz search: key=%s", cacheKey) 134 // Return the cached data directly. Consider if a deep copy is needed if callers modify results. 135 return entry.recordings, nil 136 } 137 // --- Cache Miss or Expired --- 138 if found { 139 s.logger.Printf("Cache expired for MusicBrainz search: key=%s", cacheKey) 140 } else { 141 s.logger.Printf("Cache miss for MusicBrainz search: key=%s", cacheKey) 142 } 143 144 // --- Proceed with API call --- 145 queryParts := []string{} 146 if params.Track != "" { 147 queryParts = append(queryParts, fmt.Sprintf(`recording:"%s"`, params.Track)) 148 } 149 if params.Artist != "" { 150 queryParts = append(queryParts, fmt.Sprintf(`artist:"%s"`, params.Artist)) 151 } 152 if params.Release != "" { 153 queryParts = append(queryParts, fmt.Sprintf(`release:"%s"`, params.Release)) 154 } 155 query := strings.Join(queryParts, " AND ") 156 endpoint := fmt.Sprintf("https://musicbrainz.org/ws/2/recording?query=%s&fmt=json&inc=artists+releases+isrcs", url.QueryEscape(query)) 157 158 if err := s.limiter.Wait(ctx); err != nil { 159 if ctx.Err() != nil { 160 return nil, fmt.Errorf("context cancelled during rate limiter wait: %w", ctx.Err()) 161 } 162 return nil, fmt.Errorf("rate 
limiter error: %w", err) 163 } 164 165 req, err := http.NewRequestWithContext(ctx, "GET", endpoint, nil) 166 if err != nil { 167 return nil, fmt.Errorf("failed to create request: %w", err) 168 } 169 req.Header.Set("User-Agent", "piper/0.0.1 ( https://github.com/teal-fm/piper )") 170 171 resp, err := s.httpClient.Do(req) 172 if err != nil { 173 if ctx.Err() != nil { 174 return nil, fmt.Errorf("context error during request execution: %w", ctx.Err()) 175 } 176 return nil, fmt.Errorf("failed to execute request to %s: %w", endpoint, err) 177 } 178 defer resp.Body.Close() 179 180 if resp.StatusCode != http.StatusOK { 181 // TODO: read body for detailed error message 182 return nil, fmt.Errorf("MusicBrainz API request to %s returned status %d", endpoint, resp.StatusCode) 183 } 184 185 var result MusicBrainzSearchResponse 186 if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { 187 return nil, fmt.Errorf("failed to decode response from %s: %w", endpoint, err) 188 } 189 190 // cache result for later 191 s.cacheMutex.Lock() 192 s.searchCache[cacheKey] = cacheEntry{ 193 recordings: result.Recordings, 194 expiresAt: time.Now().UTC().Add(s.cacheTTL), 195 } 196 s.cacheMutex.Unlock() 197 s.logger.Printf("Cached MusicBrainz search result for key=%s, TTL=%s", cacheKey, s.cacheTTL) 198 199 // Return the newly fetched results 200 return result.Recordings, nil 201} 202 203// GetBestRelease selects the 'best' release from a list based on specific criteria. 204func (s *MusicBrainzService) GetBestRelease(releases []MusicBrainzRelease, trackTitle string) *MusicBrainzRelease { 205 if len(releases) == 0 { 206 return nil 207 } 208 if len(releases) == 1 { 209 // Return a pointer to the single element 210 r := releases[0] 211 return &r 212 } 213 214 // Sort releases: Prefer valid dates first, then sort by date, title, id. 
215 sort.SliceStable(releases, func(i, j int) bool { 216 dateA := releases[i].Date 217 dateB := releases[j].Date 218 validDateA := len(dateA) >= 4 // Basic check for YYYY format or longer 219 validDateB := len(dateB) >= 4 220 221 // Put invalid/empty dates at the end 222 if validDateA && !validDateB { 223 return true 224 } 225 if !validDateA && validDateB { 226 return false 227 } 228 // If both valid or both invalid, compare dates lexicographically 229 if dateA != dateB { 230 return dateA < dateB 231 } 232 // If dates are same, compare by title 233 if releases[i].Title != releases[j].Title { 234 return releases[i].Title < releases[j].Title 235 } 236 // If titles are same, compare by ID 237 return releases[i].ID < releases[j].ID 238 }) 239 240 // 1. Find oldest release where country is 'XW' or 'US' AND title is NOT track title 241 for i := range releases { 242 release := &releases[i] 243 if (release.Country == "XW" || release.Country == "US") && release.Title != trackTitle { 244 return release 245 } 246 } 247 248 // 2. If none, find oldest release where title is NOT track title 249 for i := range releases { 250 release := &releases[i] 251 if release.Title != trackTitle { 252 return release 253 } 254 } 255 256 // 3. 
If none found, return the oldest release overall (which is the first one after sorting) 257 s.logger.Printf("Could not find a suitable release for '%s', picking oldest: '%s' (%s)", trackTitle, releases[0].Title, releases[0].ID) 258 r := releases[0] 259 return &r 260} 261 262func HydrateTrack(mb *MusicBrainzService, track models.Track) (*models.Track, error) { 263 ctx := context.Background() 264 // array of strings 265 artistArray := make([]string, len(track.Artist)) // Assuming Name is string type 266 for i, a := range track.Artist { 267 artistArray[i] = a.Name 268 } 269 270 params := SearchParams{ 271 Track: track.Name, 272 Artist: strings.Join(artistArray, ", "), 273 Release: track.Album, 274 } 275 res, err := mb.SearchMusicBrainz(ctx, params) 276 if err != nil { 277 return nil, err 278 } 279 280 if len(res) == 0 { 281 return nil, errors.New("no results found") 282 } 283 284 firstResult := res[0] 285 firstResultAlbum := mb.GetBestRelease(firstResult.Releases, firstResult.Title) 286 287 // woof. we Might not have any ISRCs! 288 var bestISRC string 289 if len(firstResult.ISRCs) >= 1 { 290 bestISRC = firstResult.ISRCs[0] 291 } 292 293 artists := make([]models.Artist, len(firstResult.ArtistCredit)) 294 295 for i, a := range firstResult.ArtistCredit { 296 artists[i] = models.Artist{ 297 Name: a.Name, 298 ID: a.Artist.ID, 299 MBID: &a.Artist.ID, 300 } 301 } 302 303 resTrack := models.Track{ 304 HasStamped: track.HasStamped, 305 PlayID: track.PlayID, 306 Name: track.Name, 307 URL: track.URL, 308 ServiceBaseUrl: track.ServiceBaseUrl, 309 RecordingMBID: &firstResult.ID, 310 Album: firstResultAlbum.Title, 311 ReleaseMBID: &firstResultAlbum.ID, 312 ISRC: bestISRC, 313 Timestamp: track.Timestamp, 314 ProgressMs: track.ProgressMs, 315 DurationMs: int64(firstResult.Length), 316 Artist: artists, 317 } 318 319 return &resTrack, nil 320}