1-- Migration to add fuzzy text matching capabilities
2-- This enables better artist name matching using trigram similarity
3
4-- Enable pg_trgm extension for trigram similarity matching
5CREATE EXTENSION IF NOT EXISTS pg_trgm;
6
7-- Create indexes for efficient trigram matching on artist names
8CREATE INDEX idx_artists_extended_name_trgm ON artists_extended USING gin (name gin_trgm_ops);
9CREATE INDEX idx_artists_extended_name_normalized_trgm ON artists_extended USING gin (name_normalized gin_trgm_ops);
10
11-- Create a function to calculate comprehensive artist similarity
12CREATE OR REPLACE FUNCTION calculate_artist_similarity(
13 input_name TEXT,
14 existing_name TEXT,
15 input_album TEXT DEFAULT NULL,
16 existing_album TEXT DEFAULT NULL
17) RETURNS FLOAT AS $$
18DECLARE
19 name_similarity FLOAT;
20 album_similarity FLOAT := 0.0;
21 final_score FLOAT;
22BEGIN
23 -- Calculate trigram similarity for artist names
24 name_similarity := similarity(LOWER(TRIM(input_name)), LOWER(TRIM(existing_name)));
25
26 -- Boost for exact matches after normalization
27 IF LOWER(TRIM(regexp_replace(input_name, '[^a-zA-Z0-9\s]', '', 'g'))) =
28 LOWER(TRIM(regexp_replace(existing_name, '[^a-zA-Z0-9\s]', '', 'g'))) THEN
29 name_similarity := GREATEST(name_similarity, 0.95);
30 END IF;
31
32 -- Factor in album similarity if both are provided
33 IF input_album IS NOT NULL AND existing_album IS NOT NULL THEN
34 album_similarity := similarity(LOWER(TRIM(input_album)), LOWER(TRIM(existing_album)));
35 -- Weight: 80% name, 20% album
36 final_score := (name_similarity * 0.8) + (album_similarity * 0.2);
37 ELSE
38 final_score := name_similarity;
39 END IF;
40
41 RETURN final_score;
42END;
43$$ LANGUAGE plpgsql IMMUTABLE;
44
45-- Create a view for fuzzy artist matching with confidence scores
46CREATE VIEW fuzzy_artist_matches AS
47SELECT DISTINCT
48 ae1.id as query_artist_id,
49 ae1.name as query_artist_name,
50 ae1.mbid_type as query_mbid_type,
51 ae2.id as match_artist_id,
52 ae2.name as match_artist_name,
53 ae2.mbid as match_mbid,
54 ae2.mbid_type as match_mbid_type,
55 similarity(LOWER(TRIM(ae1.name)), LOWER(TRIM(ae2.name))) as name_similarity,
56 CASE
57 WHEN ae2.mbid_type = 'musicbrainz' THEN 'upgrade_to_mb'
58 WHEN ae1.mbid_type = 'musicbrainz' AND ae2.mbid_type = 'synthetic' THEN 'consolidate_to_mb'
59 ELSE 'merge_synthetic'
60 END as match_action
61FROM artists_extended ae1
62CROSS JOIN artists_extended ae2
63WHERE ae1.id != ae2.id
64AND similarity(LOWER(TRIM(ae1.name)), LOWER(TRIM(ae2.name))) > 0.8
65AND (
66 ae1.mbid_type = 'synthetic' OR ae2.mbid_type = 'musicbrainz'
67);
68
69-- Add comments
70COMMENT ON EXTENSION pg_trgm IS 'Trigram extension for fuzzy text matching';
71COMMENT ON INDEX idx_artists_extended_name_trgm IS 'GIN index for trigram similarity on artist names';
72COMMENT ON FUNCTION calculate_artist_similarity IS 'Calculates similarity score between artists considering name and optional album context';
73COMMENT ON VIEW fuzzy_artist_matches IS 'Shows potential artist matches with confidence scores and recommended actions';
74
75-- Create a function to suggest artist consolidations
76CREATE OR REPLACE FUNCTION suggest_artist_consolidations(min_similarity FLOAT DEFAULT 0.9)
77RETURNS TABLE(
78 action TEXT,
79 synthetic_artist TEXT,
80 target_artist TEXT,
81 similarity_score FLOAT,
82 synthetic_plays INTEGER,
83 target_plays INTEGER
84) AS $$
85BEGIN
86 RETURN QUERY
87 SELECT
88 fam.match_action as action,
89 fam.query_artist_name as synthetic_artist,
90 fam.match_artist_name as target_artist,
91 fam.name_similarity as similarity_score,
92 (SELECT COUNT(*)::INTEGER FROM play_to_artists_extended WHERE artist_id = fam.query_artist_id) as synthetic_plays,
93 (SELECT COUNT(*)::INTEGER FROM play_to_artists_extended WHERE artist_id = fam.match_artist_id) as target_plays
94 FROM fuzzy_artist_matches fam
95 WHERE fam.name_similarity >= min_similarity
96 AND fam.match_action = 'upgrade_to_mb'
97 ORDER BY fam.name_similarity DESC, synthetic_plays DESC;
98END;
99$$ LANGUAGE plpgsql;
100
101COMMENT ON FUNCTION suggest_artist_consolidations IS 'Returns suggestions for consolidating synthetic artists with MusicBrainz artists based on similarity';