1-- Enhanced discriminant extraction with comprehensive edition/version patterns
2-- This migration improves the auto-population of discriminants for better metadata handling
3
4-- Drop existing functions to replace them with enhanced versions
5DROP FUNCTION IF EXISTS extract_discriminant(TEXT);
6DROP FUNCTION IF EXISTS get_base_name(TEXT);
7
8-- Enhanced function to extract discriminants with comprehensive patterns
9CREATE OR REPLACE FUNCTION extract_discriminant(name_text TEXT) RETURNS TEXT AS $$
10DECLARE
11 -- Comprehensive patterns for discriminant extraction
12 discriminant_patterns TEXT[] := ARRAY[
13 -- Parentheses patterns
14 '\(([^)]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?)\)',
15 '\(([^)]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?)\)',
16 '\(([^)]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?)\)',
17 '\(([^)]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?)\)',
18 '\(([^)]*(?:from|soundtrack|ost|score|theme).*?)\)',
19
20 -- Brackets patterns
21 '\[([^]]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?)\]',
22 '\[([^]]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?)\]',
23 '\[([^]]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?)\]',
24 '\[([^]]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?)\]',
25 '\[([^]]*(?:from|soundtrack|ost|score|theme).*?)\]',
26
27 -- Braces patterns
28 '\{([^}]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?)\}',
29 '\{([^}]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?)\}',
30 '\{([^}]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?)\}',
31 '\{([^}]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?)\}',
32 '\{([^}]*(?:from|soundtrack|ost|score|theme).*?)\}',
33
34 -- Dash/hyphen patterns (common for editions)
35 '[-–—]\s*([^-–—]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray).*?)$',
36 '[-–—]\s*(\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?$',
37
38 -- Colon patterns (common for subtitles and versions)
39 ':\s*([^:]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive).*?)$',
40 ':\s*(\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?$'
41 ];
42
43 pattern TEXT;
44 match_result TEXT;
45BEGIN
46 -- Return early if input is null or empty
47 IF name_text IS NULL OR trim(name_text) = '' THEN
48 RETURN NULL;
49 END IF;
50
51 -- Try each pattern to find discriminant information
52 FOREACH pattern IN ARRAY discriminant_patterns
53 LOOP
54 SELECT substring(name_text FROM pattern COLLATE "C") INTO match_result;
55 IF match_result IS NOT NULL AND length(trim(match_result)) > 0 THEN
56 -- Clean up the match result
57 match_result := trim(match_result);
58 -- Remove leading/trailing punctuation
59 match_result := regexp_replace(match_result, '^[^\w]+|[^\w]+$', '', 'g');
60 -- Ensure it's not just whitespace or empty after cleanup
61 IF length(trim(match_result)) > 0 THEN
62 RETURN match_result;
63 END IF;
64 END IF;
65 END LOOP;
66
67 RETURN NULL;
68END;
69$$ LANGUAGE plpgsql IMMUTABLE;
70
71-- Enhanced function to get base name without discriminant
72CREATE OR REPLACE FUNCTION get_base_name(name_text TEXT) RETURNS TEXT AS $$
73DECLARE
74 -- Comprehensive cleanup patterns matching the extraction patterns
75 cleanup_patterns TEXT[] := ARRAY[
76 -- Remove parentheses content
77 '\s*\([^)]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?\)\s*',
78 '\s*\([^)]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?\)\s*',
79 '\s*\([^)]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?\)\s*',
80 '\s*\([^)]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?\)\s*',
81 '\s*\([^)]*(?:from|soundtrack|ost|score|theme).*?\)\s*',
82
83 -- Remove brackets content
84 '\s*\[[^]]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?\]\s*',
85 '\s*\[[^]]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?\]\s*',
86 '\s*\[[^]]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?\]\s*',
87 '\s*\[[^]]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?\]\s*',
88 '\s*\[[^]]*(?:from|soundtrack|ost|score|theme).*?\]\s*',
89
90 -- Remove braces content
91 '\s*\{[^}]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?\}\s*',
92 '\s*\{[^}]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?\}\s*',
93 '\s*\{[^}]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?\}\s*',
94 '\s*\{[^}]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?\}\s*',
95 '\s*\{[^}]*(?:from|soundtrack|ost|score|theme).*?\}\s*',
96
97 -- Remove dash/hyphen patterns
98 '\s*[-–—]\s*[^-–—]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray).*?$',
99 '\s*[-–—]\s*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?$',
100
101 -- Remove colon patterns
102 '\s*:\s*[^:]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive).*?$',
103 '\s*:\s*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?$'
104 ];
105
106 pattern TEXT;
107 result_text TEXT := name_text;
108BEGIN
109 -- Return early if input is null or empty
110 IF name_text IS NULL OR trim(name_text) = '' THEN
111 RETURN name_text;
112 END IF;
113
114 -- Remove discriminant patterns to get base name
115 FOREACH pattern IN ARRAY cleanup_patterns
116 LOOP
117 result_text := regexp_replace(result_text, pattern, ' ', 'gi');
118 END LOOP;
119
120 -- Clean up extra whitespace and normalize
121 result_text := regexp_replace(trim(result_text), '\s+', ' ', 'g');
122
123 -- Remove trailing punctuation that might be left after removal
124 result_text := regexp_replace(result_text, '[,;:\-–—]\s*$', '', 'g');
125 result_text := trim(result_text);
126
127 -- Ensure we don't return an empty string
128 IF length(result_text) = 0 THEN
129 RETURN name_text;
130 END IF;
131
132 RETURN result_text;
133END;
134$$ LANGUAGE plpgsql IMMUTABLE;
135
136-- Create function to extract discriminant specifically for editions and versions
137CREATE OR REPLACE FUNCTION extract_edition_discriminant(name_text TEXT) RETURNS TEXT AS $$
138DECLARE
139 -- Focused patterns for edition/version extraction
140 edition_patterns TEXT[] := ARRAY[
141 -- Edition patterns
142 '\(([^)]*edition[^)]*)\)',
143 '\[([^]]*edition[^]]*)\]',
144 '\{([^}]*edition[^}]*)\}',
145 '[-–—]\s*([^-–—]*edition[^-–—]*)$',
146 ':\s*([^:]*edition[^:]*)$',
147
148 -- Version patterns
149 '\(([^)]*version[^)]*)\)',
150 '\[([^]]*version[^]]*)\]',
151 '\{([^}]*version[^}]*)\}',
152 '[-–—]\s*([^-–—]*version[^-–—]*)$',
153 ':\s*([^:]*version[^:]*)$',
154
155 -- Remaster patterns
156 '\(([^)]*remaster[^)]*)\)',
157 '\[([^]]*remaster[^]]*)\]',
158 '\{([^}]*remaster[^}]*)\}',
159 '[-–—]\s*([^-–—]*remaster[^-–—]*)$',
160 ':\s*([^:]*remaster[^:]*)$',
161
162 -- Year-based patterns
163 '\(([^)]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^)]*)\)',
164 '\[([^]]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^]]*)\]',
165 '\{([^}]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^}]*)\}'
166 ];
167
168 pattern TEXT;
169 match_result TEXT;
170BEGIN
171 -- Return early if input is null or empty
172 IF name_text IS NULL OR trim(name_text) = '' THEN
173 RETURN NULL;
174 END IF;
175
176 -- Try edition-specific patterns first
177 FOREACH pattern IN ARRAY edition_patterns
178 LOOP
179 SELECT substring(name_text FROM pattern COLLATE "C") INTO match_result;
180 IF match_result IS NOT NULL AND length(trim(match_result)) > 0 THEN
181 match_result := trim(match_result);
182 match_result := regexp_replace(match_result, '^[^\w]+|[^\w]+$', '', 'g');
183 IF length(trim(match_result)) > 0 THEN
184 RETURN match_result;
185 END IF;
186 END IF;
187 END LOOP;
188
189 RETURN NULL;
190END;
191$$ LANGUAGE plpgsql IMMUTABLE;
192
193-- Update recordings table to populate discriminants from existing names
194UPDATE recordings
195SET discriminant = extract_discriminant(name)
196WHERE discriminant IS NULL
197 AND extract_discriminant(name) IS NOT NULL;
198
199-- Update releases table to populate discriminants from existing names
200UPDATE releases
201SET discriminant = extract_discriminant(name)
202WHERE discriminant IS NULL
203 AND extract_discriminant(name) IS NOT NULL;
204
205-- Update plays table to populate discriminants from existing names where not already set
206UPDATE plays
207SET track_discriminant = extract_discriminant(track_name)
208WHERE track_discriminant IS NULL
209 AND extract_discriminant(track_name) IS NOT NULL;
210
211UPDATE plays
212SET release_discriminant = extract_discriminant(release_name)
213WHERE release_discriminant IS NULL
214 AND release_name IS NOT NULL
215 AND extract_discriminant(release_name) IS NOT NULL;
216
217-- Create indexes for efficient discriminant queries
218CREATE INDEX IF NOT EXISTS idx_recordings_name_discriminant ON recordings (name, discriminant);
219CREATE INDEX IF NOT EXISTS idx_releases_name_discriminant ON releases (name, discriminant);
220
221-- Add comments for the new function
222COMMENT ON FUNCTION extract_discriminant IS 'Enhanced discriminant extraction supporting comprehensive edition/version patterns including parentheses, brackets, braces, dashes, and colons';
223COMMENT ON FUNCTION get_base_name IS 'Enhanced base name extraction removing comprehensive discriminant patterns to enable proper grouping';
224COMMENT ON FUNCTION extract_edition_discriminant IS 'Specialized function for extracting edition and version discriminants with focused patterns';
225
226-- Create a view to show discriminant extraction results for analysis
227CREATE OR REPLACE VIEW discriminant_analysis AS
228SELECT
229 'recordings' as table_name,
230 name as original_name,
231 discriminant,
232 get_base_name(name) as base_name,
233 extract_discriminant(name) as extracted_discriminant,
234 extract_edition_discriminant(name) as edition_discriminant
235FROM recordings
236WHERE name IS NOT NULL
237UNION ALL
238SELECT
239 'releases' as table_name,
240 name as original_name,
241 discriminant,
242 get_base_name(name) as base_name,
243 extract_discriminant(name) as extracted_discriminant,
244 extract_edition_discriminant(name) as edition_discriminant
245FROM releases
246WHERE name IS NOT NULL;
247
248COMMENT ON VIEW discriminant_analysis IS 'Analysis view showing discriminant extraction results for quality assessment and debugging';
249
250-- Refresh materialized views to include discriminant information
251REFRESH MATERIALIZED VIEW mv_release_play_counts;
252REFRESH MATERIALIZED VIEW mv_recording_play_counts;
253
254-- Create summary statistics for discriminant usage
255CREATE OR REPLACE VIEW discriminant_stats AS
256SELECT
257 'recordings' as entity_type,
258 COUNT(*) as total_count,
259 COUNT(CASE WHEN discriminant IS NOT NULL THEN 1 END) as with_discriminant,
260 COUNT(CASE WHEN discriminant IS NULL AND extract_discriminant(name) IS NOT NULL THEN 1 END) as extractable_discriminant,
261 ROUND(
262 COUNT(CASE WHEN discriminant IS NOT NULL THEN 1 END) * 100.0 / COUNT(*), 2
263 ) as discriminant_percentage
264FROM recordings
265UNION ALL
266SELECT
267 'releases' as entity_type,
268 COUNT(*) as total_count,
269 COUNT(CASE WHEN discriminant IS NOT NULL THEN 1 END) as with_discriminant,
270 COUNT(CASE WHEN discriminant IS NULL AND extract_discriminant(name) IS NOT NULL THEN 1 END) as extractable_discriminant,
271 ROUND(
272 COUNT(CASE WHEN discriminant IS NOT NULL THEN 1 END) * 100.0 / COUNT(*), 2
273 ) as discriminant_percentage
274FROM releases;
275
276COMMENT ON VIEW discriminant_stats IS 'Statistics showing discriminant usage and extraction potential across entity types';