1-- Fix case sensitivity in discriminant extraction patterns
2-- This migration updates the discriminant extraction functions to properly handle case-insensitive matching
3
4-- Drop dependent views first, then functions, then recreate everything
5DROP VIEW IF EXISTS discriminant_analysis CASCADE;
6DROP VIEW IF EXISTS discriminant_stats CASCADE;
7
8-- Drop existing functions to replace with case-insensitive versions
9DROP FUNCTION IF EXISTS extract_discriminant(TEXT) CASCADE;
10DROP FUNCTION IF EXISTS get_base_name(TEXT) CASCADE;
11DROP FUNCTION IF EXISTS extract_edition_discriminant(TEXT) CASCADE;
12
13-- Enhanced function to extract discriminants with case-insensitive matching
14CREATE OR REPLACE FUNCTION extract_discriminant(name_text TEXT) RETURNS TEXT AS $$
15DECLARE
16 -- Comprehensive patterns for discriminant extraction with case-insensitive flags
17 discriminant_patterns TEXT[] := ARRAY[
18 -- Parentheses patterns
19 '(?i)\(([^)]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?)\)',
20 '(?i)\(([^)]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?)\)',
21 '(?i)\(([^)]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?)\)',
22 '(?i)\(([^)]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?)\)',
23 '(?i)\(([^)]*(?:from|soundtrack|ost|score|theme).*?)\)',
24
25 -- Brackets patterns
26 '(?i)\[([^]]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?)\]',
27 '(?i)\[([^]]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?)\]',
28 '(?i)\[([^]]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?)\]',
29 '(?i)\[([^]]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?)\]',
30 '(?i)\[([^]]*(?:from|soundtrack|ost|score|theme).*?)\]',
31
32 -- Braces patterns
33 '(?i)\{([^}]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?)\}',
34 '(?i)\{([^}]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?)\}',
35 '(?i)\{([^}]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?)\}',
36 '(?i)\{([^}]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?)\}',
37 '(?i)\{([^}]*(?:from|soundtrack|ost|score|theme).*?)\}',
38
39 -- Dash/hyphen patterns (common for editions)
40 '(?i)[-–—]\s*([^-–—]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray).*?)$',
41 '(?i)[-–—]\s*(\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?$',
42
43 -- Colon patterns (common for subtitles and versions)
44 '(?i):\s*([^:]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive).*?)$',
45 '(?i):\s*(\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?$'
46 ];
47
48 pattern TEXT;
49 match_result TEXT;
50BEGIN
51 -- Return early if input is null or empty
52 IF name_text IS NULL OR trim(name_text) = '' THEN
53 RETURN NULL;
54 END IF;
55
56 -- Try each pattern to find discriminant information
57 FOREACH pattern IN ARRAY discriminant_patterns
58 LOOP
59 SELECT substring(name_text FROM pattern) INTO match_result;
60 IF match_result IS NOT NULL AND length(trim(match_result)) > 0 THEN
61 -- Clean up the match result
62 match_result := trim(match_result);
63 -- Remove leading/trailing punctuation
64 match_result := regexp_replace(match_result, '^[^\w]+|[^\w]+$', '', 'g');
65 -- Ensure it's not just whitespace or empty after cleanup
66 IF length(trim(match_result)) > 0 THEN
67 RETURN match_result;
68 END IF;
69 END IF;
70 END LOOP;
71
72 RETURN NULL;
73END;
74$$ LANGUAGE plpgsql IMMUTABLE;
75
76-- Enhanced function to get base name without discriminant with case-insensitive matching
77CREATE OR REPLACE FUNCTION get_base_name(name_text TEXT) RETURNS TEXT AS $$
78DECLARE
79 -- Comprehensive cleanup patterns matching the extraction patterns
80 cleanup_patterns TEXT[] := ARRAY[
81 -- Remove parentheses content
82 '(?i)\s*\([^)]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?\)\s*',
83 '(?i)\s*\([^)]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?\)\s*',
84 '(?i)\s*\([^)]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?\)\s*',
85 '(?i)\s*\([^)]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?\)\s*',
86 '(?i)\s*\([^)]*(?:from|soundtrack|ost|score|theme).*?\)\s*',
87
88 -- Remove brackets content
89 '(?i)\s*\[[^]]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?\]\s*',
90 '(?i)\s*\[[^]]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?\]\s*',
91 '(?i)\s*\[[^]]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?\]\s*',
92 '(?i)\s*\[[^]]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?\]\s*',
93 '(?i)\s*\[[^]]*(?:from|soundtrack|ost|score|theme).*?\]\s*',
94
95 -- Remove braces content
96 '(?i)\s*\{[^}]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray|hdtv|web|retail|promo|single|ep|lp|maxi|mini|radio|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie|alternative).*?\}\s*',
97 '(?i)\s*\{[^}]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?\}\s*',
98 '(?i)\s*\{[^}]*(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+.*?\}\s*',
99 '(?i)\s*\{[^}]*(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+.*?\}\s*',
100 '(?i)\s*\{[^}]*(?:from|soundtrack|ost|score|theme).*?\}\s*',
101
102 -- Remove dash/hyphen patterns
103 '(?i)\s*[-–—]\s*[^-–—]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive|digital|vinyl|cd|dvd|blu-ray).*?$',
104 '(?i)\s*[-–—]\s*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?$',
105
106 -- Remove colon patterns
107 '(?i)\s*:\s*[^:]*(?:deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive).*?$',
108 '(?i)\s*:\s*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release).*?$'
109 ];
110
111 pattern TEXT;
112 result_text TEXT := name_text;
113BEGIN
114 -- Return early if input is null or empty
115 IF name_text IS NULL OR trim(name_text) = '' THEN
116 RETURN name_text;
117 END IF;
118
119 -- Remove discriminant patterns to get base name
120 FOREACH pattern IN ARRAY cleanup_patterns
121 LOOP
122 result_text := regexp_replace(result_text, pattern, ' ', 'g');
123 END LOOP;
124
125 -- Clean up extra whitespace and normalize
126 result_text := regexp_replace(trim(result_text), '\s+', ' ', 'g');
127
128 -- Remove trailing punctuation that might be left after removal
129 result_text := regexp_replace(result_text, '[,;:\-–—]\s*$', '', 'g');
130 result_text := trim(result_text);
131
132 -- Ensure we don't return an empty string
133 IF length(result_text) = 0 THEN
134 RETURN name_text;
135 END IF;
136
137 RETURN result_text;
138END;
139$$ LANGUAGE plpgsql IMMUTABLE;
140
141-- Enhanced function to extract discriminant specifically for editions and versions with case-insensitive matching
142CREATE OR REPLACE FUNCTION extract_edition_discriminant(name_text TEXT) RETURNS TEXT AS $$
143DECLARE
144 -- Focused patterns for edition/version extraction with case-insensitive flags
145 edition_patterns TEXT[] := ARRAY[
146 -- Edition patterns
147 '(?i)\(([^)]*edition[^)]*)\)',
148 '(?i)\[([^]]*edition[^]]*)\]',
149 '(?i)\{([^}]*edition[^}]*)\}',
150 '(?i)[-–—]\s*([^-–—]*edition[^-–—]*)$',
151 '(?i):\s*([^:]*edition[^:]*)$',
152
153 -- Version patterns
154 '(?i)\(([^)]*version[^)]*)\)',
155 '(?i)\[([^]]*version[^]]*)\]',
156 '(?i)\{([^}]*version[^}]*)\}',
157 '(?i)[-–—]\s*([^-–—]*version[^-–—]*)$',
158 '(?i):\s*([^:]*version[^:]*)$',
159
160 -- Remaster patterns
161 '(?i)\(([^)]*remaster[^)]*)\)',
162 '(?i)\[([^]]*remaster[^]]*)\]',
163 '(?i)\{([^}]*remaster[^}]*)\}',
164 '(?i)[-–—]\s*([^-–—]*remaster[^-–—]*)$',
165 '(?i):\s*([^:]*remaster[^:]*)$',
166
167 -- Year-based patterns
168 '(?i)\(([^)]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^)]*)\)',
169 '(?i)\[([^]]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^]]*)\]',
170 '(?i)\{([^}]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^}]*)\}'
171 ];
172
173 pattern TEXT;
174 match_result TEXT;
175BEGIN
176 -- Return early if input is null or empty
177 IF name_text IS NULL OR trim(name_text) = '' THEN
178 RETURN NULL;
179 END IF;
180
181 -- Try edition-specific patterns first
182 FOREACH pattern IN ARRAY edition_patterns
183 LOOP
184 SELECT substring(name_text FROM pattern) INTO match_result;
185 IF match_result IS NOT NULL AND length(trim(match_result)) > 0 THEN
186 match_result := trim(match_result);
187 match_result := regexp_replace(match_result, '^[^\w]+|[^\w]+$', '', 'g');
188 IF length(trim(match_result)) > 0 THEN
189 RETURN match_result;
190 END IF;
191 END IF;
192 END LOOP;
193
194 RETURN NULL;
195END;
196$$ LANGUAGE plpgsql IMMUTABLE;
197
198-- Update existing records with newly extracted discriminants (case-insensitive)
199UPDATE recordings
200SET discriminant = extract_discriminant(name)
201WHERE discriminant IS NULL
202 AND extract_discriminant(name) IS NOT NULL;
203
204UPDATE releases
205SET discriminant = extract_discriminant(name)
206WHERE discriminant IS NULL
207 AND extract_discriminant(name) IS NOT NULL;
208
209UPDATE plays
210SET track_discriminant = extract_discriminant(track_name)
211WHERE track_discriminant IS NULL
212 AND extract_discriminant(track_name) IS NOT NULL;
213
214UPDATE plays
215SET release_discriminant = extract_discriminant(release_name)
216WHERE release_discriminant IS NULL
217 AND release_name IS NOT NULL
218 AND extract_discriminant(release_name) IS NOT NULL;
219
220-- Update comments for the enhanced functions
221COMMENT ON FUNCTION extract_discriminant IS 'Enhanced case-insensitive discriminant extraction supporting comprehensive edition/version patterns including parentheses, brackets, braces, dashes, and colons';
222COMMENT ON FUNCTION get_base_name IS 'Enhanced case-insensitive base name extraction removing comprehensive discriminant patterns to enable proper grouping';
223COMMENT ON FUNCTION extract_edition_discriminant IS 'Specialized case-insensitive function for extracting edition and version discriminants with focused patterns';
224
225-- Refresh materialized views to reflect the case-insensitive improvements
226REFRESH MATERIALIZED VIEW mv_release_play_counts;
227REFRESH MATERIALIZED VIEW mv_recording_play_counts;
228
229-- Update discriminant analysis view to include case-insensitive results
230DROP VIEW IF EXISTS discriminant_analysis;
231CREATE OR REPLACE VIEW discriminant_analysis AS
232SELECT
233 'recordings' as table_name,
234 name as original_name,
235 discriminant,
236 get_base_name(name) as base_name,
237 extract_discriminant(name) as extracted_discriminant,
238 extract_edition_discriminant(name) as edition_discriminant
239FROM recordings
240WHERE name IS NOT NULL
241UNION ALL
242SELECT
243 'releases' as table_name,
244 name as original_name,
245 discriminant,
246 get_base_name(name) as base_name,
247 extract_discriminant(name) as extracted_discriminant,
248 extract_edition_discriminant(name) as edition_discriminant
249FROM releases
250WHERE name IS NOT NULL;
251
252COMMENT ON VIEW discriminant_analysis IS 'Analysis view showing case-insensitive discriminant extraction results for quality assessment and debugging';