+52
.dockerignore
+52
.dockerignore
···
1
+
# Rust build artifacts
2
+
target/
3
+
**/target/
4
+
services/target/
5
+
apps/*/target/
6
+
7
+
# Node.js dependencies and build artifacts
8
+
node_modules/
9
+
**/node_modules/
10
+
.turbo/
11
+
**/.turbo/
12
+
build/
13
+
dist/
14
+
.next/
15
+
16
+
# Development and cache files
17
+
.git/
18
+
.gitignore
19
+
**/.DS_Store
20
+
*.log
21
+
*.tmp
22
+
*.temp
23
+
24
+
# IDE and editor files
25
+
.vscode/
26
+
.idea/
27
+
*.swp
28
+
*.swo
29
+
*~
30
+
31
+
# Environment and config files
32
+
.env
33
+
.env.local
34
+
.env.*.local
35
+
36
+
# Database files
37
+
*.db
38
+
*.sqlite
39
+
*.sqlite3
40
+
41
+
# Test coverage
42
+
coverage/
43
+
**/coverage/
44
+
45
+
# Temporary files
46
+
tmp/
47
+
temp/
48
+
49
+
# SQLx offline query cache
50
+
# Include workspace-level cache for monorepo builds
51
+
# Uncomment the line below if you want to force online compilation
52
+
# .sqlx/
+4
-12
.github/workflows/amethyst.yml
+4
-12
.github/workflows/amethyst.yml
···
46
46
run: pnpm lex:gen-server
47
47
48
48
- name: Build web
49
-
run: |
50
-
cd apps/amethyst
51
-
pnpm build:web
49
+
run: pnpm turbo build:web --filter=@teal/amethyst
52
50
53
51
- name: Upload web build artifacts
54
52
uses: actions/upload-artifact@v4
···
84
82
run: npm install -g @expo/cli
85
83
86
84
- name: Build iOS
87
-
run: |
88
-
cd apps/amethyst
89
-
pnpm build:ios
85
+
run: pnpm turbo build:ios --filter=@teal/amethyst
90
86
91
87
- name: Upload iOS build artifacts
92
88
uses: actions/upload-artifact@v4
···
118
114
run: pnpm lex:gen-server
119
115
120
116
- name: Type check
121
-
run: |
122
-
cd apps/amethyst
123
-
npx tsc --noEmit
117
+
run: pnpm turbo check-types --filter=@teal/amethyst
124
118
125
119
- name: Run tests
126
-
run: |
127
-
cd apps/amethyst
128
-
pnpm test --watchAll=false
120
+
run: pnpm turbo test --filter=@teal/amethyst
+1
-1
.github/workflows/aqua.yml
+1
-1
.github/workflows/aqua.yml
-16
.gitignore
-16
.gitignore
···
65
65
vendor/**/*.d.ts
66
66
vendor/**/dist/
67
67
vendor/**/node_modules/
68
-
69
-
# lexicons directory structure
70
-
!lexicons/
71
-
# Track our custom lexicons
72
-
!lexicons/fm.teal.alpha/
73
-
!lexicons/fm.teal.alpha/**/*.json
74
-
# Ignore symlinks to atproto lexicons (created during setup)
75
-
lexicons/app
76
-
lexicons/chat
77
-
lexicons/com
78
-
lexicons/tools
79
-
# But ignore any generated files within lexicons
80
-
lexicons/**/*.js
81
-
lexicons/**/*.d.ts
82
-
lexicons/**/dist/
83
-
lexicons/**/node_modules/
+30
-66
Cargo.lock
+30
-66
Cargo.lock
···
128
128
"chrono",
129
129
"clap",
130
130
"dotenvy",
131
-
"iroh-car 0.4.0",
131
+
"iroh-car",
132
132
"redis",
133
133
"reqwest",
134
134
"serde",
···
199
199
"dashmap",
200
200
"futures",
201
201
"ipld-core",
202
-
"iroh-car 0.5.1",
202
+
"iroh-car",
203
203
"log",
204
204
"multihash 0.19.3",
205
205
"serde",
···
372
372
]
373
373
374
374
[[package]]
375
+
name = "backon"
376
+
version = "1.5.2"
377
+
source = "registry+https://github.com/rust-lang/crates.io-index"
378
+
checksum = "592277618714fbcecda9a02ba7a8781f319d26532a88553bbacc77ba5d2b3a8d"
379
+
dependencies = [
380
+
"fastrand",
381
+
]
382
+
383
+
[[package]]
375
384
name = "backtrace"
376
385
version = "0.3.75"
377
386
source = "registry+https://github.com/rust-lang/crates.io-index"
···
552
561
"dotenvy",
553
562
"flume",
554
563
"futures",
555
-
"iroh-car 0.4.0",
564
+
"iroh-car",
556
565
"libipld",
557
566
"metrics 0.23.1",
558
567
"metrics-exporter-prometheus",
···
1875
1884
1876
1885
[[package]]
1877
1886
name = "iroh-car"
1878
-
version = "0.4.0"
1879
-
source = "registry+https://github.com/rust-lang/crates.io-index"
1880
-
checksum = "475a6f0ebd64c87ea011021c67f10b57930f6c286e0163807066bfb83553b1b6"
1881
-
dependencies = [
1882
-
"anyhow",
1883
-
"cid 0.10.1",
1884
-
"futures",
1885
-
"libipld",
1886
-
"thiserror 1.0.69",
1887
-
"tokio",
1888
-
"unsigned-varint 0.7.2",
1889
-
]
1890
-
1891
-
[[package]]
1892
-
name = "iroh-car"
1893
1887
version = "0.5.1"
1894
1888
source = "registry+https://github.com/rust-lang/crates.io-index"
1895
1889
checksum = "cb7f8cd4cb9aa083fba8b52e921764252d0b4dcb1cd6d120b809dbfe1106e81a"
···
2497
2491
]
2498
2492
2499
2493
[[package]]
2494
+
name = "num-bigint"
2495
+
version = "0.4.6"
2496
+
source = "registry+https://github.com/rust-lang/crates.io-index"
2497
+
checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
2498
+
dependencies = [
2499
+
"num-integer",
2500
+
"num-traits",
2501
+
]
2502
+
2503
+
[[package]]
2500
2504
name = "num-bigint-dig"
2501
2505
version = "0.8.4"
2502
2506
source = "registry+https://github.com/rust-lang/crates.io-index"
···
2689
2693
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
2690
2694
2691
2695
[[package]]
2692
-
name = "pin-project"
2693
-
version = "1.1.10"
2694
-
source = "registry+https://github.com/rust-lang/crates.io-index"
2695
-
checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
2696
-
dependencies = [
2697
-
"pin-project-internal",
2698
-
]
2699
-
2700
-
[[package]]
2701
-
name = "pin-project-internal"
2702
-
version = "1.1.10"
2703
-
source = "registry+https://github.com/rust-lang/crates.io-index"
2704
-
checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
2705
-
dependencies = [
2706
-
"proc-macro2",
2707
-
"quote",
2708
-
"syn 2.0.104",
2709
-
]
2710
-
2711
-
[[package]]
2712
2696
name = "pin-project-lite"
2713
2697
version = "0.2.16"
2714
2698
source = "registry+https://github.com/rust-lang/crates.io-index"
···
2915
2899
"once_cell",
2916
2900
"socket2 0.5.10",
2917
2901
"tracing",
2918
-
"windows-sys 0.52.0",
2902
+
"windows-sys 0.59.0",
2919
2903
]
2920
2904
2921
2905
[[package]]
···
3012
2996
3013
2997
[[package]]
3014
2998
name = "redis"
3015
-
version = "0.24.0"
2999
+
version = "0.32.4"
3016
3000
source = "registry+https://github.com/rust-lang/crates.io-index"
3017
-
checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
3001
+
checksum = "e1f66bf4cac9733a23bcdf1e0e01effbaaad208567beba68be8f67e5f4af3ee1"
3018
3002
dependencies = [
3019
3003
"arc-swap",
3020
-
"async-trait",
3004
+
"backon",
3021
3005
"bytes",
3006
+
"cfg-if",
3022
3007
"combine",
3023
-
"futures",
3008
+
"futures-channel",
3024
3009
"futures-util",
3025
3010
"itoa",
3011
+
"num-bigint",
3026
3012
"percent-encoding",
3027
3013
"pin-project-lite",
3028
3014
"ryu",
3029
3015
"sha1_smol",
3030
-
"socket2 0.4.10",
3016
+
"socket2 0.6.0",
3031
3017
"tokio",
3032
-
"tokio-retry",
3033
3018
"tokio-util",
3034
3019
"url",
3035
3020
]
···
3605
3590
3606
3591
[[package]]
3607
3592
name = "socket2"
3608
-
version = "0.4.10"
3609
-
source = "registry+https://github.com/rust-lang/crates.io-index"
3610
-
checksum = "9f7916fc008ca5542385b89a3d3ce689953c143e9304a9bf8beec1de48994c0d"
3611
-
dependencies = [
3612
-
"libc",
3613
-
"winapi",
3614
-
]
3615
-
3616
-
[[package]]
3617
-
name = "socket2"
3618
3593
version = "0.5.10"
3619
3594
source = "registry+https://github.com/rust-lang/crates.io-index"
3620
3595
checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678"
···
4171
4146
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
4172
4147
dependencies = [
4173
4148
"native-tls",
4174
-
"tokio",
4175
-
]
4176
-
4177
-
[[package]]
4178
-
name = "tokio-retry"
4179
-
version = "0.3.0"
4180
-
source = "registry+https://github.com/rust-lang/crates.io-index"
4181
-
checksum = "7f57eb36ecbe0fc510036adff84824dd3c24bb781e21bfa67b69d556aa85214f"
4182
-
dependencies = [
4183
-
"pin-project",
4184
-
"rand 0.8.5",
4185
4149
"tokio",
4186
4150
]
4187
4151
+2
-2
Cargo.toml
+2
-2
Cargo.toml
···
34
34
rocketman = { path = "services/rocketman" }
35
35
36
36
# CAR and IPLD dependencies
37
-
iroh-car = "0.4"
37
+
iroh-car = "0.5"
38
38
libipld = { version = "0.16", features = ["dag-cbor", "dag-json"] }
39
39
cid = "0.11"
40
40
base64 = "0.22"
41
41
atmst = "0.0.1"
42
42
43
43
# Redis for job queues and caching
44
-
redis = { version = "0.24", features = ["tokio-comp", "connection-manager"] }
44
+
redis = { version = "0.32", features = ["tokio-comp", "connection-manager"] }
+4
-4
apps/aqua/Dockerfile
+4
-4
apps/aqua/Dockerfile
···
41
41
# Set up cross-compilation environment
42
42
ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc
43
43
44
+
44
45
# Source target.sh to resolve the Rust target, then run the build
45
-
RUN echo "DEBUG Before target.sh: TARGETPLATFORM=$TARGETPLATFORM TARGETARCH=$TARGETARCH" && \
46
-
. ./target.sh && \
47
-
touch src/main.rs && \
46
+
RUN . ./target.sh && \
47
+
touch apps/aqua/src/main.rs && \
48
48
echo "Building for $TARGET_ARCH" && \
49
-
cargo build --release --target $RUST_TARGET && \
49
+
cargo build --release --target $RUST_TARGET --package aqua && \
50
50
cp target/$RUST_TARGET/release/aqua target/aqua
51
51
52
52
FROM --platform=${TARGETARCH:-$BUILDPLATFORM} gcr.io/distroless/cc
apps/aqua/target.sh
target.sh
apps/aqua/target.sh
target.sh
+1
lexicons/app
+1
lexicons/app
···
1
+
../vendor/atproto/lexicons/app
+1
lexicons/chat
+1
lexicons/chat
···
1
+
../vendor/atproto/lexicons/chat
+1
lexicons/com
+1
lexicons/com
···
1
+
../vendor/atproto/lexicons/com
+1
lexicons/tools
+1
lexicons/tools
···
1
+
../vendor/atproto/lexicons/tools
+8
-2
services/cadet/Dockerfile
+8
-2
services/cadet/Dockerfile
···
41
41
# Set up cross-compilation environment
42
42
ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc
43
43
44
+
# Force SQLx to use offline mode with workspace cache
45
+
ENV SQLX_OFFLINE=true
46
+
47
+
# Copy the .sqlx offline query cache from the build context into the cadet crate directory
48
+
COPY .sqlx ./services/cadet/.sqlx
49
+
44
50
# Debug platform detection and run build
45
51
RUN echo "DEBUG Before target.sh: TARGETPLATFORM=$TARGETPLATFORM TARGETARCH=$TARGETARCH" && \
46
52
. ./target.sh && \
47
-
touch src/main.rs && \
53
+
touch services/cadet/src/main.rs && \
48
54
echo "Building for $TARGET_ARCH" && \
49
-
cargo build --release --target $RUST_TARGET && \
55
+
cargo build --release --target $RUST_TARGET --package cadet && \
50
56
cp target/$RUST_TARGET/release/cadet target/cadet
51
57
52
58
FROM --platform=${TARGETARCH:-$BUILDPLATFORM} gcr.io/distroless/cc
-226
services/migrations/20241220000001_initial_schema.sql
-226
services/migrations/20241220000001_initial_schema.sql
···
1
-
-- Initial comprehensive schema for Teal music platform
2
-
-- Based on services/cadet/sql/base.sql
3
-
4
-
CREATE TABLE artists (
5
-
mbid UUID PRIMARY KEY,
6
-
name TEXT NOT NULL,
7
-
play_count INTEGER DEFAULT 0
8
-
);
9
-
10
-
-- releases are synonymous with 'albums'
11
-
CREATE TABLE releases (
12
-
mbid UUID PRIMARY KEY,
13
-
name TEXT NOT NULL,
14
-
play_count INTEGER DEFAULT 0
15
-
);
16
-
17
-
-- recordings are synonymous with 'tracks' BUT tracks can be in multiple releases!
18
-
CREATE TABLE recordings (
19
-
mbid UUID PRIMARY KEY,
20
-
name TEXT NOT NULL,
21
-
play_count INTEGER DEFAULT 0
22
-
);
23
-
24
-
CREATE TABLE plays (
25
-
uri TEXT PRIMARY KEY,
26
-
did TEXT NOT NULL,
27
-
rkey TEXT NOT NULL,
28
-
cid TEXT NOT NULL,
29
-
isrc TEXT,
30
-
duration INTEGER,
31
-
track_name TEXT NOT NULL,
32
-
played_time TIMESTAMP WITH TIME ZONE,
33
-
processed_time TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
34
-
release_mbid UUID,
35
-
release_name TEXT,
36
-
recording_mbid UUID,
37
-
submission_client_agent TEXT,
38
-
music_service_base_domain TEXT,
39
-
origin_url TEXT,
40
-
FOREIGN KEY (release_mbid) REFERENCES releases (mbid),
41
-
FOREIGN KEY (recording_mbid) REFERENCES recordings (mbid)
42
-
);
43
-
44
-
CREATE INDEX idx_plays_release_mbid ON plays (release_mbid);
45
-
CREATE INDEX idx_plays_recording_mbid ON plays (recording_mbid);
46
-
CREATE INDEX idx_plays_played_time ON plays (played_time);
47
-
CREATE INDEX idx_plays_did ON plays (did);
48
-
49
-
CREATE TABLE play_to_artists (
50
-
play_uri TEXT, -- references plays(uri)
51
-
artist_mbid UUID REFERENCES artists (mbid),
52
-
artist_name TEXT, -- storing here for ease of use when joining
53
-
PRIMARY KEY (play_uri, artist_mbid),
54
-
FOREIGN KEY (play_uri) REFERENCES plays (uri)
55
-
);
56
-
57
-
CREATE INDEX idx_play_to_artists_artist ON play_to_artists (artist_mbid);
58
-
59
-
-- Profiles table
60
-
CREATE TABLE profiles (
61
-
did TEXT PRIMARY KEY,
62
-
handle TEXT,
63
-
display_name TEXT,
64
-
description TEXT,
65
-
description_facets JSONB,
66
-
avatar TEXT, -- IPLD of the image, bafy...
67
-
banner TEXT,
68
-
created_at TIMESTAMP WITH TIME ZONE
69
-
);
70
-
71
-
-- User featured items table
72
-
CREATE TABLE featured_items (
73
-
did TEXT PRIMARY KEY,
74
-
mbid TEXT NOT NULL,
75
-
type TEXT NOT NULL
76
-
);
77
-
78
-
-- Statii table (status records)
79
-
CREATE TABLE statii (
80
-
uri TEXT PRIMARY KEY,
81
-
did TEXT NOT NULL,
82
-
rkey TEXT NOT NULL,
83
-
cid TEXT NOT NULL,
84
-
record JSONB NOT NULL,
85
-
indexed_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
86
-
);
87
-
88
-
CREATE INDEX idx_statii_did_rkey ON statii (did, rkey);
89
-
90
-
-- Materialized view for artists' play counts
91
-
CREATE MATERIALIZED VIEW mv_artist_play_counts AS
92
-
SELECT
93
-
a.mbid AS artist_mbid,
94
-
a.name AS artist_name,
95
-
COUNT(p.uri) AS play_count
96
-
FROM
97
-
artists a
98
-
LEFT JOIN play_to_artists pta ON a.mbid = pta.artist_mbid
99
-
LEFT JOIN plays p ON p.uri = pta.play_uri
100
-
GROUP BY
101
-
a.mbid,
102
-
a.name;
103
-
104
-
CREATE UNIQUE INDEX idx_mv_artist_play_counts ON mv_artist_play_counts (artist_mbid);
105
-
106
-
-- Materialized view for releases' play counts
107
-
CREATE MATERIALIZED VIEW mv_release_play_counts AS
108
-
SELECT
109
-
r.mbid AS release_mbid,
110
-
r.name AS release_name,
111
-
COUNT(p.uri) AS play_count
112
-
FROM
113
-
releases r
114
-
LEFT JOIN plays p ON p.release_mbid = r.mbid
115
-
GROUP BY
116
-
r.mbid,
117
-
r.name;
118
-
119
-
CREATE UNIQUE INDEX idx_mv_release_play_counts ON mv_release_play_counts (release_mbid);
120
-
121
-
-- Materialized view for recordings' play counts
122
-
CREATE MATERIALIZED VIEW mv_recording_play_counts AS
123
-
SELECT
124
-
rec.mbid AS recording_mbid,
125
-
rec.name AS recording_name,
126
-
COUNT(p.uri) AS play_count
127
-
FROM
128
-
recordings rec
129
-
LEFT JOIN plays p ON p.recording_mbid = rec.mbid
130
-
GROUP BY
131
-
rec.mbid,
132
-
rec.name;
133
-
134
-
CREATE UNIQUE INDEX idx_mv_recording_play_counts ON mv_recording_play_counts (recording_mbid);
135
-
136
-
-- Global play count materialized view
137
-
CREATE MATERIALIZED VIEW mv_global_play_count AS
138
-
SELECT
139
-
COUNT(uri) AS total_plays,
140
-
COUNT(DISTINCT did) AS unique_listeners
141
-
FROM plays;
142
-
143
-
CREATE UNIQUE INDEX idx_mv_global_play_count ON mv_global_play_count(total_plays);
144
-
145
-
-- Top artists in the last 30 days
146
-
CREATE MATERIALIZED VIEW mv_top_artists_30days AS
147
-
SELECT
148
-
a.mbid AS artist_mbid,
149
-
a.name AS artist_name,
150
-
COUNT(p.uri) AS play_count
151
-
FROM artists a
152
-
INNER JOIN play_to_artists pta ON a.mbid = pta.artist_mbid
153
-
INNER JOIN plays p ON p.uri = pta.play_uri
154
-
WHERE p.played_time >= NOW() - INTERVAL '30 days'
155
-
GROUP BY a.mbid, a.name
156
-
ORDER BY COUNT(p.uri) DESC;
157
-
158
-
-- Top releases in the last 30 days
159
-
CREATE MATERIALIZED VIEW mv_top_releases_30days AS
160
-
SELECT
161
-
r.mbid AS release_mbid,
162
-
r.name AS release_name,
163
-
COUNT(p.uri) AS play_count
164
-
FROM releases r
165
-
INNER JOIN plays p ON p.release_mbid = r.mbid
166
-
WHERE p.played_time >= NOW() - INTERVAL '30 days'
167
-
GROUP BY r.mbid, r.name
168
-
ORDER BY COUNT(p.uri) DESC;
169
-
170
-
-- Top artists for user in the last 30 days
171
-
CREATE MATERIALIZED VIEW mv_top_artists_for_user_30days AS
172
-
SELECT
173
-
prof.did,
174
-
a.mbid AS artist_mbid,
175
-
a.name AS artist_name,
176
-
COUNT(p.uri) AS play_count
177
-
FROM artists a
178
-
INNER JOIN play_to_artists pta ON a.mbid = pta.artist_mbid
179
-
INNER JOIN plays p ON p.uri = pta.play_uri
180
-
INNER JOIN profiles prof ON prof.did = p.did
181
-
WHERE p.played_time >= NOW() - INTERVAL '30 days'
182
-
GROUP BY prof.did, a.mbid, a.name
183
-
ORDER BY COUNT(p.uri) DESC;
184
-
185
-
-- Top artists for user in the last 7 days
186
-
CREATE MATERIALIZED VIEW mv_top_artists_for_user_7days AS
187
-
SELECT
188
-
prof.did,
189
-
a.mbid AS artist_mbid,
190
-
a.name AS artist_name,
191
-
COUNT(p.uri) AS play_count
192
-
FROM artists a
193
-
INNER JOIN play_to_artists pta ON a.mbid = pta.artist_mbid
194
-
INNER JOIN plays p ON p.uri = pta.play_uri
195
-
INNER JOIN profiles prof ON prof.did = p.did
196
-
WHERE p.played_time >= NOW() - INTERVAL '7 days'
197
-
GROUP BY prof.did, a.mbid, a.name
198
-
ORDER BY COUNT(p.uri) DESC;
199
-
200
-
-- Top releases for user in the last 30 days
201
-
CREATE MATERIALIZED VIEW mv_top_releases_for_user_30days AS
202
-
SELECT
203
-
prof.did,
204
-
r.mbid AS release_mbid,
205
-
r.name AS release_name,
206
-
COUNT(p.uri) AS play_count
207
-
FROM releases r
208
-
INNER JOIN plays p ON p.release_mbid = r.mbid
209
-
INNER JOIN profiles prof ON prof.did = p.did
210
-
WHERE p.played_time >= NOW() - INTERVAL '30 days'
211
-
GROUP BY prof.did, r.mbid, r.name
212
-
ORDER BY COUNT(p.uri) DESC;
213
-
214
-
-- Top releases for user in the last 7 days
215
-
CREATE MATERIALIZED VIEW mv_top_releases_for_user_7days AS
216
-
SELECT
217
-
prof.did,
218
-
r.mbid AS release_mbid,
219
-
r.name AS release_name,
220
-
COUNT(p.uri) AS play_count
221
-
FROM releases r
222
-
INNER JOIN plays p ON p.release_mbid = r.mbid
223
-
INNER JOIN profiles prof ON prof.did = p.did
224
-
WHERE p.played_time >= NOW() - INTERVAL '7 days'
225
-
GROUP BY prof.did, r.mbid, r.name
226
-
ORDER BY COUNT(p.uri) DESC;
-59
services/migrations/20241220000002_car_import_tables.sql
-59
services/migrations/20241220000002_car_import_tables.sql
···
1
-
-- CAR import functionality tables
2
-
-- For handling AT Protocol CAR file imports and processing
3
-
4
-
-- Tracks uploaded CAR files that are queued for processing
5
-
CREATE TABLE IF NOT EXISTS car_import_requests (
6
-
import_id TEXT PRIMARY KEY,
7
-
car_data_base64 TEXT NOT NULL,
8
-
status TEXT NOT NULL DEFAULT 'pending', -- pending, processing, completed, failed
9
-
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
10
-
processed_at TIMESTAMP WITH TIME ZONE,
11
-
error_message TEXT,
12
-
file_size_bytes INTEGER,
13
-
block_count INTEGER,
14
-
extracted_records_count INTEGER DEFAULT 0
15
-
);
16
-
17
-
CREATE INDEX idx_car_import_requests_status ON car_import_requests (status);
18
-
CREATE INDEX idx_car_import_requests_created_at ON car_import_requests (created_at);
19
-
20
-
-- Tracks raw IPLD blocks extracted from CAR files
21
-
CREATE TABLE IF NOT EXISTS car_blocks (
22
-
cid TEXT PRIMARY KEY,
23
-
import_id TEXT NOT NULL REFERENCES car_import_requests(import_id),
24
-
block_data BYTEA NOT NULL,
25
-
decoded_successfully BOOLEAN DEFAULT FALSE,
26
-
collection_type TEXT, -- e.g., 'fm.teal.alpha.feed.play', 'commit', etc.
27
-
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
28
-
);
29
-
30
-
CREATE INDEX idx_car_blocks_import_id ON car_blocks (import_id);
31
-
CREATE INDEX idx_car_blocks_collection_type ON car_blocks (collection_type);
32
-
33
-
-- Tracks records extracted from CAR imports that were successfully processed
34
-
CREATE TABLE IF NOT EXISTS car_extracted_records (
35
-
id SERIAL PRIMARY KEY,
36
-
import_id TEXT NOT NULL REFERENCES car_import_requests(import_id),
37
-
cid TEXT NOT NULL REFERENCES car_blocks(cid),
38
-
collection_type TEXT NOT NULL,
39
-
record_uri TEXT, -- AT URI if applicable (e.g., for play records)
40
-
synthetic_did TEXT, -- DID assigned for CAR imports (e.g., 'car-import:123')
41
-
rkey TEXT,
42
-
extracted_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
43
-
processing_notes TEXT
44
-
);
45
-
46
-
CREATE INDEX idx_car_extracted_records_import_id ON car_extracted_records (import_id);
47
-
CREATE INDEX idx_car_extracted_records_collection_type ON car_extracted_records (collection_type);
48
-
CREATE INDEX idx_car_extracted_records_record_uri ON car_extracted_records (record_uri);
49
-
50
-
-- Tracks import metadata and commit information
51
-
CREATE TABLE IF NOT EXISTS car_import_metadata (
52
-
import_id TEXT NOT NULL REFERENCES car_import_requests(import_id),
53
-
metadata_key TEXT NOT NULL,
54
-
metadata_value JSONB NOT NULL,
55
-
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
56
-
PRIMARY KEY (import_id, metadata_key)
57
-
);
58
-
59
-
CREATE INDEX idx_car_import_metadata_key ON car_import_metadata (metadata_key);
-112
services/migrations/20241220000003_artists_without_mbids.sql
-112
services/migrations/20241220000003_artists_without_mbids.sql
···
1
-
-- Migration to support artists without MusicBrainz IDs
2
-
-- This allows the system to comply with the Teal lexicon where only trackName is required
3
-
4
-
-- Add a field to plays table to store raw artist names for records without MBIDs
5
-
ALTER TABLE plays ADD COLUMN artist_names_raw JSONB;
6
-
7
-
-- Create a new artists table that doesn't require MBID as primary key
8
-
CREATE TABLE artists_extended (
9
-
id SERIAL PRIMARY KEY,
10
-
mbid UUID UNIQUE, -- Optional MusicBrainz ID
11
-
name TEXT NOT NULL,
12
-
name_normalized TEXT GENERATED ALWAYS AS (LOWER(TRIM(name))) STORED,
13
-
play_count INTEGER DEFAULT 0,
14
-
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
15
-
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
16
-
);
17
-
18
-
-- Create index for efficient lookups
19
-
CREATE INDEX idx_artists_extended_mbid ON artists_extended (mbid) WHERE mbid IS NOT NULL;
20
-
CREATE INDEX idx_artists_extended_name_normalized ON artists_extended (name_normalized);
21
-
CREATE UNIQUE INDEX idx_artists_extended_name_unique ON artists_extended (name_normalized) WHERE mbid IS NULL;
22
-
23
-
-- Create a new junction table that can handle both MBID and non-MBID artists
24
-
CREATE TABLE play_to_artists_extended (
25
-
play_uri TEXT NOT NULL REFERENCES plays(uri),
26
-
artist_id INTEGER NOT NULL REFERENCES artists_extended(id),
27
-
artist_name TEXT NOT NULL, -- Denormalized for performance
28
-
PRIMARY KEY (play_uri, artist_id)
29
-
);
30
-
31
-
CREATE INDEX idx_play_to_artists_extended_artist ON play_to_artists_extended (artist_id);
32
-
33
-
-- Migrate existing data from old tables to new structure
34
-
INSERT INTO artists_extended (mbid, name, play_count)
35
-
SELECT mbid, name, play_count FROM artists;
36
-
37
-
INSERT INTO play_to_artists_extended (play_uri, artist_id, artist_name)
38
-
SELECT
39
-
pta.play_uri,
40
-
ae.id,
41
-
pta.artist_name
42
-
FROM play_to_artists pta
43
-
JOIN artists_extended ae ON ae.mbid = pta.artist_mbid;
44
-
45
-
-- Update materialized views to use new structure
46
-
DROP MATERIALIZED VIEW IF EXISTS mv_artist_play_counts;
47
-
CREATE MATERIALIZED VIEW mv_artist_play_counts AS
48
-
SELECT
49
-
ae.id AS artist_id,
50
-
ae.mbid AS artist_mbid,
51
-
ae.name AS artist_name,
52
-
COUNT(p.uri) AS play_count
53
-
FROM
54
-
artists_extended ae
55
-
LEFT JOIN play_to_artists_extended ptae ON ae.id = ptae.artist_id
56
-
LEFT JOIN plays p ON p.uri = ptae.play_uri
57
-
GROUP BY
58
-
ae.id, ae.mbid, ae.name;
59
-
60
-
CREATE UNIQUE INDEX idx_mv_artist_play_counts_new ON mv_artist_play_counts (artist_id);
61
-
62
-
-- Update other materialized views that reference artists
63
-
DROP MATERIALIZED VIEW IF EXISTS mv_top_artists_30days;
64
-
CREATE MATERIALIZED VIEW mv_top_artists_30days AS
65
-
SELECT
66
-
ae.id AS artist_id,
67
-
ae.mbid AS artist_mbid,
68
-
ae.name AS artist_name,
69
-
COUNT(p.uri) AS play_count
70
-
FROM artists_extended ae
71
-
INNER JOIN play_to_artists_extended ptae ON ae.id = ptae.artist_id
72
-
INNER JOIN plays p ON p.uri = ptae.play_uri
73
-
WHERE p.played_time >= NOW() - INTERVAL '30 days'
74
-
GROUP BY ae.id, ae.mbid, ae.name
75
-
ORDER BY COUNT(p.uri) DESC;
76
-
77
-
DROP MATERIALIZED VIEW IF EXISTS mv_top_artists_for_user_30days;
78
-
CREATE MATERIALIZED VIEW mv_top_artists_for_user_30days AS
79
-
SELECT
80
-
prof.did,
81
-
ae.id AS artist_id,
82
-
ae.mbid AS artist_mbid,
83
-
ae.name AS artist_name,
84
-
COUNT(p.uri) AS play_count
85
-
FROM artists_extended ae
86
-
INNER JOIN play_to_artists_extended ptae ON ae.id = ptae.artist_id
87
-
INNER JOIN plays p ON p.uri = ptae.play_uri
88
-
INNER JOIN profiles prof ON prof.did = p.did
89
-
WHERE p.played_time >= NOW() - INTERVAL '30 days'
90
-
GROUP BY prof.did, ae.id, ae.mbid, ae.name
91
-
ORDER BY COUNT(p.uri) DESC;
92
-
93
-
DROP MATERIALIZED VIEW IF EXISTS mv_top_artists_for_user_7days;
94
-
CREATE MATERIALIZED VIEW mv_top_artists_for_user_7days AS
95
-
SELECT
96
-
prof.did,
97
-
ae.id AS artist_id,
98
-
ae.mbid AS artist_mbid,
99
-
ae.name AS artist_name,
100
-
COUNT(p.uri) AS play_count
101
-
FROM artists_extended ae
102
-
INNER JOIN play_to_artists_extended ptae ON ae.id = ptae.artist_id
103
-
INNER JOIN plays p ON p.uri = ptae.play_uri
104
-
INNER JOIN profiles prof ON prof.did = p.did
105
-
WHERE p.played_time >= NOW() - INTERVAL '7 days'
106
-
GROUP BY prof.did, ae.id, ae.mbid, ae.name
107
-
ORDER BY COUNT(p.uri) DESC;
108
-
109
-
-- Comment explaining the migration strategy
110
-
COMMENT ON TABLE artists_extended IS 'Extended artists table that supports both MusicBrainz and non-MusicBrainz artists. Uses serial ID as primary key with optional MBID.';
111
-
COMMENT ON TABLE play_to_artists_extended IS 'Junction table linking plays to artists using the new artists_extended table structure.';
112
-
COMMENT ON COLUMN plays.artist_names_raw IS 'Raw artist names as JSON array for plays without MusicBrainz data, used as fallback when artist relationships cannot be established.';
-76
services/migrations/20241220000004_synthetic_mbids.sql
-76
services/migrations/20241220000004_synthetic_mbids.sql
···
1
-
-- Migration to support synthetic MBIDs for artists without MusicBrainz data
2
-
-- This ensures all artists have some form of ID while maintaining uniqueness
3
-
4
-
-- Enable UUID extension for v5 UUID generation
5
-
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
6
-
7
-
-- Add a column to track MBID type (musicbrainz, synthetic, unknown)
8
-
ALTER TABLE artists_extended ADD COLUMN mbid_type TEXT DEFAULT 'unknown' NOT NULL;
9
-
10
-
-- Add check constraint for valid MBID types
11
-
ALTER TABLE artists_extended ADD CONSTRAINT chk_mbid_type
12
-
CHECK (mbid_type IN ('musicbrainz', 'synthetic', 'unknown'));
13
-
14
-
-- Update existing records to set proper MBID type
15
-
UPDATE artists_extended SET mbid_type = 'musicbrainz' WHERE mbid IS NOT NULL;
16
-
17
-
-- Drop the unique constraint on name_normalized for null MBIDs since we'll handle duplicates differently
18
-
DROP INDEX IF EXISTS idx_artists_extended_name_unique;
19
-
20
-
-- Add index for efficient querying by MBID type
21
-
CREATE INDEX idx_artists_extended_mbid_type ON artists_extended (mbid_type);
22
-
23
-
-- Create a view to easily work with different artist types
24
-
CREATE VIEW artists_with_type AS
25
-
SELECT
26
-
id,
27
-
mbid,
28
-
name,
29
-
mbid_type,
30
-
play_count,
31
-
created_at,
32
-
updated_at,
33
-
-- For synthetic MBIDs, we can show the source name used for generation
34
-
CASE
35
-
WHEN mbid_type = 'synthetic' THEN 'Generated from: ' || name
36
-
WHEN mbid_type = 'musicbrainz' THEN 'MusicBrainz: ' || mbid::text
37
-
ELSE 'No MBID available'
38
-
END as mbid_info
39
-
FROM artists_extended;
40
-
41
-
-- Update materialized views to include MBID type information
42
-
DROP MATERIALIZED VIEW IF EXISTS mv_artist_play_counts;
43
-
CREATE MATERIALIZED VIEW mv_artist_play_counts AS
44
-
SELECT
45
-
ae.id AS artist_id,
46
-
ae.mbid AS artist_mbid,
47
-
ae.name AS artist_name,
48
-
ae.mbid_type,
49
-
COUNT(p.uri) AS play_count
50
-
FROM
51
-
artists_extended ae
52
-
LEFT JOIN play_to_artists_extended ptae ON ae.id = ptae.artist_id
53
-
LEFT JOIN plays p ON p.uri = ptae.play_uri
54
-
GROUP BY
55
-
ae.id, ae.mbid, ae.name, ae.mbid_type;
56
-
57
-
CREATE UNIQUE INDEX idx_mv_artist_play_counts_with_type ON mv_artist_play_counts (artist_id);
58
-
59
-
-- Add comments explaining the synthetic MBID system
60
-
COMMENT ON COLUMN artists_extended.mbid_type IS 'Type of MBID: musicbrainz (real), synthetic (generated), or unknown (legacy data)';
61
-
COMMENT ON COLUMN artists_extended.mbid IS 'MusicBrainz ID (for musicbrainz type) or synthetic UUID (for synthetic type)';
62
-
COMMENT ON VIEW artists_with_type IS 'View that provides human-readable information about artist MBID sources';
63
-
64
-
-- Add a function to generate synthetic MBIDs
65
-
CREATE OR REPLACE FUNCTION generate_synthetic_mbid(artist_name TEXT) RETURNS UUID AS $$
66
-
DECLARE
67
-
namespace_uuid UUID := '6ba7b810-9dad-11d1-80b4-00c04fd430c8'; -- DNS namespace
68
-
result_uuid UUID;
69
-
BEGIN
70
-
-- Generate deterministic UUID v5 based on artist name
71
-
SELECT uuid_generate_v5(namespace_uuid, artist_name) INTO result_uuid;
72
-
RETURN result_uuid;
73
-
END;
74
-
$$ LANGUAGE plpgsql IMMUTABLE;
75
-
76
-
COMMENT ON FUNCTION generate_synthetic_mbid IS 'Generates a deterministic UUID v5 for artist names without MusicBrainz IDs';
-101
services/migrations/20241220000005_fuzzy_matching.sql
-101
services/migrations/20241220000005_fuzzy_matching.sql
···
1
-
-- Migration to add fuzzy text matching capabilities
2
-
-- This enables better artist name matching using trigram similarity
3
-
4
-
-- Enable pg_trgm extension for trigram similarity matching
5
-
CREATE EXTENSION IF NOT EXISTS pg_trgm;
6
-
7
-
-- Create indexes for efficient trigram matching on artist names
8
-
CREATE INDEX idx_artists_extended_name_trgm ON artists_extended USING gin (name gin_trgm_ops);
9
-
CREATE INDEX idx_artists_extended_name_normalized_trgm ON artists_extended USING gin (name_normalized gin_trgm_ops);
10
-
11
-
-- Create a function to calculate comprehensive artist similarity
12
-
CREATE OR REPLACE FUNCTION calculate_artist_similarity(
13
-
input_name TEXT,
14
-
existing_name TEXT,
15
-
input_album TEXT DEFAULT NULL,
16
-
existing_album TEXT DEFAULT NULL
17
-
) RETURNS FLOAT AS $$
18
-
DECLARE
19
-
name_similarity FLOAT;
20
-
album_similarity FLOAT := 0.0;
21
-
final_score FLOAT;
22
-
BEGIN
23
-
-- Calculate trigram similarity for artist names
24
-
name_similarity := similarity(LOWER(TRIM(input_name)), LOWER(TRIM(existing_name)));
25
-
26
-
-- Boost for exact matches after normalization
27
-
IF LOWER(TRIM(regexp_replace(input_name, '[^a-zA-Z0-9\s]', '', 'g'))) =
28
-
LOWER(TRIM(regexp_replace(existing_name, '[^a-zA-Z0-9\s]', '', 'g'))) THEN
29
-
name_similarity := GREATEST(name_similarity, 0.95);
30
-
END IF;
31
-
32
-
-- Factor in album similarity if both are provided
33
-
IF input_album IS NOT NULL AND existing_album IS NOT NULL THEN
34
-
album_similarity := similarity(LOWER(TRIM(input_album)), LOWER(TRIM(existing_album)));
35
-
-- Weight: 80% name, 20% album
36
-
final_score := (name_similarity * 0.8) + (album_similarity * 0.2);
37
-
ELSE
38
-
final_score := name_similarity;
39
-
END IF;
40
-
41
-
RETURN final_score;
42
-
END;
43
-
$$ LANGUAGE plpgsql IMMUTABLE;
44
-
45
-
-- Create a view for fuzzy artist matching with confidence scores
46
-
CREATE VIEW fuzzy_artist_matches AS
47
-
SELECT DISTINCT
48
-
ae1.id as query_artist_id,
49
-
ae1.name as query_artist_name,
50
-
ae1.mbid_type as query_mbid_type,
51
-
ae2.id as match_artist_id,
52
-
ae2.name as match_artist_name,
53
-
ae2.mbid as match_mbid,
54
-
ae2.mbid_type as match_mbid_type,
55
-
similarity(LOWER(TRIM(ae1.name)), LOWER(TRIM(ae2.name))) as name_similarity,
56
-
CASE
57
-
WHEN ae2.mbid_type = 'musicbrainz' THEN 'upgrade_to_mb'
58
-
WHEN ae1.mbid_type = 'musicbrainz' AND ae2.mbid_type = 'synthetic' THEN 'consolidate_to_mb'
59
-
ELSE 'merge_synthetic'
60
-
END as match_action
61
-
FROM artists_extended ae1
62
-
CROSS JOIN artists_extended ae2
63
-
WHERE ae1.id != ae2.id
64
-
AND similarity(LOWER(TRIM(ae1.name)), LOWER(TRIM(ae2.name))) > 0.8
65
-
AND (
66
-
ae1.mbid_type = 'synthetic' OR ae2.mbid_type = 'musicbrainz'
67
-
);
68
-
69
-
-- Catalog descriptions for the extension, index, function and view used by
-- fuzzy artist matching.
COMMENT ON EXTENSION pg_trgm
    IS 'Trigram extension for fuzzy text matching';
COMMENT ON INDEX idx_artists_extended_name_trgm
    IS 'GIN index for trigram similarity on artist names';
COMMENT ON FUNCTION calculate_artist_similarity
    IS 'Calculates similarity score between artists considering name and optional album context';
COMMENT ON VIEW fuzzy_artist_matches
    IS 'Shows potential artist matches with confidence scores and recommended actions';
74
-
75
-
-- Function: suggest_artist_consolidations(min_similarity)
--
-- Returns the rows of fuzzy_artist_matches whose match_action is
-- 'upgrade_to_mb' and whose name_similarity is at least min_similarity
-- (default 0.9), together with the number of play_to_artists_extended rows
-- attached to each side of the pair. Rows are ordered by similarity, then by
-- the query artist's play count, both descending.
--
-- RETURN QUERY requires each query column to have exactly the declared result
-- type. pg_trgm's similarity() yields REAL while similarity_score is declared
-- FLOAT (double precision), so the columns are cast explicitly; without the
-- cast the call fails with "structure of query does not match function result
-- type". The text columns are cast as well so the function keeps working if
-- the underlying name columns are VARCHAR rather than TEXT.
CREATE OR REPLACE FUNCTION suggest_artist_consolidations(min_similarity FLOAT DEFAULT 0.9)
RETURNS TABLE(
    action TEXT,
    synthetic_artist TEXT,
    target_artist TEXT,
    similarity_score FLOAT,
    synthetic_plays INTEGER,
    target_plays INTEGER
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        fam.match_action::TEXT       AS action,
        fam.query_artist_name::TEXT  AS synthetic_artist,
        fam.match_artist_name::TEXT  AS target_artist,
        fam.name_similarity::FLOAT   AS similarity_score,
        (SELECT COUNT(*)::INTEGER
           FROM play_to_artists_extended ptae
          WHERE ptae.artist_id = fam.query_artist_id) AS synthetic_plays,
        (SELECT COUNT(*)::INTEGER
           FROM play_to_artists_extended ptae
          WHERE ptae.artist_id = fam.match_artist_id) AS target_plays
    FROM fuzzy_artist_matches fam
    WHERE fam.name_similarity >= min_similarity
      AND fam.match_action = 'upgrade_to_mb'
    ORDER BY fam.name_similarity DESC, synthetic_plays DESC;
END;
$$ LANGUAGE plpgsql;

COMMENT ON FUNCTION suggest_artist_consolidations IS 'Returns suggestions for consolidating synthetic artists with MusicBrainz artists based on similarity';
-138
services/migrations/20241220000006_discriminant_fields.sql
-138
services/migrations/20241220000006_discriminant_fields.sql
···
1
-
-- Migration to add discriminant fields for track and release variants
2
-
-- This enables proper handling of different versions while maintaining grouping capabilities
3
-
4
-
-- Discriminant columns: nullable TEXT labels on plays (one for the track, one
-- for the release), on releases and on recordings.
ALTER TABLE plays
    ADD COLUMN track_discriminant TEXT,
    ADD COLUMN release_discriminant TEXT;

ALTER TABLE releases ADD COLUMN discriminant TEXT;

ALTER TABLE recordings ADD COLUMN discriminant TEXT;

-- Single-column indexes for filtering on a discriminant.
CREATE INDEX idx_plays_track_discriminant   ON plays (track_discriminant);
CREATE INDEX idx_plays_release_discriminant ON plays (release_discriminant);
CREATE INDEX idx_releases_discriminant      ON releases (discriminant);
CREATE INDEX idx_recordings_discriminant    ON recordings (discriminant);

-- Composite indexes for grouping plays by name plus discriminant.
CREATE INDEX idx_plays_track_name_discriminant   ON plays (track_name, track_discriminant);
CREATE INDEX idx_plays_release_name_discriminant ON plays (release_name, release_discriminant);
23
-
24
-
-- Recreate the play-count materialized views so each row also carries the
-- discriminant of its release / recording.

-- One row per release: mbid, name, discriminant and number of joined plays
-- (0 when no play references the release, because of the LEFT JOIN).
DROP MATERIALIZED VIEW IF EXISTS mv_release_play_counts;
CREATE MATERIALIZED VIEW mv_release_play_counts AS
SELECT
    rel.mbid         AS release_mbid,
    rel.name         AS release_name,
    rel.discriminant AS release_discriminant,
    COUNT(pl.uri)    AS play_count
FROM releases rel
LEFT JOIN plays pl
       ON pl.release_mbid = rel.mbid
GROUP BY rel.mbid, rel.name, rel.discriminant;

CREATE UNIQUE INDEX idx_mv_release_play_counts_discriminant ON mv_release_play_counts (release_mbid);

-- One row per recording, built the same way.
DROP MATERIALIZED VIEW IF EXISTS mv_recording_play_counts;
CREATE MATERIALIZED VIEW mv_recording_play_counts AS
SELECT
    rc.mbid         AS recording_mbid,
    rc.name         AS recording_name,
    rc.discriminant AS recording_discriminant,
    COUNT(pl.uri)   AS play_count
FROM recordings rc
LEFT JOIN plays pl
       ON pl.recording_mbid = rc.mbid
GROUP BY rc.mbid, rc.name, rc.discriminant;

CREATE UNIQUE INDEX idx_mv_recording_play_counts_discriminant ON mv_recording_play_counts (recording_mbid);
54
-
55
-
-- Variant analysis views over plays.

-- track_variants: one row per (track_name, track_discriminant) with the number
-- of plays, distinct listeners (did) and distinct recording MBIDs.
CREATE VIEW track_variants AS
SELECT
    p.track_name,
    p.track_discriminant,
    COUNT(*)                         AS play_count,
    COUNT(DISTINCT p.did)            AS unique_listeners,
    COUNT(DISTINCT p.recording_mbid) AS unique_recordings
FROM plays p
WHERE p.track_name IS NOT NULL
GROUP BY p.track_name, p.track_discriminant
ORDER BY p.track_name, play_count DESC;

-- release_variants: the same aggregation keyed by
-- (release_name, release_discriminant), counting distinct release MBIDs.
CREATE VIEW release_variants AS
SELECT
    p.release_name,
    p.release_discriminant,
    COUNT(*)                       AS play_count,
    COUNT(DISTINCT p.did)          AS unique_listeners,
    COUNT(DISTINCT p.release_mbid) AS unique_releases
FROM plays p
WHERE p.release_name IS NOT NULL
GROUP BY p.release_name, p.release_discriminant
ORDER BY p.release_name, play_count DESC;
79
-
80
-
-- Function: extract_discriminant(name_text)
--
-- Returns the trimmed contents of the first (...), [...] or {...} group in
-- name_text that contains one of the variant keywords, or NULL when there is
-- no such group (including NULL input). Parentheses are tried first, then
-- square brackets, then braces.
--
-- Fixes relative to the previous version:
--   * Matching is case-insensitive ((?i)), in line with get_base_name(), which
--     already removes these groups with the 'i' flag. Before, "(Live)" was
--     removed by get_base_name() but never extracted here.
--   * The text after the keyword is matched with a negated class instead of
--     '.*?'. PostgreSQL decides greediness for the regular expression as a
--     whole, so '.*?' still ran to the LAST closing delimiter and
--     "Song (live) (remix)" yielded "live) (remix".
-- NOTE(review): a group that itself contains nested delimiters, e.g.
-- "(Live (2020))", is now cut at the first closing delimiter.
CREATE OR REPLACE FUNCTION extract_discriminant(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    -- Keywords that mark a bracketed group as variant information.
    keywords CONSTANT TEXT := 'deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus';
    discriminant_patterns TEXT[] := ARRAY[
        '(?i)\(([^)]*(?:' || keywords || ')[^)]*)\)',
        '(?i)\[([^]]*(?:' || keywords || ')[^]]*)\]',
        '(?i)\{([^}]*(?:' || keywords || ')[^}]*)\}'
    ];
    pattern TEXT;
    match_result TEXT;
BEGIN
    -- Try each pattern in order; the first non-blank capture wins.
    FOREACH pattern IN ARRAY discriminant_patterns
    LOOP
        -- substring(... FROM pattern) returns the first capturing group.
        match_result := substring(name_text FROM pattern);
        IF match_result IS NOT NULL AND length(trim(match_result)) > 0 THEN
            RETURN trim(match_result);
        END IF;
    END LOOP;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
103
-
104
-
-- Function: get_base_name(name_text)
--
-- Returns name_text with every (...), [...] or {...} group that contains a
-- variant keyword removed (case-insensitively), then trimmed and with runs of
-- whitespace collapsed to a single space. NULL input yields NULL.
--
-- Fix relative to the previous version: the text after the keyword is matched
-- with a negated class instead of '.*?'. PostgreSQL decides greediness for the
-- regular expression as a whole (here greedy, because of the leading '\s*'),
-- so '.*?' ran to the LAST closing delimiter and
-- "Intro (Live) Outro (B)" collapsed to "Intro".
-- NOTE(review): a group containing nested delimiters, e.g. "(Live (2020))",
-- is now cut at the first closing delimiter.
CREATE OR REPLACE FUNCTION get_base_name(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    -- Keywords that mark a bracketed group as variant information.
    keywords CONSTANT TEXT := 'deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus';
    cleanup_patterns TEXT[] := ARRAY[
        '\s*\([^)]*(?:' || keywords || ')[^)]*\)\s*',
        '\s*\[[^]]*(?:' || keywords || ')[^]]*\]\s*',
        '\s*\{[^}]*(?:' || keywords || ')[^}]*\}\s*'
    ];
    pattern TEXT;
    result_text TEXT := name_text;
BEGIN
    -- Replace each matching group (and the whitespace around it) by one space.
    FOREACH pattern IN ARRAY cleanup_patterns
    LOOP
        result_text := regexp_replace(result_text, pattern, ' ', 'gi');
    END LOOP;

    -- Clean up extra whitespace
    result_text := regexp_replace(trim(result_text), '\s+', ' ', 'g');

    RETURN result_text;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
127
-
128
-
-- Catalog descriptions for the discriminant columns, the variant views and the
-- two helper functions.
COMMENT ON COLUMN plays.track_discriminant
    IS 'Distinguishing information for track variants (e.g., "Acoustic Version", "Live at Wembley", "Radio Edit")';
COMMENT ON COLUMN plays.release_discriminant
    IS 'Distinguishing information for release variants (e.g., "Deluxe Edition", "Remastered", "2023 Remaster")';
COMMENT ON COLUMN releases.discriminant
    IS 'Distinguishing information for release variants to enable proper grouping';
COMMENT ON COLUMN recordings.discriminant
    IS 'Distinguishing information for recording variants to enable proper grouping';

COMMENT ON VIEW track_variants
    IS 'Shows all variants of tracks with their play counts and unique listeners';
COMMENT ON VIEW release_variants
    IS 'Shows all variants of releases with their play counts and unique listeners';

COMMENT ON FUNCTION extract_discriminant
    IS 'Extracts discriminant information from track/release names for migration purposes';
COMMENT ON FUNCTION get_base_name
    IS 'Returns the base name without discriminant information for grouping purposes';
-276
services/migrations/20241220000007_enhanced_discriminant_extraction.sql
-276
services/migrations/20241220000007_enhanced_discriminant_extraction.sql
···
1
-
-- Enhanced discriminant extraction with comprehensive edition/version patterns
2
-
-- This migration improves the auto-population of discriminants for better metadata handling
3
-
4
-
-- Drop existing functions to replace them with enhanced versions
DROP FUNCTION IF EXISTS extract_discriminant(TEXT);
DROP FUNCTION IF EXISTS get_base_name(TEXT);

-- Function: extract_discriminant(name_text)
--
-- Returns the first piece of variant information found in name_text, or NULL
-- when the input is NULL/blank or nothing matches. Patterns are tried in this
-- order, and within each delimiter in the order keyword list, year + release
-- word, volume/part/disc number, featuring/with/vs, soundtrack source:
--   1. (...) groups   2. [...] groups   3. {...} groups
--   4. a trailing "- ..." suffix   5. a trailing ": ..." suffix
-- The capture is trimmed and stripped of leading/trailing non-word characters.
--
-- Fixes relative to the previous version:
--   * Matching is case-insensitive ((?i)); the lower-case keyword lists never
--     matched "Deluxe Edition", although get_base_name() removes it ('gi').
--   * Inside delimiters the text after the keyword is matched with a negated
--     class instead of '.*?'. PostgreSQL decides greediness for the regular
--     expression as a whole, so '.*?' ran to the LAST closing delimiter and
--     "Song (live) (remix)" yielded "live) (remix".
--   * The dash/colon year patterns captured only the digits ("2011"); they now
--     capture the whole suffix ("2011 Mix").
--   * The keyword lists are written once and the patterns are assembled from
--     them (the dash and colon lists are prefixes of the full list).
-- NOTE(review): keywords are matched as substrings (e.g. "ep" inside "Keep");
-- that pre-existing behavior is unchanged.
-- NOTE(review): a group containing nested delimiters, e.g. "(Live (2020))",
-- is now cut at the first closing delimiter.
CREATE OR REPLACE FUNCTION extract_discriminant(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    -- Keywords accepted after a colon.
    kw_core CONSTANT TEXT := 'deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive';
    -- Keywords accepted after a dash.
    kw_media CONSTANT TEXT := kw_core || '|digital|vinyl|cd|dvd|blu-ray';
    -- Keywords accepted inside (), [] and {}.
    kw_all CONSTANT TEXT := kw_media || '|hdtv|web|retail|promo|single|ep|lp|maxi|mini|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie';
    -- Two- or four-digit year followed by a release word.
    year_kw CONSTANT TEXT := '(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)';
    -- What a delimited group must contain, in priority order.
    group_fragments CONSTANT TEXT[] := ARRAY[
        '(?:' || kw_all || ')',
        year_kw,
        '(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+',
        '(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+',
        '(?:from|soundtrack|ost|score|theme)'
    ];
    openers CONSTANT TEXT[] := ARRAY['\(', '\[', '\{'];
    closers CONSTANT TEXT[] := ARRAY['\)', '\]', '\}'];
    -- "Anything but the closing delimiter", per delimiter kind.
    fillers CONSTANT TEXT[] := ARRAY['[^)]*', '[^]]*', '[^}]*'];

    discriminant_patterns TEXT[] := ARRAY[]::TEXT[];
    fragment TEXT;
    pattern TEXT;
    match_result TEXT;
BEGIN
    -- Return early if input is null or empty
    IF name_text IS NULL OR trim(name_text) = '' THEN
        RETURN NULL;
    END IF;

    -- Delimited groups: all parenthesis patterns, then brackets, then braces.
    FOR idx IN 1..array_length(openers, 1)
    LOOP
        FOREACH fragment IN ARRAY group_fragments
        LOOP
            discriminant_patterns := array_append(
                discriminant_patterns,
                '(?i)' || openers[idx] || '(' || fillers[idx] || fragment || fillers[idx] || ')' || closers[idx]
            );
        END LOOP;
    END LOOP;

    -- Trailing dash and colon suffixes, anchored at the end of the name.
    discriminant_patterns := discriminant_patterns || ARRAY[
        '(?i)[-–—]\s*([^-–—]*(?:' || kw_media || ').*)$',
        '(?i)[-–—]\s*(' || year_kw || '.*)$',
        '(?i):\s*([^:]*(?:' || kw_core || ').*)$',
        '(?i):\s*(' || year_kw || '.*)$'
    ];

    -- Try each pattern to find discriminant information
    FOREACH pattern IN ARRAY discriminant_patterns
    LOOP
        -- substring(... FROM pattern) returns the first capturing group.
        match_result := substring(name_text FROM pattern COLLATE "C");
        IF match_result IS NOT NULL AND length(trim(match_result)) > 0 THEN
            match_result := trim(match_result);
            -- Remove leading/trailing punctuation
            match_result := regexp_replace(match_result, '^[^\w]+|[^\w]+$', '', 'g');
            -- Skip captures that were nothing but punctuation.
            IF length(trim(match_result)) > 0 THEN
                RETURN match_result;
            END IF;
        END IF;
    END LOOP;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
70
-
71
-
-- Function: get_base_name(name_text)
--
-- Returns name_text with variant information removed:
--   * every (...), [...] or {...} group containing a keyword, a year + release
--     word, a volume/part/disc number, a featuring/with/vs clause or a
--     soundtrack source (case-insensitive);
--   * a trailing "- ..." or ": ..." suffix containing a keyword or a
--     year + release word.
-- Whitespace is then collapsed, one trailing , ; : or dash is stripped, and the
-- result is trimmed. NULL or blank input is returned unchanged, and the
-- original name is returned if the cleanup would leave an empty string.
--
-- Fixes relative to the previous version:
--   * Inside delimiters the text after the keyword is matched with a negated
--     class instead of '.*?'. PostgreSQL decides greediness for the regular
--     expression as a whole (greedy here, because of the leading '\s*'), so
--     '.*?' ran to the LAST closing delimiter and "Intro (Live) Outro (B)"
--     collapsed to "Intro".
--   * The keyword lists are written once and the patterns assembled from them.
-- NOTE(review): a group containing nested delimiters, e.g. "(Live (2020))",
-- is now cut at the first closing delimiter.
CREATE OR REPLACE FUNCTION get_base_name(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    -- Keywords accepted after a colon.
    kw_core CONSTANT TEXT := 'deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive';
    -- Keywords accepted after a dash.
    kw_media CONSTANT TEXT := kw_core || '|digital|vinyl|cd|dvd|blu-ray';
    -- Keywords accepted inside (), [] and {}.
    kw_all CONSTANT TEXT := kw_media || '|hdtv|web|retail|promo|single|ep|lp|maxi|mini|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie';
    -- Two- or four-digit year followed by a release word.
    year_kw CONSTANT TEXT := '(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)';
    -- What a delimited group must contain to be removed.
    group_fragments CONSTANT TEXT[] := ARRAY[
        '(?:' || kw_all || ')',
        year_kw,
        '(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+',
        '(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+',
        '(?:from|soundtrack|ost|score|theme)'
    ];
    openers CONSTANT TEXT[] := ARRAY['\(', '\[', '\{'];
    closers CONSTANT TEXT[] := ARRAY['\)', '\]', '\}'];
    -- "Anything but the closing delimiter", per delimiter kind.
    fillers CONSTANT TEXT[] := ARRAY['[^)]*', '[^]]*', '[^}]*'];

    cleanup_patterns TEXT[] := ARRAY[]::TEXT[];
    fragment TEXT;
    pattern TEXT;
    result_text TEXT := name_text;
BEGIN
    -- Return early if input is null or empty
    IF name_text IS NULL OR trim(name_text) = '' THEN
        RETURN name_text;
    END IF;

    -- Delimited groups: parentheses, then brackets, then braces.
    FOR idx IN 1..array_length(openers, 1)
    LOOP
        FOREACH fragment IN ARRAY group_fragments
        LOOP
            cleanup_patterns := array_append(
                cleanup_patterns,
                '\s*' || openers[idx] || fillers[idx] || fragment || fillers[idx] || closers[idx] || '\s*'
            );
        END LOOP;
    END LOOP;

    -- Trailing dash and colon suffixes, anchored at the end of the name.
    cleanup_patterns := cleanup_patterns || ARRAY[
        '\s*[-–—]\s*[^-–—]*(?:' || kw_media || ').*$',
        '\s*[-–—]\s*' || year_kw || '.*$',
        '\s*:\s*[^:]*(?:' || kw_core || ').*$',
        '\s*:\s*' || year_kw || '.*$'
    ];

    -- Replace every match by a single space ('g' = all matches, 'i' = ignore case).
    FOREACH pattern IN ARRAY cleanup_patterns
    LOOP
        result_text := regexp_replace(result_text, pattern, ' ', 'gi');
    END LOOP;

    -- Clean up extra whitespace and normalize
    result_text := regexp_replace(trim(result_text), '\s+', ' ', 'g');

    -- Remove trailing punctuation that might be left after removal
    result_text := regexp_replace(result_text, '[,;:\-–—]\s*$', '', 'g');
    result_text := trim(result_text);

    -- Ensure we don't return an empty string
    IF length(result_text) = 0 THEN
        RETURN name_text;
    END IF;

    RETURN result_text;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
135
-
136
-
-- Function: extract_edition_discriminant(name_text)
--
-- Returns the first edition/version/remaster marker found in name_text, or
-- NULL when the input is NULL/blank or nothing matches. For each of the words
-- "edition", "version" and "remaster" (in that order) it tries a (...) group,
-- a [...] group, a {...} group, a trailing "- ..." suffix and a trailing
-- ": ..." suffix; finally it tries delimited groups holding a two- or
-- four-digit year followed by a release word. The capture is trimmed and
-- stripped of leading/trailing non-word characters.
--
-- Fix relative to the previous version: every pattern is now applied
-- case-insensitively by prefixing it with (?i). The patterns are lower-case,
-- so names such as "Album (Deluxe Edition)" previously never matched.
CREATE OR REPLACE FUNCTION extract_edition_discriminant(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    -- Focused patterns for edition/version extraction
    edition_patterns TEXT[] := ARRAY[
        -- Edition patterns
        '\(([^)]*edition[^)]*)\)',
        '\[([^]]*edition[^]]*)\]',
        '\{([^}]*edition[^}]*)\}',
        '[-–—]\s*([^-–—]*edition[^-–—]*)$',
        ':\s*([^:]*edition[^:]*)$',

        -- Version patterns
        '\(([^)]*version[^)]*)\)',
        '\[([^]]*version[^]]*)\]',
        '\{([^}]*version[^}]*)\}',
        '[-–—]\s*([^-–—]*version[^-–—]*)$',
        ':\s*([^:]*version[^:]*)$',

        -- Remaster patterns
        '\(([^)]*remaster[^)]*)\)',
        '\[([^]]*remaster[^]]*)\]',
        '\{([^}]*remaster[^}]*)\}',
        '[-–—]\s*([^-–—]*remaster[^-–—]*)$',
        ':\s*([^:]*remaster[^:]*)$',

        -- Year-based patterns
        '\(([^)]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^)]*)\)',
        '\[([^]]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^]]*)\]',
        '\{([^}]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^}]*)\}'
    ];

    pattern TEXT;
    match_result TEXT;
BEGIN
    -- Return early if input is null or empty
    IF name_text IS NULL OR trim(name_text) = '' THEN
        RETURN NULL;
    END IF;

    -- Try the patterns in order; the first usable capture wins.
    FOREACH pattern IN ARRAY edition_patterns
    LOOP
        -- '(?i)' switches the whole pattern to case-insensitive matching;
        -- substring(... FROM pattern) returns the first capturing group.
        match_result := substring(name_text FROM ('(?i)' || pattern) COLLATE "C");
        IF match_result IS NOT NULL AND length(trim(match_result)) > 0 THEN
            match_result := trim(match_result);
            -- Strip leading/trailing non-word characters.
            match_result := regexp_replace(match_result, '^[^\w]+|[^\w]+$', '', 'g');
            IF length(trim(match_result)) > 0 THEN
                RETURN match_result;
            END IF;
        END IF;
    END LOOP;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
192
-
193
-
-- Backfill: fill each empty discriminant column with whatever
-- extract_discriminant() finds in the corresponding name. Rows that already
-- have a discriminant, or whose name yields nothing, are left untouched.

-- recordings.discriminant <- recordings.name
UPDATE recordings
   SET discriminant = extract_discriminant(name)
 WHERE discriminant IS NULL
   AND extract_discriminant(name) IS NOT NULL;

-- releases.discriminant <- releases.name
UPDATE releases
   SET discriminant = extract_discriminant(name)
 WHERE discriminant IS NULL
   AND extract_discriminant(name) IS NOT NULL;

-- plays.track_discriminant <- plays.track_name
UPDATE plays
   SET track_discriminant = extract_discriminant(track_name)
 WHERE track_discriminant IS NULL
   AND extract_discriminant(track_name) IS NOT NULL;

-- plays.release_discriminant <- plays.release_name (only where a release name exists)
UPDATE plays
   SET release_discriminant = extract_discriminant(release_name)
 WHERE release_discriminant IS NULL
   AND release_name IS NOT NULL
   AND extract_discriminant(release_name) IS NOT NULL;
216
-
217
-
-- Composite (name, discriminant) indexes on recordings and releases; created
-- only if they do not exist yet.
CREATE INDEX IF NOT EXISTS idx_recordings_name_discriminant
    ON recordings (name, discriminant);
CREATE INDEX IF NOT EXISTS idx_releases_name_discriminant
    ON releases (name, discriminant);

-- Catalog descriptions for the three extraction helpers.
COMMENT ON FUNCTION extract_discriminant
    IS 'Enhanced discriminant extraction supporting comprehensive edition/version patterns including parentheses, brackets, braces, dashes, and colons';
COMMENT ON FUNCTION get_base_name
    IS 'Enhanced base name extraction removing comprehensive discriminant patterns to enable proper grouping';
COMMENT ON FUNCTION extract_edition_discriminant
    IS 'Specialized function for extracting edition and version discriminants with focused patterns';
225
-
226
-
-- View: discriminant_analysis
-- For every named recording and release, shows the stored discriminant next to
-- what get_base_name(), extract_discriminant() and
-- extract_edition_discriminant() compute from the name. The table_name column
-- tells which table the row came from.
CREATE OR REPLACE VIEW discriminant_analysis AS
SELECT
    'recordings'                           AS table_name,
    rc.name                                AS original_name,
    rc.discriminant,
    get_base_name(rc.name)                 AS base_name,
    extract_discriminant(rc.name)          AS extracted_discriminant,
    extract_edition_discriminant(rc.name)  AS edition_discriminant
FROM recordings rc
WHERE rc.name IS NOT NULL
UNION ALL
SELECT
    'releases'                             AS table_name,
    rl.name                                AS original_name,
    rl.discriminant,
    get_base_name(rl.name)                 AS base_name,
    extract_discriminant(rl.name)          AS extracted_discriminant,
    extract_edition_discriminant(rl.name)  AS edition_discriminant
FROM releases rl
WHERE rl.name IS NOT NULL;

COMMENT ON VIEW discriminant_analysis IS 'Analysis view showing discriminant extraction results for quality assessment and debugging';
249
-
250
-
-- Refresh materialized views to include discriminant information
REFRESH MATERIALIZED VIEW mv_release_play_counts;
REFRESH MATERIALIZED VIEW mv_recording_play_counts;

-- View: discriminant_stats
-- One row for recordings and one for releases with:
--   total_count              number of rows in the table
--   with_discriminant        rows whose discriminant is set
--   extractable_discriminant rows without a discriminant for which
--                            extract_discriminant(name) finds one
--   discriminant_percentage  with_discriminant as a percentage of total_count,
--                            rounded to 2 decimals
--
-- Fix: the percentage divides by NULLIF(COUNT(*), 0). An aggregate without
-- GROUP BY still returns one row for an empty table, so dividing by COUNT(*)
-- raised "division by zero" whenever recordings or releases was empty; the
-- percentage is now NULL in that case.
CREATE OR REPLACE VIEW discriminant_stats AS
SELECT
    'recordings' AS entity_type,
    COUNT(*) AS total_count,
    COUNT(CASE WHEN discriminant IS NOT NULL THEN 1 END) AS with_discriminant,
    COUNT(CASE WHEN discriminant IS NULL AND extract_discriminant(name) IS NOT NULL THEN 1 END) AS extractable_discriminant,
    ROUND(
        COUNT(CASE WHEN discriminant IS NOT NULL THEN 1 END) * 100.0 / NULLIF(COUNT(*), 0), 2
    ) AS discriminant_percentage
FROM recordings
UNION ALL
SELECT
    'releases' AS entity_type,
    COUNT(*) AS total_count,
    COUNT(CASE WHEN discriminant IS NOT NULL THEN 1 END) AS with_discriminant,
    COUNT(CASE WHEN discriminant IS NULL AND extract_discriminant(name) IS NOT NULL THEN 1 END) AS extractable_discriminant,
    ROUND(
        COUNT(CASE WHEN discriminant IS NOT NULL THEN 1 END) * 100.0 / NULLIF(COUNT(*), 0), 2
    ) AS discriminant_percentage
FROM releases;

COMMENT ON VIEW discriminant_stats IS 'Statistics showing discriminant usage and extraction potential across entity types';
-252
services/migrations/20241220000008_fix_discriminant_case_sensitivity.sql
-252
services/migrations/20241220000008_fix_discriminant_case_sensitivity.sql
···
1
-
-- Fix case sensitivity in discriminant extraction patterns
2
-
-- This migration updates the discriminant extraction functions to properly handle case-insensitive matching
3
-
4
-
-- Drop dependent views first, then functions, then recreate everything
DROP VIEW IF EXISTS discriminant_analysis CASCADE;
DROP VIEW IF EXISTS discriminant_stats CASCADE;

-- Drop existing functions to replace with case-insensitive versions
DROP FUNCTION IF EXISTS extract_discriminant(TEXT) CASCADE;
DROP FUNCTION IF EXISTS get_base_name(TEXT) CASCADE;
DROP FUNCTION IF EXISTS extract_edition_discriminant(TEXT) CASCADE;

-- Function: extract_discriminant(name_text)
--
-- Case-insensitive extraction of the first piece of variant information in
-- name_text; NULL when the input is NULL/blank or nothing matches. Patterns
-- are tried in this order, and within each delimiter in the order keyword
-- list, year + release word, volume/part/disc number, featuring/with/vs,
-- soundtrack source:
--   1. (...) groups   2. [...] groups   3. {...} groups
--   4. a trailing "- ..." suffix   5. a trailing ": ..." suffix
-- The capture is trimmed and stripped of leading/trailing non-word characters.
--
-- Fixes relative to the previous version:
--   * Inside delimiters the text after the keyword is matched with a negated
--     class instead of '.*?'. PostgreSQL decides greediness for the regular
--     expression as a whole, so '.*?' ran to the LAST closing delimiter and
--     "Song (Live) (Remix)" yielded "Live) (Remix".
--   * The dash/colon year patterns captured only the digits ("2011"); they now
--     capture the whole suffix ("2011 Mix").
--   * The keyword lists are written once and the patterns are assembled from
--     them (the dash and colon lists are prefixes of the full list).
-- NOTE(review): keywords are matched as substrings (e.g. "ep" inside "Keep");
-- that pre-existing behavior is unchanged.
-- NOTE(review): a group containing nested delimiters, e.g. "(Live (2020))",
-- is now cut at the first closing delimiter.
CREATE OR REPLACE FUNCTION extract_discriminant(name_text TEXT) RETURNS TEXT AS $$
DECLARE
    -- Keywords accepted after a colon.
    kw_core CONSTANT TEXT := 'deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original|alternate|alternative|unreleased|rare|exclusive';
    -- Keywords accepted after a dash.
    kw_media CONSTANT TEXT := kw_core || '|digital|vinyl|cd|dvd|blu-ray';
    -- Keywords accepted inside (), [] and {}.
    kw_all CONSTANT TEXT := kw_media || '|hdtv|web|retail|promo|single|ep|lp|maxi|mini|club|dance|house|techno|trance|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie';
    -- Two- or four-digit year followed by a release word.
    year_kw CONSTANT TEXT := '(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)';
    -- What a delimited group must contain, in priority order.
    group_fragments CONSTANT TEXT[] := ARRAY[
        '(?:' || kw_all || ')',
        year_kw,
        '(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+',
        '(?:feat\.|featuring|ft\.|with|vs\.|versus|&|and)\s+',
        '(?:from|soundtrack|ost|score|theme)'
    ];
    openers CONSTANT TEXT[] := ARRAY['\(', '\[', '\{'];
    closers CONSTANT TEXT[] := ARRAY['\)', '\]', '\}'];
    -- "Anything but the closing delimiter", per delimiter kind.
    fillers CONSTANT TEXT[] := ARRAY['[^)]*', '[^]]*', '[^}]*'];

    discriminant_patterns TEXT[] := ARRAY[]::TEXT[];
    fragment TEXT;
    pattern TEXT;
    match_result TEXT;
BEGIN
    -- Return early if input is null or empty
    IF name_text IS NULL OR trim(name_text) = '' THEN
        RETURN NULL;
    END IF;

    -- Delimited groups: all parenthesis patterns, then brackets, then braces.
    FOR idx IN 1..array_length(openers, 1)
    LOOP
        FOREACH fragment IN ARRAY group_fragments
        LOOP
            discriminant_patterns := array_append(
                discriminant_patterns,
                '(?i)' || openers[idx] || '(' || fillers[idx] || fragment || fillers[idx] || ')' || closers[idx]
            );
        END LOOP;
    END LOOP;

    -- Trailing dash and colon suffixes, anchored at the end of the name.
    discriminant_patterns := discriminant_patterns || ARRAY[
        '(?i)[-–—]\s*([^-–—]*(?:' || kw_media || ').*)$',
        '(?i)[-–—]\s*(' || year_kw || '.*)$',
        '(?i):\s*([^:]*(?:' || kw_core || ').*)$',
        '(?i):\s*(' || year_kw || '.*)$'
    ];

    -- Try each pattern to find discriminant information
    FOREACH pattern IN ARRAY discriminant_patterns
    LOOP
        -- substring(... FROM pattern) returns the first capturing group.
        match_result := substring(name_text FROM pattern);
        IF match_result IS NOT NULL AND length(trim(match_result)) > 0 THEN
            match_result := trim(match_result);
            -- Remove leading/trailing punctuation
            match_result := regexp_replace(match_result, '^[^\w]+|[^\w]+$', '', 'g');
            -- Skip captures that were nothing but punctuation.
            IF length(trim(match_result)) > 0 THEN
                RETURN match_result;
            END IF;
        END IF;
    END LOOP;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
75
-
76
-
-- Enhanced function to get base name without discriminant with case-insensitive matching
77
-
CREATE OR REPLACE FUNCTION get_base_name(name_text TEXT) RETURNS TEXT AS $$
-- Strips edition/version/feature "discriminant" tags from a title and returns
-- the remaining base name.
--
-- Parameters:
--   name_text  title to clean; NULL or blank input is returned unchanged.
-- Returns:
--   the title with matching (...), [...], {...}, " - ..." and ": ..." tags
--   removed, whitespace collapsed and trailing punctuation dropped. If the
--   cleanup would leave nothing, the original input is returned instead.
--
-- Two matching rules matter here:
--   * Tag keywords are matched as whole words (\m ... \M), optionally with an
--     s/es/ed suffix, so "ep" no longer matches inside "Sleep" and "pro" no
--     longer matches inside "Improvement".
--   * The tail of an enclosed tag is bounded by the negated closing-delimiter
--     class instead of ".*?". In PostgreSQL a branch that starts with a greedy
--     quantifier (\s*) is greedy as a whole, so ".*?\)" would run to the LAST
--     ")" in the title and swallow unrelated text between two groups.
DECLARE
    -- Keyword lists, smallest to largest; each context extends the previous.
    kw_colon CONSTANT TEXT :=
        'deluxe|remaster|remastered|extended|acoustic|live|radio|edit|version|remix|demo'
        || '|instrumental|explicit|clean|bonus|edition|special|limited|expanded|director''s'
        || '|uncut|final|ultimate|platinum|gold|anniversary|collector''s|standard|enhanced'
        || '|super|mega|ultra|plus|pro|premium|complete|definitive|classic|original'
        || '|alternate|alternative|unreleased|rare|exclusive';
    kw_dash CONSTANT TEXT := kw_colon || '|digital|vinyl|cd|dvd|blu-ray';
    kw_enclosed CONSTANT TEXT := kw_dash
        || '|hdtv|web|retail|promo|single|ep|lp|maxi|mini|club|dance|house|techno|trance'
        || '|ambient|classical|jazz|folk|country|rock|pop|metal|punk|indie';

    -- Optional plural / past-tense suffix followed by an end-of-word constraint.
    word_end CONSTANT TEXT := '(?:s|es|ed)?\M';

    -- "<year> remaster", "98 mix", ...
    year_tag CONSTANT TEXT := '(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)';

    -- Markers that identify an enclosed group as a discriminant, in the order
    -- they are applied for every delimiter kind.
    enclosed_markers CONSTANT TEXT[] := ARRAY[
        '\m(?:' || kw_enclosed || ')' || word_end,
        year_tag,
        '\m(?:vol\.|volume|pt\.|part|disc|disk|cd)\s*\d+',
        '(?:\m(?:feat\.|featuring|ft\.|with|vs\.|versus|and)|&)\s+',
        '\m(?:from|soundtrack|ost|score|theme)' || word_end
    ];

    -- One row per delimiter kind: opener, "anything but the closer", closer.
    delimiters CONSTANT TEXT[] := ARRAY[
        ARRAY['\(', '[^)]*', '\)'],
        ARRAY['\[', '[^]]*', '\]'],
        ARRAY['\{', '[^}]*', '\}']
    ];

    -- Suffix patterns anchored at the end of the title (dash, then colon).
    suffix_patterns CONSTANT TEXT[] := ARRAY[
        '(?i)\s*[-–—]\s*[^-–—]*\m(?:' || kw_dash || ')' || word_end || '.*$',
        '(?i)\s*[-–—]\s*' || year_tag || '.*$',
        '(?i)\s*:\s*[^:]*\m(?:' || kw_colon || ')' || word_end || '.*$',
        '(?i)\s*:\s*' || year_tag || '.*$'
    ];

    d INTEGER;
    marker TEXT;
    pattern TEXT;
    result_text TEXT := name_text;
BEGIN
    -- Return early if input is null or empty
    IF name_text IS NULL OR trim(name_text) = '' THEN
        RETURN name_text;
    END IF;

    -- Enclosed tags: parentheses first, then brackets, then braces.
    FOR d IN 1 .. array_length(delimiters, 1) LOOP
        FOREACH marker IN ARRAY enclosed_markers LOOP
            pattern := '(?i)\s*' || delimiters[d][1] || delimiters[d][2]
                       || marker
                       || delimiters[d][2] || delimiters[d][3] || '\s*';
            -- Replace with a space so the words on either side stay separated.
            result_text := regexp_replace(result_text, pattern, ' ', 'g');
        END LOOP;
    END LOOP;

    -- Trailing " - <tag>" and ": <tag>" suffixes.
    FOREACH pattern IN ARRAY suffix_patterns LOOP
        result_text := regexp_replace(result_text, pattern, ' ', 'g');
    END LOOP;

    -- Clean up extra whitespace and normalize
    result_text := regexp_replace(trim(result_text), '\s+', ' ', 'g');

    -- Remove trailing punctuation that might be left after removal
    result_text := regexp_replace(result_text, '[,;:\-–—]\s*$', '', 'g');
    result_text := trim(result_text);

    -- Ensure we don't return an empty string
    IF length(result_text) = 0 THEN
        RETURN name_text;
    END IF;

    RETURN result_text;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
140
-
141
-
-- Enhanced function to extract discriminant specifically for editions and versions with case-insensitive matching
142
-
CREATE OR REPLACE FUNCTION extract_edition_discriminant(name_text TEXT) RETURNS TEXT AS $$
-- Returns the first edition / version / remaster tag found in a title, or NULL.
--
-- Patterns are tried strictly in array order and the first one that yields a
-- non-blank capture wins. The capture is trimmed and stripped of leading and
-- trailing non-word characters before being returned. NULL or blank input
-- yields NULL.
DECLARE
    edition_patterns CONSTANT TEXT[] := ARRAY[
        -- "edition" inside (), [], {}, or as a dash / colon suffix
        '(?i)\(([^)]*edition[^)]*)\)',
        '(?i)\[([^]]*edition[^]]*)\]',
        '(?i)\{([^}]*edition[^}]*)\}',
        '(?i)[-–—]\s*([^-–—]*edition[^-–—]*)$',
        '(?i):\s*([^:]*edition[^:]*)$',

        -- "version"
        '(?i)\(([^)]*version[^)]*)\)',
        '(?i)\[([^]]*version[^]]*)\]',
        '(?i)\{([^}]*version[^}]*)\}',
        '(?i)[-–—]\s*([^-–—]*version[^-–—]*)$',
        '(?i):\s*([^:]*version[^:]*)$',

        -- "remaster"
        '(?i)\(([^)]*remaster[^)]*)\)',
        '(?i)\[([^]]*remaster[^]]*)\]',
        '(?i)\{([^}]*remaster[^}]*)\}',
        '(?i)[-–—]\s*([^-–—]*remaster[^-–—]*)$',
        '(?i):\s*([^:]*remaster[^:]*)$',

        -- year followed by a release-type word
        '(?i)\(([^)]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^)]*)\)',
        '(?i)\[([^]]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^]]*)\]',
        '(?i)\{([^}]*(?:\d{4}|\d{2})\s*(?:remaster|edition|version|mix|cut|release)[^}]*)\}'
    ];

    idx INTEGER;
    candidate TEXT;
BEGIN
    -- Nothing to extract from a missing or blank title.
    IF name_text IS NULL OR trim(name_text) = '' THEN
        RETURN NULL;
    END IF;

    FOR idx IN 1 .. array_length(edition_patterns, 1) LOOP
        -- substring(... FROM regex) yields the first capturing group, or NULL.
        candidate := substring(name_text FROM edition_patterns[idx]);

        -- Skip patterns that did not match or captured only whitespace.
        CONTINUE WHEN candidate IS NULL OR length(trim(candidate)) = 0;

        -- Drop surrounding whitespace, then any non-word characters at either end.
        candidate := regexp_replace(trim(candidate), '^[^\w]+|[^\w]+$', '', 'g');

        IF length(trim(candidate)) > 0 THEN
            RETURN candidate;
        END IF;
    END LOOP;

    RETURN NULL;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
197
-
198
-
-- Update existing records with newly extracted discriminants (case-insensitive)
199
-
-- Backfill: for each target column, fill rows whose discriminant is still NULL
-- and for which extract_discriminant() yields a value. Rows that already have
-- a discriminant are left untouched.

-- recordings.discriminant <- recordings.name
UPDATE recordings AS rec
   SET discriminant = extract_discriminant(rec.name)
 WHERE rec.discriminant IS NULL
   AND extract_discriminant(rec.name) IS NOT NULL;

-- releases.discriminant <- releases.name
UPDATE releases AS rel
   SET discriminant = extract_discriminant(rel.name)
 WHERE rel.discriminant IS NULL
   AND extract_discriminant(rel.name) IS NOT NULL;

-- plays.track_discriminant <- plays.track_name
UPDATE plays AS p
   SET track_discriminant = extract_discriminant(p.track_name)
 WHERE p.track_discriminant IS NULL
   AND extract_discriminant(p.track_name) IS NOT NULL;

-- plays.release_discriminant <- plays.release_name (only where a release name exists)
UPDATE plays AS p
   SET release_discriminant = extract_discriminant(p.release_name)
 WHERE p.release_discriminant IS NULL
   AND p.release_name IS NOT NULL
   AND extract_discriminant(p.release_name) IS NOT NULL;
219
-
220
-
-- Update comments for the enhanced functions
221
-
-- Catalog descriptions for the three discriminant helper functions.
COMMENT ON FUNCTION extract_discriminant
    IS 'Enhanced case-insensitive discriminant extraction supporting comprehensive edition/version patterns including parentheses, brackets, braces, dashes, and colons';

COMMENT ON FUNCTION get_base_name
    IS 'Enhanced case-insensitive base name extraction removing comprehensive discriminant patterns to enable proper grouping';

COMMENT ON FUNCTION extract_edition_discriminant
    IS 'Specialized case-insensitive function for extracting edition and version discriminants with focused patterns';
224
-
225
-
-- Refresh materialized views to reflect the case-insensitive improvements
226
-
-- Rebuild both play-count materialized views, releases first, then recordings.
REFRESH MATERIALIZED VIEW
    mv_release_play_counts;

REFRESH MATERIALIZED VIEW
    mv_recording_play_counts;
228
-
229
-
-- Update discriminant analysis view to include case-insensitive results
230
-
-- Recreate the diagnostic view from scratch. For every named recording and
-- release it shows the stored discriminant next to what each helper function
-- computes from the name.
DROP VIEW IF EXISTS discriminant_analysis;

CREATE OR REPLACE VIEW discriminant_analysis AS
    SELECT 'recordings'                           AS table_name,
           rec.name                               AS original_name,
           rec.discriminant,
           get_base_name(rec.name)                AS base_name,
           extract_discriminant(rec.name)         AS extracted_discriminant,
           extract_edition_discriminant(rec.name) AS edition_discriminant
      FROM recordings AS rec
     WHERE rec.name IS NOT NULL
UNION ALL
    SELECT 'releases'                             AS table_name,
           rel.name                               AS original_name,
           rel.discriminant,
           get_base_name(rel.name)                AS base_name,
           extract_discriminant(rel.name)         AS extracted_discriminant,
           extract_edition_discriminant(rel.name) AS edition_discriminant
      FROM releases AS rel
     WHERE rel.name IS NOT NULL;

COMMENT ON VIEW discriminant_analysis
    IS 'Analysis view showing case-insensitive discriminant extraction results for quality assessment and debugging';