+7
-1
src/features/services/check_hn.ts
+7
-1
src/features/services/check_hn.ts
···
25
} from "../../libs/hackernews";
26
import { addDays } from "../../libs/time";
27
import type { AnyMessageBlock } from "slack-edge";
28
-
import { sqlite } from "../../libs/db";
29
30
// Constants
31
const TOP_STORIES_LIMIT = 30; // Front page is considered the top 30 stories
···
699
.delete(leaderboardSnapshots)
700
.where(lt(leaderboardSnapshots.expiresAt, currentTime));
701
}
702
} catch (error) {
703
console.error("Error cleaning up expired data:", error);
704
Sentry.captureException(error);
···
25
} from "../../libs/hackernews";
26
import { addDays } from "../../libs/time";
27
import type { AnyMessageBlock } from "slack-edge";
28
+
import { optimizeLeaderboardSnapshots, sqlite } from "../../libs/db";
29
30
// Constants
31
const TOP_STORIES_LIMIT = 30; // Front page is considered the top 30 stories
···
699
.delete(leaderboardSnapshots)
700
.where(lt(leaderboardSnapshots.expiresAt, currentTime));
701
}
702
+
703
+
// Optimize leaderboard snapshots by removing redundant ones with batching
704
+
// Use a batch size of 100 for better performance without overwhelming the database
705
+
// Use conservative mode (true) to ensure we don't remove important data during sharp changes
706
+
const { optimizeLeaderboardSnapshots } = await import("../../libs/db");
707
+
await optimizeLeaderboardSnapshots(100, true);
708
} catch (error) {
709
console.error("Error cleaning up expired data:", error);
710
Sentry.captureException(error);
+250
-2
src/libs/db.ts
+250
-2
src/libs/db.ts
···
1
import { drizzle } from "drizzle-orm/bun-sqlite";
2
import { Database } from "bun:sqlite";
3
import * as schema from "./schema";
4
5
// Use environment variable for the database path in production
6
const dbPath = process.env.DATABASE_PATH || "./local.db";
···
9
const sqlite = new Database(dbPath, {
10
// Use WAL mode for better concurrency
11
readonly: false,
12
-
create: true
13
});
14
15
// Set a longer busy timeout to reduce "database is locked" errors
···
31
// Create a Drizzle instance with the database and schema
32
export const db = drizzle(sqlite, { schema });
33
34
// Export the sqlite instance and schema for use in other files
35
-
export { sqlite, schema };
···
1
import { drizzle } from "drizzle-orm/bun-sqlite";
2
import { Database } from "bun:sqlite";
3
import * as schema from "./schema";
4
+
import { eq, and, notInArray, count } from "drizzle-orm";
5
+
import * as Sentry from "@sentry/bun";
6
+
7
+
// Define interface for snapshot data
8
+
/**
 * The columns read from a `leaderboard_snapshots` row when deciding which
 * snapshots of a story are redundant.
 */
interface Snapshot {
  /** Row identifier; used to mark the row as kept or removable. */
  id: number;
  /** Capture time; rows are read ordered by this column. */
  timestamp: number;
  /** Leaderboard position recorded in this snapshot. */
  position: number;
  /** Story score recorded in this snapshot. */
  score: number;
}
14
+
15
+
/**
 * One result row of the candidate query that groups `leaderboard_snapshots`
 * by story and counts the rows in each group.
 */
interface StoryCount {
  /** The `story_id` the group was built from. */
  story_id: number;
  /** `COUNT(*)` of snapshot rows for that story. */
  snapshot_count: number;
}
19
20
// Use environment variable for the database path in production
21
const dbPath = process.env.DATABASE_PATH || "./local.db";
···
24
// Open the database file read-write, creating it when it does not exist yet.
// NOTE(review): these constructor options do not themselves select a journal
// mode; WAL is presumably enabled by a PRAGMA in lines not shown here — confirm.
const sqlite = new Database(dbPath, { readonly: false, create: true });
29
30
// Set a longer busy timeout to reduce "database is locked" errors
···
46
// Create a Drizzle instance with the database and schema
47
export const db = drizzle(sqlite, { schema });
48
49
+
/**
 * Scans consecutive snapshots of one story and reports how volatile it was.
 *
 * A step counts as "significant" when the position moves by 3 or more places,
 * or the score moves by at least 10% of the previous score (with a floor of
 * 5 points).
 */
function measureSnapshotVolatility(snapshots: readonly Snapshot[]): {
  significantChanges: number;
  maxPositionJump: number;
  maxScoreJump: number;
} {
  let significantChanges = 0;
  let maxPositionJump = 0;
  let maxScoreJump = 0;

  for (let i = 1; i < snapshots.length; i++) {
    const previous = snapshots[i - 1];
    const current = snapshots[i];
    if (!previous || !current) continue;

    // `?? 0` guards against NULL columns coming back from SQLite even though
    // the interface declares plain numbers.
    const previousScore = previous.score ?? 0;
    const positionDiff = Math.abs(
      (current.position ?? 0) - (previous.position ?? 0),
    );
    const scoreDiff = Math.abs((current.score ?? 0) - previousScore);

    maxPositionJump = Math.max(maxPositionJump, positionDiff);
    maxScoreJump = Math.max(maxScoreJump, scoreDiff);

    if (positionDiff >= 3 || scoreDiff >= Math.max(5, previousScore * 0.1)) {
      significantChanges++;
    }
  }

  return { significantChanges, maxPositionJump, maxScoreJump };
}

/**
 * Picks the ids of the snapshots that must survive pruning.
 *
 * Always kept: the first and the last snapshot. In between, a snapshot is kept
 * when its position or score differs from the last kept snapshot (together
 * with the snapshot right before it, so the end of the preceding flat stretch
 * is preserved). In conservative mode every `keepEveryNthPoint`-th snapshot is
 * kept as well. Snapshots whose position or score is not a number are never
 * selected on their own.
 */
function selectSnapshotIdsToKeep(
  snapshots: readonly Snapshot[],
  conservative: boolean,
  keepEveryNthPoint: number,
): Set<number> {
  const keepIds = new Set<number>();
  const keep = (snapshot: Snapshot | undefined): void => {
    if (snapshot && typeof snapshot.id === "number") keepIds.add(snapshot.id);
  };

  const first = snapshots[0];
  if (!first) return keepIds;
  keep(first);
  keep(snapshots[snapshots.length - 1]);

  // Baseline = values of the most recently kept snapshot.
  let lastPosition = first.position;
  let lastScore = first.score;
  let lastKeptIndex = 0;

  for (let i = 1; i < snapshots.length - 1; i++) {
    const snapshot = snapshots[i];
    if (
      !snapshot ||
      typeof snapshot.position !== "number" ||
      typeof snapshot.score !== "number"
    ) {
      continue;
    }

    const changed =
      snapshot.position !== lastPosition || snapshot.score !== lastScore;
    const onInterval = conservative && i % keepEveryNthPoint === 0;
    if (!changed && !onInterval) continue;

    // Keep the last snapshot before a change so step edges stay visible.
    if (changed && i - 1 > lastKeptIndex) {
      keep(snapshots[i - 1]);
    }
    keep(snapshot);

    // Move the baseline for every kept snapshot (interval keeps included),
    // otherwise later comparisons run against stale values and a return to
    // an earlier value would be silently dropped.
    lastPosition = snapshot.position;
    lastScore = snapshot.score;
    lastKeptIndex = i;
  }

  return keepIds;
}

/**
 * Removes redundant rows from `leaderboard_snapshots`.
 *
 * One call handles a single batch: the `batchSize` stories with the most
 * snapshots among those having more than 3. For each of them the first and
 * last snapshot, every snapshot where position or score changed (plus the one
 * right before the change) and, in conservative mode, snapshots at a regular
 * interval are kept; everything else that was examined is deleted.
 *
 * Only rows that were actually read are deleted, so snapshots inserted by
 * another connection while a story is being processed are left alone.
 *
 * The function is declared `async` for its callers but performs all database
 * work synchronously. Errors are logged and reported to Sentry, never thrown.
 *
 * NOTE(review): candidates are ordered by snapshot count, and a story that
 * still has more than 3 snapshots after pruning stays a candidate, so repeated
 * calls may keep selecting the same stories — confirm this is acceptable.
 *
 * @param batchSize - Maximum number of stories handled by this call (default: 50)
 * @param conservative - When true, additionally keeps every 2nd snapshot for
 *   volatile stories and every 4th otherwise (default: true)
 * @returns Number of eligible stories not processed by this call; 0 when there
 *   was nothing to do or when the optimization failed as a whole.
 */
async function optimizeLeaderboardSnapshots(
  batchSize = 50,
  conservative = true,
): Promise<number> {
  try {
    console.log("Starting leaderboard snapshots optimization...");
    const startTime = Date.now();

    // Count eligible stories (more than 3 snapshots); run the query once.
    const countRow = sqlite
      .query(
        "SELECT COUNT(*) as count FROM (SELECT story_id FROM leaderboard_snapshots GROUP BY story_id HAVING COUNT(*) > 3)",
      )
      .get() as { count: number } | null;
    const storyCount = countRow?.count ?? 0;

    if (storyCount === 0) {
      console.log("No stories with snapshots to optimize");
      return 0;
    }

    console.log(
      `Found ${storyCount} stories with leaderboard snapshots to analyze`,
    );
    let totalRedundantSnapshots = 0;
    let processedStories = 0;

    // Stories with the most snapshots first; the limit is a bound parameter.
    const candidateStories = sqlite
      .query(
        `SELECT story_id, COUNT(*) as snapshot_count
         FROM leaderboard_snapshots
         GROUP BY story_id
         HAVING COUNT(*) > 3
         ORDER BY snapshot_count DESC
         LIMIT ?`,
      )
      .all(batchSize) as StoryCount[];

    // Scratch table holding the ids to delete for the story being processed.
    sqlite.exec("DROP TABLE IF EXISTS temp_snapshots_to_delete");
    sqlite.exec(
      "CREATE TEMPORARY TABLE temp_snapshots_to_delete (id INTEGER PRIMARY KEY)",
    );

    // Prepare once, reuse for every story.
    const selectSnapshots = sqlite.prepare(
      `SELECT id, timestamp, position, score
       FROM leaderboard_snapshots
       WHERE story_id = ?
       ORDER BY timestamp`,
    );
    const clearStaged = sqlite.prepare("DELETE FROM temp_snapshots_to_delete");
    const stageForDeletion = sqlite.prepare(
      "INSERT OR IGNORE INTO temp_snapshots_to_delete (id) VALUES (?)",
    );
    const deleteStaged = sqlite.prepare(
      `DELETE FROM leaderboard_snapshots
       WHERE story_id = ?
         AND id IN (SELECT id FROM temp_snapshots_to_delete)`,
    );

    try {
      for (const story of candidateStories) {
        const storyId = story.story_id;
        if (!storyId) continue;

        try {
          const snapshots = selectSnapshots.all(storyId) as Snapshot[];

          if (snapshots.length <= 3) {
            console.log(
              `Skipping story ${storyId}: Only ${snapshots.length} snapshots (minimum 4 required)`,
            );
            continue;
          }

          // Volatility is only measured in conservative mode; otherwise the
          // jump statistics stay at zero.
          const { significantChanges, maxPositionJump, maxScoreJump } =
            conservative
              ? measureSnapshotVolatility(snapshots)
              : { significantChanges: 0, maxPositionJump: 0, maxScoreJump: 0 };

          const hasSharpChanges =
            significantChanges >= 2 ||
            maxPositionJump >= 5 ||
            maxScoreJump >= 20;
          // Keep more interval points when the story has sharp changes.
          const keepEveryNthPoint = hasSharpChanges ? 2 : 4;

          const keepIds = selectSnapshotIdsToKeep(
            snapshots,
            conservative,
            keepEveryNthPoint,
          );

          // Stage exactly the examined rows that were not selected.
          clearStaged.run();
          for (const snapshot of snapshots) {
            if (typeof snapshot.id === "number" && !keepIds.has(snapshot.id)) {
              stageForDeletion.run(snapshot.id);
            }
          }

          const deletedCount = deleteStaged.run(storyId).changes;

          totalRedundantSnapshots += deletedCount;
          processedStories++;

          const keptCount = snapshots.length - deletedCount;
          const keepPercent = Math.round((keptCount / snapshots.length) * 100);

          console.log(
            `Story ${storyId}: ${keptCount}/${snapshots.length} snapshots kept (${keepPercent}%)${hasSharpChanges ? " - SHARP CHANGES DETECTED" : ""} - Max jumps: pos=${maxPositionJump}, score=${maxScoreJump}`,
          );

          if (processedStories % 10 === 0) {
            console.log(
              `Processed ${processedStories}/${storyCount} stories, removed ${totalRedundantSnapshots} redundant snapshots so far`,
            );
          }
        } catch (error) {
          // A failure on one story must not stop the rest of the batch.
          console.error(
            `Error optimizing snapshots for story ${storyId}:`,
            error,
          );
          Sentry.captureException(error);
        }
      }
    } finally {
      // Release the statements before dropping the table they reference.
      selectSnapshots.finalize();
      clearStaged.finalize();
      stageForDeletion.finalize();
      deleteStaged.finalize();
      sqlite.exec("DROP TABLE IF EXISTS temp_snapshots_to_delete");
    }

    const duration = (Date.now() - startTime) / 1000;
    console.log(
      `Leaderboard optimization complete: processed ${processedStories}/${storyCount} stories, removed ${totalRedundantSnapshots} redundant snapshots in ${duration.toFixed(2)}s`,
    );

    // Eligible stories that this call did not process.
    return storyCount - processedStories;
  } catch (error) {
    console.error("Error during leaderboard snapshots optimization:", error);
    Sentry.captureException(error);
    return 0;
  }
}
281
+
282
// Export the sqlite instance and schema for use in other files
283
+
export { sqlite, schema, optimizeLeaderboardSnapshots };