forked from
parakeet.at/parakeet
Rust AppView - highly experimental!
#!/bin/bash
#
# Train a zstd dictionary from the post content stored in the
# parakeet_historical database (uses psql and zstd).

# Strict mode: abort on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Upper bound for the trained dictionary, in bytes (512 * 1024 = 512 KiB).
readonly DICT_SIZE=524288
# Version suffix of the dictionary file name (post_content_v<N>.dict).
readonly DICT_VERSION=1
# Directory that receives the trained dictionary.
readonly OUTPUT_DIR="./parakeet-db/src/dicts"

# Private scratch directory. mktemp picks an unpredictable name so a
# pre-existing or attacker-planted /tmp/parakeet_dict_training can neither
# collide with nor hijack the run.
TEMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/parakeet_dict_training.XXXXXX")
readonly TEMP_DIR

# Remove the scratch directory on every exit path, including failures, so an
# aborted run does not leave the extracted posts behind.
cleanup_temp_dir() {
  rm -rf -- "${TEMP_DIR:?}"
}
trap cleanup_temp_dir EXIT
10
# Print the start-up banner (ends with one blank line).
cat <<'BANNER'
==========================================
Post Content Dictionary Training
==========================================
Dictionary size: 500KB
Database: parakeet_historical

BANNER
17
# Make sure the dictionary output directory and the scratch directory exist.
for dir in "$OUTPUT_DIR" "$TEMP_DIR"; do
  mkdir -p "$dir"
done
21
# Dump every non-NULL, non-empty post body from the historical database into
# the scratch file that the later steps read.
#
#   -X                 do not read psqlrc start-up files; anything they print
#                      would otherwise be captured by the stdout redirect and
#                      end up in the training data
#   -q                 suppress psql's informational messages
#   ON_ERROR_STOP=1    make psql stop on the first SQL error
#
# NOTE: COPY ... TO STDOUT writes PostgreSQL's text format, so newlines, tabs
# and backslashes inside a post appear as backslash escapes and each post
# occupies exactly one line of the output file.
echo "[1/4] Extracting posts from parakeet_historical..."
psql -X -q -v ON_ERROR_STOP=1 parakeet_historical -c "
  COPY (
    SELECT content
    FROM posts
    WHERE content IS NOT NULL
      AND LENGTH(content) > 0
  ) TO STDOUT
" > "$TEMP_DIR/all_posts.txt"
32
# Report how much data was extracted: line count and size in MB.
posts_file="$TEMP_DIR/all_posts.txt"

# wc reads from stdin so it prints only the number, and works the same on
# GNU and BSD systems. BSD wc left-pads its output with spaces; re-evaluating
# the value arithmetically strips the padding. The assignment is kept on its
# own line so that a wc failure still aborts the script under `set -e`.
TOTAL_SIZE=$(wc -c < "$posts_file")
TOTAL_SIZE=$((TOTAL_SIZE))
TOTAL_LINES=$(wc -l < "$posts_file")
TOTAL_LINES=$((TOTAL_LINES))

# Pass the byte count with -v instead of splicing it into the awk program.
TOTAL_SIZE_MB=$(awk -v bytes="$TOTAL_SIZE" 'BEGIN { printf "%.2f", bytes / 1048576 }')

echo " Extracted: $TOTAL_LINES posts"
echo " Total size: ${TOTAL_SIZE_MB} MB"
echo ""
41
# Train the dictionary from the extracted posts.
# -B256 has zstd cut the single input file into 256-byte blocks to use as
# training samples (the original author estimated posts at ~75 bytes each).
echo "[2/4] Training 500KB dictionary..."
dict_path="$OUTPUT_DIR/post_content_v${DICT_VERSION}.dict"
zstd --train "$TEMP_DIR/all_posts.txt" -o "$dict_path" "--maxdict=$DICT_SIZE" -B256

# Size of the resulting dictionary: try BSD stat syntax first, then GNU.
DICT_ACTUAL_SIZE=$(stat -f%z "$dict_path" 2>/dev/null || stat -c%s "$dict_path")
DICT_SIZE_KB=$(awk -v bytes="$DICT_ACTUAL_SIZE" 'BEGIN { printf "%.2f", bytes / 1024 }')

printf '%s\n' " Dictionary created: ${DICT_SIZE_KB} KB" ""
55
# Measure how much the trained dictionary improves compression on a sample.
echo "[3/4] Testing compression ratio..."

# Print the size in bytes of file $1 (BSD stat syntax first, then GNU).
file_size() {
  stat -f%z "$1" 2>/dev/null || stat -c%s "$1"
}

# Print $1 / $2 with two decimals. Values are handed to awk with -v rather
# than being interpolated into the program text.
size_ratio() {
  awk -v num="$1" -v den="$2" 'BEGIN { printf "%.2f", num / den }'
}

dict_file="$OUTPUT_DIR/post_content_v${DICT_VERSION}.dict"
sample_file="$TEMP_DIR/test_sample.txt"

# Test sample: the first 10K lines (posts) of the extracted file.
# NOTE(review): the sample is taken from the same file the dictionary was
# trained on and is compressed as one stream, so the ratios below may not
# reflect per-post compression of unseen data - confirm this is intended.
head -n 10000 "$TEMP_DIR/all_posts.txt" > "$sample_file"
TEST_SIZE=$(file_size "$sample_file")

# Baseline: compress without a dictionary.
zstd -q -c "$sample_file" > "$TEMP_DIR/test_no_dict.zst"
NO_DICT_SIZE=$(file_size "$TEMP_DIR/test_no_dict.zst")

# Same sample, compressed with the trained dictionary.
zstd -q -D "$dict_file" -c "$sample_file" > "$TEMP_DIR/test_with_dict.zst"
WITH_DICT_SIZE=$(file_size "$TEMP_DIR/test_with_dict.zst")

# Ratios are original size / compressed size.
NO_DICT_RATIO=$(size_ratio "$TEST_SIZE" "$NO_DICT_SIZE")
WITH_DICT_RATIO=$(size_ratio "$TEST_SIZE" "$WITH_DICT_SIZE")
# %+.2f emits the sign itself, so a regression prints as "-0.05x" instead of
# the malformed "+-0.05x".
IMPROVEMENT=$(awk -v with_dict="$WITH_DICT_RATIO" -v no_dict="$NO_DICT_RATIO" \
  'BEGIN { printf "%+.2f", with_dict - no_dict }')

echo " Test sample size: $TEST_SIZE bytes"
echo " Without dictionary: $NO_DICT_SIZE bytes (${NO_DICT_RATIO}x compression)"
echo " With dictionary: $WITH_DICT_SIZE bytes (${WITH_DICT_RATIO}x compression)"
echo " Improvement: ${IMPROVEMENT}x"
echo ""
82
# Remove the scratch directory and everything extracted into it.
printf '%s\n' "[4/4] Cleaning up..."
rm -rf "$TEMP_DIR"
printf '\n'
87
# Final report: where the dictionary was written, its size, the measured
# ratio, and the manual follow-up steps.
cat <<SUMMARY
==========================================
Dictionary training complete!
==========================================
Location: $OUTPUT_DIR/post_content_v${DICT_VERSION}.dict
Size: ${DICT_SIZE_KB} KB
Compression ratio: ${WITH_DICT_RATIO}x

Next steps:
1. Review the compression ratio above
2. Run diesel migration to update schema
3. Update consumer and parakeet code
SUMMARY