Rust AppView - highly experimental!
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at experiments 98 lines 3.5 kB view raw
#!/bin/bash
#
# Train a zstd dictionary for post content from the parakeet_historical
# database, then report how much it improves compression on a sample.
#
# Requires: psql (able to connect to parakeet_historical), zstd, awk.
# Writes:   ./parakeet-db/src/dicts/post_content_v<DICT_VERSION>.dict
# Scratch data lives in a private mktemp directory that is removed on exit,
# including when a step fails.
set -euo pipefail

readonly DICT_SIZE=524288 # 512 * 1024 bytes (reported as "500KB" in the messages below)
readonly DICT_VERSION=1
readonly OUTPUT_DIR="./parakeet-db/src/dicts"
readonly DICT_FILE="${OUTPUT_DIR}/post_content_v${DICT_VERSION}.dict"
readonly SAMPLE_POSTS=10000

# Private scratch directory; assigned in main, removed by cleanup.
TEMP_DIR=""

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Remove the scratch directory. Safe to call more than once.
cleanup() {
  if [[ -n "${TEMP_DIR}" ]]; then
    rm -rf -- "${TEMP_DIR}"
  fi
}

# Print the size of file $1 in bytes.
# wc -c is used instead of stat because stat's flags differ between GNU and BSD.
file_size() {
  local bytes
  bytes=$(wc -c < "$1") || return
  # BSD wc pads its output with spaces; strip them.
  printf '%s\n' "${bytes//[[:space:]]/}"
}

# Print $1 / $2 with two decimals. Fails instead of dividing by zero.
divide() {
  if [[ "$2" == 0 ]]; then
    printf 'error: division by zero (%s / %s)\n' "$1" "$2" >&2
    return 1
  fi
  # Values are passed with -v so they are never parsed as awk program text.
  awk -v num="$1" -v den="$2" 'BEGIN { printf "%.2f", num / den }'
}

main() {
  local posts_file sample_file plain_zst dict_zst
  local total_size total_lines total_size_mb
  local dict_actual_size dict_size_kb
  local test_size no_dict_size with_dict_size
  local no_dict_ratio with_dict_ratio improvement

  echo "=========================================="
  echo "Post Content Dictionary Training"
  echo "=========================================="
  echo "Dictionary size: 500KB"
  echo "Database: parakeet_historical"
  echo ""

  # Create directories
  mkdir -p "${OUTPUT_DIR}"
  trap cleanup EXIT
  TEMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/parakeet_dict_training.XXXXXX") \
    || die "could not create a temporary directory"

  posts_file="${TEMP_DIR}/all_posts.txt"
  sample_file="${TEMP_DIR}/test_sample.txt"
  plain_zst="${TEMP_DIR}/test_no_dict.zst"
  dict_zst="${TEMP_DIR}/test_with_dict.zst"

  # Extract all posts from historical database.
  # -X ignores ~/.psqlrc so settings such as \timing cannot add text to the dump.
  echo "[1/4] Extracting posts from parakeet_historical..."
  psql -X -v ON_ERROR_STOP=1 -d parakeet_historical -c "
    COPY (
      SELECT content
      FROM posts
      WHERE content IS NOT NULL
        AND LENGTH(content) > 0
    ) TO STDOUT
  " > "${posts_file}"

  [[ -s "${posts_file}" ]] \
    || die "no posts were extracted from parakeet_historical; nothing to train on"

  # Show stats
  total_size=$(file_size "${posts_file}")
  total_lines=$(wc -l < "${posts_file}")
  total_lines=${total_lines//[[:space:]]/}
  total_size_mb=$(divide "${total_size}" 1048576)

  echo " Extracted: ${total_lines} posts"
  echo " Total size: ${total_size_mb} MB"
  echo ""

  # Train dictionary
  # Use -B to split file into blocks (posts are ~75 bytes avg, use 256 byte blocks)
  echo "[2/4] Training 500KB dictionary..."
  zstd --train "${posts_file}" \
    -o "${DICT_FILE}" \
    --maxdict="${DICT_SIZE}" \
    -B256

  dict_actual_size=$(file_size "${DICT_FILE}")
  dict_size_kb=$(divide "${dict_actual_size}" 1024)

  echo " Dictionary created: ${dict_size_kb} KB"
  echo ""

  # Test compression ratio
  # NOTE(review): the sample is compressed as one file and is drawn from the
  # training data, so the ratio may not reflect per-post compression — confirm.
  echo "[3/4] Testing compression ratio..."

  # Create test sample (10K posts)
  head -n "${SAMPLE_POSTS}" "${posts_file}" > "${sample_file}"
  test_size=$(file_size "${sample_file}")

  # Compress without dictionary
  zstd -q -c "${sample_file}" > "${plain_zst}"
  no_dict_size=$(file_size "${plain_zst}")

  # Compress with dictionary
  zstd -q -D "${DICT_FILE}" -c "${sample_file}" > "${dict_zst}"
  with_dict_size=$(file_size "${dict_zst}")

  # Calculate ratios
  no_dict_ratio=$(divide "${test_size}" "${no_dict_size}")
  with_dict_ratio=$(divide "${test_size}" "${with_dict_size}")
  # %+.2f prints the sign itself, so a regression shows as "-0.10x", not "+-0.10x".
  improvement=$(awk -v with="${with_dict_ratio}" -v without="${no_dict_ratio}" \
    'BEGIN { printf "%+.2f", with - without }')

  echo " Test sample size: ${test_size} bytes"
  echo " Without dictionary: ${no_dict_size} bytes (${no_dict_ratio}x compression)"
  echo " With dictionary: ${with_dict_size} bytes (${with_dict_ratio}x compression)"
  echo " Improvement: ${improvement}x"
  echo ""

  # Cleanup (the EXIT trap covers the failure paths)
  echo "[4/4] Cleaning up..."
  cleanup
  echo ""

  echo "=========================================="
  echo "Dictionary training complete!"
  echo "=========================================="
  echo "Location: ${DICT_FILE}"
  echo "Size: ${dict_size_kb} KB"
  echo "Compression ratio: ${with_dict_ratio}x"
  echo ""
  echo "Next steps:"
  echo "1. Review the compression ratio above"
  echo "2. Run diesel migration to update schema"
  echo "3. Update consumer and parakeet code"
}

main "$@"