docs/telegraf-timescaledb-metrics-guide.md (+416 -128)
···
 
 This setup creates a metrics pipeline that:
 - Collects StatsD metrics via Telegraf on UDP port 8125
-- Stores metrics in PostgreSQL with TimescaleDB extensions
+- Creates individual PostgreSQL tables for each metric type
+- Stores metric tags as JSONB for flexible querying
 - Automatically creates hypertables for time-series optimization
 - Provides a complete Docker Compose configuration for easy deployment
 
+## Important Note on Table Structure
+
+The Telegraf PostgreSQL output plugin with the configuration in this guide creates **individual tables for each metric name**. For example:
+- `quickdid.http.request.count` becomes table `"quickdid.http.request.count"`
+- `quickdid.resolver.rate_limit.available_permits` becomes table `"quickdid.resolver.rate_limit.available_permits"`
+
+Each table has the following structure:
+- `time` (timestamptz) - the timestamp of the metric
+- `tags` (jsonb) - all tags stored as a single JSON object
+- metric-specific value columns (e.g., `value`, `mean`, `99_percentile`)
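+
+As a rough illustration (a sketch, not the plugin's verbatim output; the exact value columns and types depend on which fields Telegraf aggregates for each metric type), the DDL rendered from the `create_templates` shown later in telegraf.conf looks approximately like:
+
+```sql
+-- Illustrative only: approximate shape of an auto-generated counter table
+CREATE TABLE IF NOT EXISTS "quickdid.http.request.count" (
+    "time" timestamptz,
+    "tags" jsonb,
+    "value" bigint  -- counters emit a single numeric field
+);
+SELECT create_hypertable('"quickdid.http.request.count"', 'time', if_not_exists => TRUE);
+```
+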
 ## Prerequisites
 
 - Docker and Docker Compose installed
···
 ├── docker-compose.yml
 ├── telegraf/
 │   └── telegraf.conf
+├── test-scripts/
+│   ├── send-metrics.sh
+│   └── verify-queries.sql
 └── .env
 ```
 
···
 [[outputs.postgresql]]
   connection = "host=timescaledb user=${TELEGRAF_DB_USER} password=${TELEGRAF_DB_PASSWORD} dbname=${POSTGRES_DB} sslmode=disable"
   schema = "public"
+
+  # Create individual tables for each metric, with hypertable support
   create_templates = [
     '''CREATE TABLE IF NOT EXISTS {{.table}} ({{.columns}})''',
     '''SELECT create_hypertable({{.table|quoteLiteral}}, 'time', if_not_exists => TRUE)''',
   ]
+
+  # Store all tags as JSONB for flexible querying
   tags_as_jsonb = true
-
+
+  # Keep fields as separate columns for better performance on aggregations
   fields_as_jsonb = false
 ```
 
 ### 3. Docker Compose Configuration (docker-compose.yml)
···
 
 services:
   timescaledb:
-    image: timescale/timescaledb-ha:pg17
+    image: timescale/timescaledb:latest-pg17
     container_name: timescaledb
     restart: unless-stopped
     environment:
     volumes:
-      - timescale_data:/home/postgres/pgdata/data
+      - timescale_data:/var/lib/postgresql/data
       - ./init-scripts:/docker-entrypoint-initdb.d:ro
+    command:
+      - postgres
+      - -c
+      - shared_buffers=1GB
+      - -c
+      - effective_cache_size=3GB
+      - -c
+      - maintenance_work_mem=512MB
+      - -c
+      - work_mem=32MB
+      - -c
+      - timescaledb.max_background_workers=8
+      - -c
+      - max_parallel_workers_per_gather=2
+      - -c
+      - max_parallel_workers=8
     healthcheck:
       test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}"]
       interval: 10s
···
       - metrics_network
 
   telegraf:
-    image: telegraf
+    image: telegraf:1.35
     container_name: telegraf
     restart: unless-stopped
     environment:
···
 
 ### 4. Database Initialization Script (optional)
 
-Create `init-scripts/01-init.sql` to set up the Telegraf user and schema:
+Create `init-scripts/01-init.sql` to set up the TimescaleDB extension:
 
 ```sql
 -- Enable TimescaleDB extension
 CREATE EXTENSION IF NOT EXISTS timescaledb;
+
+-- Enable additional useful extensions
+CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
+```
+
+## Test Scripts
+
+### 1. Send Test Metrics Script (test-scripts/send-metrics.sh)
+
+Create a script to send various types of metrics. Tags are attached with the DogStatsD-style `|#key:value` suffix, which the `datadog_extensions` setting in telegraf.conf enables:
+
+```bash
+#!/bin/bash
+
+# Send test metrics to StatsD/Telegraf
+
+echo "Sending test metrics to StatsD on localhost:8125..."
+
+# Counter metrics
+for i in {1..10}; do
+    echo "quickdid.http.request.count:1|c|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
+    echo "quickdid.http.request.count:1|c|#method:POST,path:/api,status:201" | nc -u -w0 localhost 8125
+    echo "quickdid.http.request.count:1|c|#method:GET,path:/resolve,status:404" | nc -u -w0 localhost 8125
+done
+
+# Gauge metrics
+echo "quickdid.resolver.rate_limit.available_permits:10|g" | nc -u -w0 localhost 8125
+echo "quickdid.resolver.rate_limit.available_permits:8|g" | nc -u -w0 localhost 8125
+echo "quickdid.resolver.rate_limit.available_permits:5|g" | nc -u -w0 localhost 8125
+
+# Timing metrics (in milliseconds)
+for i in {1..20}; do
+    duration=$((RANDOM % 100 + 10))
+    echo "quickdid.http.request.duration_ms:${duration}|ms|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
+done
+
+for i in {1..10}; do
+    duration=$((RANDOM % 200 + 50))
+    echo "quickdid.http.request.duration_ms:${duration}|ms|#method:POST,path:/api,status:201" | nc -u -w0 localhost 8125
+done
+
+# Histogram metrics
+for i in {1..15}; do
+    resolution_time=$((RANDOM % 500 + 50))
+    echo "quickdid.resolver.resolution_time:${resolution_time}|h|#resolver:redis" | nc -u -w0 localhost 8125
+    echo "quickdid.resolver.resolution_time:$((resolution_time * 2))|h|#resolver:base" | nc -u -w0 localhost 8125
+done
+
+# Cache metrics
+echo "quickdid.cache.hit.count:45|c|#cache_type:redis" | nc -u -w0 localhost 8125
+echo "quickdid.cache.miss.count:5|c|#cache_type:redis" | nc -u -w0 localhost 8125
+echo "quickdid.cache.size:1024|g|#cache_type:memory" | nc -u -w0 localhost 8125
+
+echo "Metrics sent! Wait 15 seconds for Telegraf to flush..."
+sleep 15
+echo "Done!"
+```
+
+### 2. Verify Queries Script (test-scripts/verify-queries.sql)
+
+Create a SQL script to verify all queries work correctly:
+
+```sql
+-- Test script to verify all metrics queries work correctly
+-- Run this after sending test metrics with send-metrics.sh
+
+\echo '===== CHECKING AVAILABLE TABLES ====='
+SELECT table_name
+FROM information_schema.tables
+WHERE table_schema = 'public'
+  AND table_name LIKE 'quickdid%'
+ORDER BY table_name;
+
+\echo ''
+\echo '===== CHECKING TABLE STRUCTURES ====='
+\echo 'Structure of quickdid.http.request.count table:'
+\d "quickdid.http.request.count"
+
+\echo ''
+\echo 'Structure of quickdid.http.request.duration_ms table:'
+\d "quickdid.http.request.duration_ms"
+
+\echo ''
+\echo '===== QUERY 1: Recent HTTP Request Counts ====='
+SELECT
+    time,
+    tags,
+    tags->>'method' as method,
+    tags->>'path' as path,
+    tags->>'status' as status,
+    value
+FROM "quickdid.http.request.count"
+WHERE time > NOW() - INTERVAL '1 hour'
+ORDER BY time DESC
+LIMIT 10;
+
+\echo ''
+\echo '===== QUERY 2: HTTP Request Duration Statistics by Endpoint ====='
+SELECT
+    time_bucket('1 minute', time) AS minute,
+    tags->>'method' as method,
+    tags->>'path' as path,
+    tags->>'status' as status,
+    COUNT(*) as request_count,
+    AVG(mean) as avg_duration_ms,
+    MAX("99_percentile") as p99_duration_ms,
+    MIN(mean) as min_duration_ms
+FROM "quickdid.http.request.duration_ms"
+WHERE time > NOW() - INTERVAL '1 hour'
+  AND tags IS NOT NULL
+GROUP BY minute, tags->>'method', tags->>'path', tags->>'status'
+ORDER BY minute DESC
+LIMIT 10;
+
+\echo ''
+\echo '===== QUERY 3: Rate Limiter Status Over Time ====='
+SELECT
+    time,
+    value as available_permits
+FROM "quickdid.resolver.rate_limit.available_permits"
+WHERE time > NOW() - INTERVAL '1 hour'
+ORDER BY time DESC
+LIMIT 10;
+
+\echo ''
+\echo '===== QUERY 4: Resolver Performance Comparison ====='
+SELECT
+    tags->>'resolver' as resolver_type,
+    COUNT(*) as sample_count,
+    AVG(mean) as avg_resolution_time_ms,
+    MAX("99_percentile") as p99_resolution_time_ms,
+    MIN(mean) as min_resolution_time_ms
+FROM "quickdid.resolver.resolution_time"
+WHERE time > NOW() - INTERVAL '1 hour'
+  AND tags->>'resolver' IS NOT NULL
+GROUP BY tags->>'resolver'
+ORDER BY avg_resolution_time_ms;
+
+\echo ''
+\echo '===== QUERY 5: Cache Hit Rate Analysis ====='
+WITH cache_stats AS (
+    SELECT
+        'hits' as metric_type,
+        SUM(value) as total_count
+    FROM "quickdid.cache.hit.count"
+    WHERE time > NOW() - INTERVAL '1 hour'
+    UNION ALL
+    SELECT
+        'misses' as metric_type,
+        SUM(value) as total_count
+    FROM "quickdid.cache.miss.count"
+    WHERE time > NOW() - INTERVAL '1 hour'
+)
+SELECT
+    SUM(CASE WHEN metric_type = 'hits' THEN total_count ELSE 0 END) as total_hits,
+    SUM(CASE WHEN metric_type = 'misses' THEN total_count ELSE 0 END) as total_misses,
+    CASE
+        WHEN SUM(total_count) > 0 THEN
+            ROUND(100.0 * SUM(CASE WHEN metric_type = 'hits' THEN total_count ELSE 0 END) / SUM(total_count), 2)
+        ELSE 0
+    END as hit_rate_percentage
+FROM cache_stats;
+
+\echo ''
+\echo '===== QUERY 6: Hypertable Information ====='
+SELECT
+    hypertable_schema,
+    hypertable_name,
+    owner,
+    num_dimensions,
+    num_chunks,
+    compression_enabled
+FROM timescaledb_information.hypertables
+WHERE hypertable_name LIKE 'quickdid%'
+ORDER BY hypertable_name;
+
+\echo ''
+\echo '===== QUERY 7: HTTP Error Rate by Endpoint ====='
+WITH status_counts AS (
+    SELECT
+        time_bucket('5 minutes', time) as period,
+        tags->>'path' as path,
+        CASE
+            WHEN (tags->>'status')::int >= 400 THEN 'error'
+            ELSE 'success'
+        END as status_category,
+        SUM(value) as request_count
+    FROM "quickdid.http.request.count"
+    WHERE time > NOW() - INTERVAL '1 hour'
+    GROUP BY period, path, status_category
+)
+SELECT
+    period,
+    path,
+    SUM(CASE WHEN status_category = 'error' THEN request_count ELSE 0 END) as error_count,
+    SUM(CASE WHEN status_category = 'success' THEN request_count ELSE 0 END) as success_count,
+    CASE
+        WHEN SUM(request_count) > 0 THEN
+            ROUND(100.0 * SUM(CASE WHEN status_category = 'error' THEN request_count ELSE 0 END) / SUM(request_count), 2)
+        ELSE 0
+    END as error_rate_percentage
+FROM status_counts
+GROUP BY period, path
+HAVING SUM(request_count) > 0
+ORDER BY period DESC, error_rate_percentage DESC;
+
+\echo ''
+\echo '===== TEST COMPLETED ====='
 ```
 
 ## Usage
···
 cd metrics-stack
 ```
 
-2. Start the services:
+2. Make the test scripts executable:
+```bash
+chmod +x test-scripts/send-metrics.sh
+```
+
+3. Start the services:
 ```bash
 docker-compose up -d
 ```
 
-3. Check the logs to ensure everything is running:
+4. Check the logs to ensure everything is running:
 ```bash
 docker-compose logs -f
 ```
 
-### Sending Metrics
+5. Wait for services to be fully ready (about 30 seconds)
 
-Send test metrics to Telegraf using StatsD protocol:
+### Running the Test Suite
 
+1. Send test metrics:
 ```bash
-# Send a counter metric
-echo "quickdid.http.request.count:1|c|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
-
-# Send a gauge metric
-echo "quickdid.resolver.rate_limit.available_permits:10|g" | nc -u -w0 localhost 8125
+./test-scripts/send-metrics.sh
+```
 
-# Send a timing metric
-echo "quickdid.http.request.duration_ms:42|ms|#method:POST,path:/api,status:201" | nc -u -w0 localhost 8125
-
-# Send a histogram
-echo "quickdid.resolver.resolution_time:100|h|#resolver:redis" | nc -u -w0 localhost 8125
+2. Verify all queries work:
+```bash
+docker exec -i timescaledb psql -U postgres -d metrics < test-scripts/verify-queries.sql
 ```
 
-### Querying Metrics
+### Manual Querying
 
-Connect to TimescaleDB to query your metrics:
+Connect to TimescaleDB to run queries manually:
 
 ```bash
 # Connect to the database
 docker exec -it timescaledb psql -U postgres -d metrics
 
-# Query recent metrics
+# List all metric tables
+\dt "quickdid*"
+
+# Describe a specific table structure
+\d "quickdid.http.request.duration_ms"
+
+# Query with JSONB tag filtering
 SELECT
     time,
-    tags,
-    value
-FROM "quickdid.http.request.count"
-WHERE time > NOW() - INTERVAL '1 hour'
-ORDER BY time DESC
-LIMIT 10;
-
-# Query specific metric with tags
-SELECT
-    time_bucket('1 minute', time) AS minute,
     tags->>'method' as method,
-    tags->>'path' as path,
-    tags->>'status' as status,
-    AVG(mean::numeric) as avg_duration
+    mean as avg_ms,
+    "99_percentile" as p99_ms
 FROM "quickdid.http.request.duration_ms"
-WHERE
-    time > NOW() - INTERVAL '1 hour'
-GROUP BY minute, method, path, status
-ORDER BY minute DESC;
+WHERE tags @> '{"method": "GET"}'::jsonb
+  AND time > NOW() - INTERVAL '1 hour'
+ORDER BY time DESC
+LIMIT 10;
 ```
 
 ## Advanced Configuration
 
-### Continuous Aggregates
+### Continuous Aggregates for Performance
 
-Create continuous aggregates for better query performance:
+Create continuous aggregates for frequently queried data:
 
 ```sql
--- Create a continuous aggregate for HTTP metrics
+-- Create hourly aggregates for HTTP metrics
 CREATE MATERIALIZED VIEW http_metrics_hourly
 WITH (timescaledb.continuous) AS
 SELECT
     time_bucket('1 hour', time) AS hour,
-    name,
     tags->>'method' as method,
     tags->>'path' as path,
     tags->>'status' as status,
     COUNT(*) as request_count,
-    AVG((fields->>'value')::numeric) as avg_value,
-    MAX((fields->>'value')::numeric) as max_value,
-    MIN((fields->>'value')::numeric) as min_value
-FROM telegraf.metrics
-WHERE name LIKE 'quickdid_http_%'
-GROUP BY hour, name, method, path, status
+    AVG(mean) as avg_duration_ms,
+    MAX("99_percentile") as p99_duration_ms,
+    MIN(mean) as min_duration_ms
+FROM "quickdid.http.request.duration_ms"
+WHERE tags IS NOT NULL
+GROUP BY hour, method, path, status
 WITH NO DATA;
 
 -- Add refresh policy
···
     start_offset => INTERVAL '3 hours',
     end_offset => INTERVAL '1 hour',
     schedule_interval => INTERVAL '1 hour');
+
+-- Manually refresh to populate initial data
+CALL refresh_continuous_aggregate('http_metrics_hourly', NULL, NULL);
+
+-- Query the aggregate
+SELECT * FROM http_metrics_hourly
+ORDER BY hour DESC, request_count DESC
+LIMIT 20;
 ```
 
-## Monitoring and Maintenance
+### Data Retention Policies
 
-### Health Checks
+Set up automatic data retention:
 
-Check the health of your TimescaleDB hypertables:
+```sql
+-- Add retention policy to drop data older than 30 days
+SELECT add_retention_policy('"quickdid.http.request.count"', INTERVAL '30 days');
+SELECT add_retention_policy('"quickdid.http.request.duration_ms"', INTERVAL '30 days');
+
+-- View retention policies
+SELECT js.* FROM timescaledb_information.job_stats js
+JOIN timescaledb_information.jobs j ON js.job_id = j.job_id
+WHERE j.proc_name LIKE '%retention%';
+```
+
+### Compression for Storage Optimization
+
+Enable compression for older data:
 
 ```sql
--- View hypertable information
-SELECT * FROM timescaledb_information.hypertables;
+-- Enable compression on a hypertable
+ALTER TABLE "quickdid.http.request.duration_ms" SET (
+    timescaledb.compress,
+    timescaledb.compress_segmentby = 'tags'
+);
 
--- Check chunk information
+-- Add compression policy (compress chunks older than 7 days)
+SELECT add_compression_policy('"quickdid.http.request.duration_ms"', INTERVAL '7 days');
+
+-- Manually compress old chunks
+SELECT compress_chunk(format('%I.%I', c.chunk_schema, c.chunk_name)::regclass)
+FROM timescaledb_information.chunks c
+WHERE c.hypertable_name = 'quickdid.http.request.duration_ms'
+  AND c.range_end < NOW() - INTERVAL '7 days'
+  AND NOT c.is_compressed;
+
+-- Check compression status (hypertable_compression_stats() is a set-returning function in TimescaleDB 2.x)
+SELECT
+    pg_size_pretty(before_compression_total_bytes) AS before_compression,
+    pg_size_pretty(after_compression_total_bytes) AS after_compression
+FROM hypertable_compression_stats('"quickdid.http.request.duration_ms"');
+```
+
+## Monitoring and Maintenance
+
+### Health Checks
+
+```sql
+-- Check chunk distribution
 SELECT
     hypertable_name,
     chunk_name,
     range_start,
     range_end,
-    is_compressed
+    is_compressed,
+    pg_size_pretty(pg_total_relation_size(format('%I.%I', chunk_schema, chunk_name)::regclass)) AS size
 FROM timescaledb_information.chunks
-WHERE hypertable_name = 'quickdid.http.request.duration_ms'
-ORDER BY range_start DESC
-LIMIT 10;
+WHERE hypertable_name LIKE 'quickdid%'
+ORDER BY hypertable_name, range_start DESC
+LIMIT 20;
 
--- Check compression status
+-- Check background jobs (job metadata lives in jobs, run history in job_stats)
+SELECT
+    j.job_id,
+    j.application_name,
+    j.schedule_interval,
+    js.last_run_started_at,
+    js.last_successful_finish,
+    js.next_start
+FROM timescaledb_information.jobs j
+JOIN timescaledb_information.job_stats js ON js.job_id = j.job_id
+ORDER BY j.job_id;
+
+-- Check table sizes
 SELECT
     hypertable_name,
-    uncompressed_total_bytes,
-    compressed_total_bytes,
-    compression_ratio
-FROM timescaledb_information.hypertable_compression_stats;
+    pg_size_pretty(hypertable_size(format('%I.%I', hypertable_schema, hypertable_name)::regclass)) AS total_size
+FROM timescaledb_information.hypertables
+WHERE hypertable_name LIKE 'quickdid%';
 ```
 
 ### Troubleshooting
 
-1. **Telegraf can't connect to TimescaleDB:**
-   - Check that TimescaleDB is healthy: `docker-compose ps`
-   - Verify credentials in `.env` file
-   - Check network connectivity between containers
+1. **Tables not being created:**
+   - Check Telegraf logs: `docker-compose logs telegraf | grep -i error`
+   - Check that the Telegraf config parses and inputs run: `docker exec telegraf telegraf --test`
+   - Ensure metrics are being received: `docker-compose logs telegraf | grep statsd`
 
-2. **Metrics not appearing in database:**
-   - Check Telegraf logs: `docker-compose logs telegraf`
-   - Verify StatsD metrics are being received: `docker-compose logs telegraf | grep statsd`
-   - Check table creation: Connect to database and run `\dt telegraf.*`
+2. **Queries returning no data:**
+   - Verify tables exist: `\dt "quickdid*"` in psql
+   - Check table contents: `SELECT COUNT(*) FROM "quickdid.http.request.count";`
+   - Verify time ranges in WHERE clauses
 
 3. **Performance issues:**
-   - Check chunk sizes and adjust chunk_time_interval if needed
-   - Verify compression is working
-   - Consider creating appropriate indexes on frequently queried columns
+   - Check if hypertables are created: Query `timescaledb_information.hypertables`
+   - Verify compression is working if enabled
+   - Consider creating appropriate indexes on JSONB paths:
+   ```sql
+   CREATE INDEX idx_http_method ON "quickdid.http.request.duration_ms" ((tags->>'method'));
+   CREATE INDEX idx_http_path ON "quickdid.http.request.duration_ms" ((tags->>'path'));
+   ```
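+
+   If tag filters vary too much for targeted expression indexes, a single GIN index over the whole `tags` column is an alternative (a sketch; `jsonb_path_ops` supports the `@>` containment operator used in the Manual Querying section):
+   ```sql
+   -- Optional: one GIN index serving arbitrary @> containment filters on tags
+   CREATE INDEX idx_http_tags ON "quickdid.http.request.duration_ms"
+       USING GIN (tags jsonb_path_ops);
+   ```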
 
 ## Integration with QuickDID
 
···
 
 QuickDID will automatically send metrics to Telegraf, which will store them in TimescaleDB for analysis.
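+
+A quick sanity check once QuickDID is running (table name assumed from the examples in this guide):
+
+```sql
+-- Confirm QuickDID metrics are arriving
+SELECT COUNT(*) AS rows_last_5min
+FROM "quickdid.http.request.count"
+WHERE time > NOW() - INTERVAL '5 minutes';
+```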
 
-## Backup and Restore
-
-### Backup
+## Key Differences from Generic Metrics Table Approach
 
-Create a backup of your metrics data:
-
-```bash
-# Backup entire database
-docker exec timescaledb pg_dump -U postgres metrics_db > metrics_backup.sql
-
-# Backup specific time range
-docker exec timescaledb pg_dump -U postgres \
-  --table='telegraf.metrics' \
-  --data-only \
-  --where="time >= '2024-01-01' AND time < '2024-02-01'" \
-  metrics_db > metrics_january.sql
-```
-
-### Restore
+This configuration creates **individual tables per metric** instead of a single generic metrics table. Benefits include:
 
-Restore from backup:
+1. **Better performance**: Each metric has its own optimized schema
+2. **Clearer data model**: Tables directly represent metrics
+3. **Easier querying**: No need to filter by metric name (illustrated below)
+4. **Type safety**: Each metric's fields have appropriate types
+5. **Efficient compression**: Per-metric compression strategies
 
-```bash
-# Restore database
-docker exec -i timescaledb psql -U postgres metrics_db < metrics_backup.sql
-```
+Trade-offs:
+- More tables to manage (mitigated by TimescaleDB automation)
+- Need to know metric names upfront for queries
+- Schema changes require table alterations
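+
+To make the querying benefit concrete, here is the same lookup against a hypothetical single generic `metrics` table versus the per-metric tables this setup creates (the generic schema is illustrative only, not something this configuration builds):
+
+```sql
+-- Hypothetical generic table: every query must filter by metric name
+SELECT time, tags->>'method' AS method, value
+FROM metrics
+WHERE name = 'quickdid.http.request.count'
+  AND time > NOW() - INTERVAL '1 hour';
+
+-- Per-metric table (this setup): the metric name is the table itself
+SELECT time, tags->>'method' AS method, value
+FROM "quickdid.http.request.count"
+WHERE time > NOW() - INTERVAL '1 hour';
+```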
 
 ## Security Considerations
 
 1. **Use strong passwords:** Update the default passwords in `.env`
 2. **Enable SSL:** Configure `sslmode=require` in production
 3. **Network isolation:** Use Docker networks to isolate services
-4. **Access control:** Limit database user permissions to minimum required
+4. **Access control:** Create separate database users with minimal permissions:
+   ```sql
+   CREATE USER metrics_reader WITH PASSWORD 'readonly_password';
+   GRANT CONNECT ON DATABASE metrics TO metrics_reader;
+   GRANT USAGE ON SCHEMA public TO metrics_reader;
+   GRANT SELECT ON ALL TABLES IN SCHEMA public TO metrics_reader;
+   ```
 5. **Regular updates:** Keep Docker images updated for security patches
 
 ## Performance Tuning
 
-### TimescaleDB Configuration
+### PostgreSQL/TimescaleDB Settings
 
-Add these settings to optimize performance:
+The docker-compose.yml includes optimized settings. Adjust based on your hardware (a worked example follows the list):
 
-```yaml
-# In docker-compose.yml under timescaledb service
-command:
-  - postgres
-  - -c
-  - shared_buffers=1GB
-  - -c
-  - effective_cache_size=3GB
-  - -c
-  - maintenance_work_mem=512MB
-  - -c
-  - work_mem=32MB
-  - -c
-  - timescaledb.max_background_workers=8
-  - -c
-  - max_parallel_workers_per_gather=2
-  - -c
-  - max_parallel_workers=8
-```
+- `shared_buffers`: 25% of system RAM
+- `effective_cache_size`: 75% of system RAM
+- `maintenance_work_mem`: 5% of system RAM
+- `work_mem`: RAM / max_connections / 2
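+
+As a worked example (a sketch; assumes a dedicated 16 GB host and the default `max_connections = 100`), these rules can also be applied at runtime with `ALTER SYSTEM`:
+
+```sql
+ALTER SYSTEM SET shared_buffers = '4GB';          -- 25% of 16 GB
+ALTER SYSTEM SET effective_cache_size = '12GB';   -- 75% of 16 GB
+ALTER SYSTEM SET maintenance_work_mem = '800MB';  -- ~5% of 16 GB
+ALTER SYSTEM SET work_mem = '80MB';               -- 16 GB / 100 connections / 2
+SELECT pg_reload_conf();  -- note: shared_buffers only takes effect after a restart
+```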
 
 ### Telegraf Buffer Settings
 
-Adjust buffer settings based on your metric volume:
+For high-volume metrics, adjust in telegraf.conf:
 
 ```toml
 [agent]
-  metric_batch_size = 5000 # Increase for high volume
+  metric_batch_size = 5000      # Increase for high volume
   metric_buffer_limit = 100000  # Increase buffer size
-  flush_interval = "5s" # Decrease for more frequent writes
+  flush_interval = "5s"         # Decrease for more frequent writes
 ```
 
 ## Conclusion
 
-This setup provides a robust metrics collection and storage solution using:
-- **Telegraf** for flexible metric collection via StatsD
-- **TimescaleDB** for efficient time-series data storage
-- **Docker Compose** for easy deployment and management
+This setup provides a robust metrics collection and storage solution with:
+- **Individual metric tables** for optimal performance and clarity
+- **JSONB tag storage** for flexible querying
+- **TimescaleDB hypertables** for efficient time-series storage
+- **Comprehensive test suite** to verify functionality
+- **Production-ready configuration** with compression and retention policies
 
-The system is production-ready with support for compression, retention policies, and continuous aggregates for optimal performance at scale.
+The system correctly handles StatsD metrics from QuickDID and provides powerful querying capabilities through PostgreSQL's JSONB support and TimescaleDB's time-series functions.
test-scripts/docker-test.sh (+364 -0)
···
+#!/bin/bash
+
+# Comprehensive test script for Telegraf/TimescaleDB metrics setup
+# This script validates the entire metrics pipeline
+
+set -e
+
+echo "========================================="
+echo "Telegraf/TimescaleDB Metrics Test Suite"
+echo "========================================="
+echo ""
+
+# Check if Docker is running
+if ! docker info > /dev/null 2>&1; then
+    echo "❌ Docker is not running. Please start Docker first."
+    exit 1
+fi
+
+# Function to wait for a service to be healthy
+wait_for_service() {
+    local service=$1
+    local max_attempts=30
+    local attempt=1
+
+    echo -n "Waiting for $service to be healthy"
+    while [ $attempt -le $max_attempts ]; do
+        if docker-compose ps $service | grep -q "healthy"; then
+            echo " ✅"
+            return 0
+        fi
+        echo -n "."
+        sleep 2
+        attempt=$((attempt + 1))
+    done
+    echo " ❌"
+    echo "Service $service failed to become healthy after $max_attempts attempts"
+    return 1
+}
+
+# Function to run SQL query
+run_query() {
+    docker exec -i timescaledb psql -U postgres -d metrics -t -c "$1" 2>/dev/null
+}
+
+# Function to check table exists
+check_table() {
+    local table=$1
+    local result=$(run_query "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = '$table');")
+    if [[ "$result" =~ "t" ]]; then
+        echo "✅ Table '$table' exists"
+        return 0
+    else
+        echo "❌ Table '$table' does not exist"
+        return 1
+    fi
+}
+
+# Navigate to the metrics-stack directory (create if needed)
+if [ ! -d "metrics-stack" ]; then
+    echo "Creating metrics-stack directory..."
+    mkdir -p metrics-stack/telegraf
+    mkdir -p metrics-stack/test-scripts
+    mkdir -p metrics-stack/init-scripts
+fi
+
+cd metrics-stack
+
+# Create .env file if it doesn't exist
+if [ ! -f ".env" ]; then
+    echo "Creating .env file..."
+    cat > .env << 'EOF'
+# PostgreSQL/TimescaleDB Configuration
+POSTGRES_DB=metrics
+POSTGRES_USER=postgres
+POSTGRES_PASSWORD=secretpassword
+
+# Telegraf Database User
+TELEGRAF_DB_USER=postgres
+TELEGRAF_DB_PASSWORD=secretpassword
+
+# TimescaleDB Settings
+TIMESCALE_TELEMETRY=off
+EOF
+fi
+
+# Copy configuration files if they don't exist
+if [ ! -f "telegraf/telegraf.conf" ]; then
+    echo "Creating telegraf.conf..."
+    cat > telegraf/telegraf.conf << 'EOF'
+[agent]
+  interval = "10s"
+  round_interval = true
+  metric_batch_size = 1000
+  metric_buffer_limit = 10000
+  collection_jitter = "0s"
+  flush_interval = "10s"
+  flush_jitter = "0s"
+  precision = ""
+  debug = false
+  quiet = false
+  hostname = "telegraf-agent"
+  omit_hostname = false
+
+[[inputs.statsd]]
+  service_address = ":8125"
+  protocol = "udp"
+  delete_gauges = true
+  delete_counters = true
+  delete_sets = true
+  delete_timings = true
+  percentiles = [50, 90, 95, 99]
+  metric_separator = "."
+  allowed_pending_messages = 10000
+  datadog_extensions = true
+  datadog_distributions = true
+
+[[outputs.postgresql]]
+  connection = "host=timescaledb user=${TELEGRAF_DB_USER} password=${TELEGRAF_DB_PASSWORD} dbname=${POSTGRES_DB} sslmode=disable"
+  schema = "public"
+  create_templates = [
+    '''CREATE TABLE IF NOT EXISTS {{.table}} ({{.columns}})''',
+    '''SELECT create_hypertable({{.table|quoteLiteral}}, 'time', if_not_exists => TRUE)''',
+  ]
+  tags_as_jsonb = true
+  fields_as_jsonb = false
+EOF
+fi
+
+# Copy docker-compose.yml if it doesn't exist
+if [ ! -f "docker-compose.yml" ]; then
+    echo "Creating docker-compose.yml..."
+    cat > docker-compose.yml << 'EOF'
+version: '3.8'
+
+services:
+  timescaledb:
+    image: timescale/timescaledb:latest-pg17
+    container_name: timescaledb
+    restart: unless-stopped
+    environment:
+      POSTGRES_DB: ${POSTGRES_DB}
+      POSTGRES_USER: ${POSTGRES_USER}
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
+      TIMESCALE_TELEMETRY: ${TIMESCALE_TELEMETRY}
+    ports:
+      - "5442:5432"
+    volumes:
+      - timescale_data:/var/lib/postgresql/data
+      - ./init-scripts:/docker-entrypoint-initdb.d:ro
+    command:
+      - postgres
+      - -c
+      - shared_buffers=256MB
+      - -c
+      - effective_cache_size=1GB
+      - -c
+      - maintenance_work_mem=64MB
+      - -c
+      - work_mem=8MB
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    networks:
+      - metrics_network
+
+  telegraf:
+    image: telegraf:1.35
+    container_name: telegraf
+    restart: unless-stopped
+    environment:
+      TELEGRAF_DB_USER: ${TELEGRAF_DB_USER}
+      TELEGRAF_DB_PASSWORD: ${TELEGRAF_DB_PASSWORD}
+      POSTGRES_DB: ${POSTGRES_DB}
+    ports:
+      - "8125:8125/udp"
+    volumes:
+      - ./telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro
+    depends_on:
+      timescaledb:
+        condition: service_healthy
+    networks:
+      - metrics_network
+    command: ["telegraf", "--config", "/etc/telegraf/telegraf.conf"]
+
+networks:
+  metrics_network:
+    driver: bridge
+
+volumes:
+  timescale_data:
+EOF
+fi
+
+# Create init script
+if [ ! -f "init-scripts/01-init.sql" ]; then
+    echo "Creating init script..."
+    cat > init-scripts/01-init.sql << 'EOF'
+-- Enable TimescaleDB extension
+CREATE EXTENSION IF NOT EXISTS timescaledb;
+CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
+EOF
+fi
+
+echo ""
+echo "Step 1: Starting Docker services..."
+echo "========================================="
+docker-compose down -v 2>/dev/null || true
+docker-compose up -d
+
+echo ""
+echo "Step 2: Waiting for services to be healthy..."
+echo "========================================="
+wait_for_service timescaledb
+sleep 5  # Extra time for Telegraf to connect
+
+echo ""
+echo "Step 3: Sending test metrics..."
+echo "========================================="
+
+# Send various types of metrics
+echo "Sending counter metrics..."
+for i in {1..5}; do
+    echo "quickdid.http.request.count:1|c|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
+    echo "quickdid.http.request.count:1|c|#method:POST,path:/api,status:201" | nc -u -w0 localhost 8125
+done
+
+echo "Sending gauge metrics..."
+echo "quickdid.resolver.rate_limit.available_permits:10|g" | nc -u -w0 localhost 8125
+sleep 1
+echo "quickdid.resolver.rate_limit.available_permits:5|g" | nc -u -w0 localhost 8125
+
+echo "Sending timing metrics..."
+for i in {1..10}; do
+    duration=$((RANDOM % 100 + 10))
+    echo "quickdid.http.request.duration_ms:${duration}|ms|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
+done
+
+echo "Sending histogram metrics..."
+for i in {1..5}; do
+    resolution_time=$((RANDOM % 500 + 50))
+    echo "quickdid.resolver.resolution_time:${resolution_time}|h|#resolver:redis" | nc -u -w0 localhost 8125
+done
+
+echo "Waiting 15 seconds for Telegraf to flush metrics..."
+sleep 15
+
+echo ""
+echo "Step 4: Verifying table creation..."
+echo "========================================="
+
+# Check if tables were created ( || true keeps `set -e` from aborting on a missing table)
+check_table "quickdid.http.request.count" || true
+check_table "quickdid.http.request.duration_ms" || true
+check_table "quickdid.resolver.rate_limit.available_permits" || true
+check_table "quickdid.resolver.resolution_time" || true
+
+echo ""
+echo "Step 5: Verifying data insertion..."
+echo "========================================="
+
+# Check row counts
+for table in "quickdid.http.request.count" "quickdid.http.request.duration_ms" "quickdid.resolver.rate_limit.available_permits" "quickdid.resolver.resolution_time"; do
+    count=$(run_query "SELECT COUNT(*) FROM \"$table\";" | tr -d ' ')
+    if [ "$count" -gt 0 ]; then
+        echo "✅ Table '$table' has $count rows"
+    else
+        echo "❌ Table '$table' is empty"
+    fi
+done
+
+echo ""
+echo "Step 6: Testing JSONB tag queries..."
+echo "========================================="
+
+# Test JSONB tag filtering
+result=$(run_query "SELECT COUNT(*) FROM \"quickdid.http.request.count\" WHERE tags->>'method' = 'GET';" | tr -d ' ')
+if [ "$result" -gt 0 ]; then
+    echo "✅ JSONB tag filtering works (found $result GET requests)"
+else
+    echo "❌ JSONB tag filtering failed"
+fi
+
+echo ""
+echo "Step 7: Testing TimescaleDB functions..."
+echo "========================================="
+
+# Test time_bucket function
+result=$(run_query "SELECT COUNT(*) FROM (SELECT time_bucket('1 minute', time) FROM \"quickdid.http.request.count\" GROUP BY 1) t;" | tr -d ' ')
+if [ "$result" -gt 0 ]; then
+    echo "✅ time_bucket function works"
+else
+    echo "❌ time_bucket function failed"
+fi
+
+# Check if hypertables were created
+hypertable_count=$(run_query "SELECT COUNT(*) FROM timescaledb_information.hypertables WHERE hypertable_name LIKE 'quickdid%';" | tr -d ' ')
+if [ "$hypertable_count" -gt 0 ]; then
+    echo "✅ Found $hypertable_count hypertables"
+else
+    echo "❌ No hypertables found"
+fi
+
+echo ""
+echo "Step 8: Running comprehensive query tests..."
+echo "========================================="
+
+# Run the verify-queries.sql script if it exists
+if [ -f "../test-scripts/verify-queries.sql" ]; then
+    echo "Running verify-queries.sql..."
+    # Run inside the if-condition so a failure doesn't trip `set -e`
+    if docker exec -i timescaledb psql -U postgres -d metrics < ../test-scripts/verify-queries.sql > query_results.txt 2>&1; then
+        echo "✅ All queries executed successfully"
+        echo "   Results saved to query_results.txt"
+    else
+        echo "❌ Some queries failed. Check query_results.txt for details"
+    fi
+else
+    echo "⚠️ verify-queries.sql not found, skipping comprehensive query tests"
+fi
+
+echo ""
+echo "========================================="
+echo "Test Summary"
+echo "========================================="
+
+# Generate summary
+failures=0
+successes=0
+
+# Count successes and failures from the output
+if check_table "quickdid.http.request.count" > /dev/null 2>&1; then
+    successes=$((successes + 1))
+else
+    failures=$((failures + 1))
+fi
+
+if [ "$hypertable_count" -gt 0 ]; then
+    successes=$((successes + 1))
+else
+    failures=$((failures + 1))
+fi
+
+echo ""
+if [ $failures -eq 0 ]; then
+    echo "✅ All tests passed successfully!"
+    echo ""
+    echo "You can now:"
+    echo "1. Connect to the database: docker exec -it timescaledb psql -U postgres -d metrics"
+    echo "2. View logs: docker-compose logs -f"
+    echo "3. Send more metrics: echo 'metric.name:value|type|#tag:value' | nc -u -w0 localhost 8125"
+    echo "4. Stop services: docker-compose down"
+else
+    echo "⚠️ Some tests failed. Please check the output above for details."
+    echo ""
+    echo "Troubleshooting tips:"
+    echo "1. Check Telegraf logs: docker-compose logs telegraf"
+    echo "2. Check TimescaleDB logs: docker-compose logs timescaledb"
+    echo "3. Verify connectivity: docker exec telegraf telegraf --test"
+fi
+
+echo ""
+echo "Test complete!"
test-scripts/send-metrics.sh (+44 -0)
···
+#!/bin/bash
+
+# Send test metrics to StatsD/Telegraf
+
+echo "Sending test metrics to StatsD on localhost:8125..."
+
+# Counter metrics
+for i in {1..10}; do
+    echo "quickdid.http.request.count:1|c|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
+    echo "quickdid.http.request.count:1|c|#method:POST,path:/api,status:201" | nc -u -w0 localhost 8125
+    echo "quickdid.http.request.count:1|c|#method:GET,path:/resolve,status:404" | nc -u -w0 localhost 8125
+done
+
+# Gauge metrics
+echo "quickdid.resolver.rate_limit.available_permits:10|g" | nc -u -w0 localhost 8125
+echo "quickdid.resolver.rate_limit.available_permits:8|g" | nc -u -w0 localhost 8125
+echo "quickdid.resolver.rate_limit.available_permits:5|g" | nc -u -w0 localhost 8125
+
+# Timing metrics (in milliseconds)
+for i in {1..20}; do
+    duration=$((RANDOM % 100 + 10))
+    echo "quickdid.http.request.duration_ms:${duration}|ms|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
+done
+
+for i in {1..10}; do
+    duration=$((RANDOM % 200 + 50))
+    echo "quickdid.http.request.duration_ms:${duration}|ms|#method:POST,path:/api,status:201" | nc -u -w0 localhost 8125
+done
+
+# Histogram metrics
+for i in {1..15}; do
+    resolution_time=$((RANDOM % 500 + 50))
+    echo "quickdid.resolver.resolution_time:${resolution_time}|h|#resolver:redis" | nc -u -w0 localhost 8125
+    echo "quickdid.resolver.resolution_time:$((resolution_time * 2))|h|#resolver:base" | nc -u -w0 localhost 8125
+done
+
+# Cache metrics
+echo "quickdid.cache.hit.count:45|c|#cache_type:redis" | nc -u -w0 localhost 8125
+echo "quickdid.cache.miss.count:5|c|#cache_type:redis" | nc -u -w0 localhost 8125
+echo "quickdid.cache.size:1024|g|#cache_type:memory" | nc -u -w0 localhost 8125
+
+echo "Metrics sent! Wait 15 seconds for Telegraf to flush..."
+sleep 15
+echo "Done!"
test-scripts/verify-queries.sql (+145 -0)
···
+-- Test script to verify all metrics queries work correctly
+-- Run this after sending test metrics with send-metrics.sh
+
+\echo '===== CHECKING AVAILABLE TABLES ====='
+SELECT table_name
+FROM information_schema.tables
+WHERE table_schema = 'public'
+  AND table_name LIKE 'quickdid%'
+ORDER BY table_name;
+
+\echo ''
+\echo '===== CHECKING TABLE STRUCTURES ====='
+\echo 'Structure of quickdid.http.request.count table:'
+\d "quickdid.http.request.count"
+
+\echo ''
+\echo 'Structure of quickdid.http.request.duration_ms table:'
+\d "quickdid.http.request.duration_ms"
+
+\echo ''
+\echo '===== QUERY 1: Recent HTTP Request Counts ====='
+SELECT
+    time,
+    tags,
+    tags->>'method' as method,
+    tags->>'path' as path,
+    tags->>'status' as status,
+    value
+FROM "quickdid.http.request.count"
+WHERE time > NOW() - INTERVAL '1 hour'
+ORDER BY time DESC
+LIMIT 10;
+
+\echo ''
+\echo '===== QUERY 2: HTTP Request Duration Statistics by Endpoint ====='
+SELECT
+    time_bucket('1 minute', time) AS minute,
+    tags->>'method' as method,
+    tags->>'path' as path,
+    tags->>'status' as status,
+    COUNT(*) as request_count,
+    AVG(mean) as avg_duration_ms,
+    MAX("99_percentile") as p99_duration_ms,
+    MIN(mean) as min_duration_ms
+FROM "quickdid.http.request.duration_ms"
+WHERE time > NOW() - INTERVAL '1 hour'
+  AND tags IS NOT NULL
+GROUP BY minute, tags->>'method', tags->>'path', tags->>'status'
+ORDER BY minute DESC
+LIMIT 10;
+
+\echo ''
+\echo '===== QUERY 3: Rate Limiter Status Over Time ====='
+SELECT
+    time,
+    value as available_permits
+FROM "quickdid.resolver.rate_limit.available_permits"
+WHERE time > NOW() - INTERVAL '1 hour'
+ORDER BY time DESC
+LIMIT 10;
+
+\echo ''
+\echo '===== QUERY 4: Resolver Performance Comparison ====='
+SELECT
+    tags->>'resolver' as resolver_type,
+    COUNT(*) as sample_count,
+    AVG(mean) as avg_resolution_time_ms,
+    MAX("99_percentile") as p99_resolution_time_ms,
+    MIN(mean) as min_resolution_time_ms
+FROM "quickdid.resolver.resolution_time"
+WHERE time > NOW() - INTERVAL '1 hour'
+  AND tags->>'resolver' IS NOT NULL
+GROUP BY tags->>'resolver'
+ORDER BY avg_resolution_time_ms;
+
+\echo ''
+\echo '===== QUERY 5: Cache Hit Rate Analysis ====='
+WITH cache_stats AS (
+    SELECT
+        'hits' as metric_type,
+        SUM(value) as total_count
+    FROM "quickdid.cache.hit.count"
+    WHERE time > NOW() - INTERVAL '1 hour'
+    UNION ALL
+    SELECT
+        'misses' as metric_type,
+        SUM(value) as total_count
+    FROM "quickdid.cache.miss.count"
+    WHERE time > NOW() - INTERVAL '1 hour'
+)
+SELECT
+    SUM(CASE WHEN metric_type = 'hits' THEN total_count ELSE 0 END) as total_hits,
+    SUM(CASE WHEN metric_type = 'misses' THEN total_count ELSE 0 END) as total_misses,
+    CASE
+        WHEN SUM(total_count) > 0 THEN
+            ROUND(100.0 * SUM(CASE WHEN metric_type = 'hits' THEN total_count ELSE 0 END) / SUM(total_count), 2)
+        ELSE 0
+    END as hit_rate_percentage
+FROM cache_stats;
+
+\echo ''
+\echo '===== QUERY 6: Hypertable Information ====='
+SELECT
+    hypertable_schema,
+    hypertable_name,
+    owner,
+    num_dimensions,
+    num_chunks,
+    compression_enabled
+FROM timescaledb_information.hypertables
+WHERE hypertable_name LIKE 'quickdid%'
+ORDER BY hypertable_name;
+
+\echo ''
+\echo '===== QUERY 7: HTTP Error Rate by Endpoint ====='
+WITH status_counts AS (
+    SELECT
+        time_bucket('5 minutes', time) as period,
+        tags->>'path' as path,
+        CASE
+            WHEN (tags->>'status')::int >= 400 THEN 'error'
+            ELSE 'success'
+        END as status_category,
+        SUM(value) as request_count
+    FROM "quickdid.http.request.count"
+    WHERE time > NOW() - INTERVAL '1 hour'
+    GROUP BY period, path, status_category
+)
+SELECT
+    period,
+    path,
+    SUM(CASE WHEN status_category = 'error' THEN request_count ELSE 0 END) as error_count,
+    SUM(CASE WHEN status_category = 'success' THEN request_count ELSE 0 END) as success_count,
+    CASE
+        WHEN SUM(request_count) > 0 THEN
+            ROUND(100.0 * SUM(CASE WHEN status_category = 'error' THEN request_count ELSE 0 END) / SUM(request_count), 2)
+        ELSE 0
+    END as error_rate_percentage
+FROM status_counts
+GROUP BY period, path
+HAVING SUM(request_count) > 0
+ORDER BY period DESC, error_rate_percentage DESC;
+
+\echo ''
+\echo '===== TEST COMPLETED ====='