docs/telegraf-timescaledb-metrics-guide.md (+416 -128)
···
 
 This setup creates a metrics pipeline that:
 - Collects StatsD metrics via Telegraf on UDP port 8125
-- Stores metrics in PostgreSQL with TimescaleDB extensions
+- Creates individual PostgreSQL tables for each metric type
+- Stores metric tags as JSONB for flexible querying
 - Automatically creates hypertables for time-series optimization
 - Provides a complete Docker Compose configuration for easy deployment
 
+## Important Note on Table Structure
+
+The Telegraf PostgreSQL output plugin with the configuration in this guide creates **individual tables for each metric name**. For example:
+- `quickdid.http.request.count` becomes table `"quickdid.http.request.count"`
+- `quickdid.resolver.rate_limit.available_permits` becomes table `"quickdid.resolver.rate_limit.available_permits"`
+
+Each table has the following structure:
+- `time` (timestamptz) - the timestamp of the metric
+- `tags` (jsonb) - all tags stored as a single JSON object
+- metric-specific value columns (e.g., `value`, `mean`, `99_percentile`)
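+
+As a rough illustration (a sketch, not the plugin's verbatim output; the exact value columns and types depend on which fields Telegraf aggregates for each metric type), the DDL rendered from the `create_templates` shown later in telegraf.conf looks approximately like:
+
+```sql
+-- Illustrative only: approximate shape of an auto-generated counter table
+CREATE TABLE IF NOT EXISTS "quickdid.http.request.count" (
+    "time" timestamptz,
+    "tags" jsonb,
+    "value" bigint  -- counters emit a single numeric field
+);
+SELECT create_hypertable('"quickdid.http.request.count"', 'time', if_not_exists => TRUE);
+```
+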
 ## Prerequisites
 
 - Docker and Docker Compose installed
···
 ├── docker-compose.yml
 ├── telegraf/
 │   └── telegraf.conf
+├── test-scripts/
+│   ├── send-metrics.sh
+│   └── verify-queries.sql
 └── .env
 ```
 
···
 [[outputs.postgresql]]
   connection = "host=timescaledb user=${TELEGRAF_DB_USER} password=${TELEGRAF_DB_PASSWORD} dbname=${POSTGRES_DB} sslmode=disable"
   schema = "public"
+
+  # Create individual tables for each metric, with hypertable support
   create_templates = [
     '''CREATE TABLE IF NOT EXISTS {{.table}} ({{.columns}})''',
     '''SELECT create_hypertable({{.table|quoteLiteral}}, 'time', if_not_exists => TRUE)''',
   ]
+
+  # Store all tags as JSONB for flexible querying
   tags_as_jsonb = true
-
+
+  # Keep fields as separate columns for better performance on aggregations
   fields_as_jsonb = false
 ```
 
 ### 3. Docker Compose Configuration (docker-compose.yml)
···
 
 services:
   timescaledb:
-    image: timescale/timescaledb-ha:pg17
+    image: timescale/timescaledb:latest-pg17
     container_name: timescaledb
     restart: unless-stopped
     environment:
     volumes:
-      - timescale_data:/home/postgres/pgdata/data
+      - timescale_data:/var/lib/postgresql/data
       - ./init-scripts:/docker-entrypoint-initdb.d:ro
+    command:
+      - postgres
+      - -c
+      - shared_buffers=1GB
+      - -c
+      - effective_cache_size=3GB
+      - -c
+      - maintenance_work_mem=512MB
+      - -c
+      - work_mem=32MB
+      - -c
+      - timescaledb.max_background_workers=8
+      - -c
+      - max_parallel_workers_per_gather=2
+      - -c
+      - max_parallel_workers=8
     healthcheck:
       test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}"]
       interval: 10s
···
       - metrics_network
 
   telegraf:
-    image: telegraf
+    image: telegraf:1.35
     container_name: telegraf
     restart: unless-stopped
     environment:
···
 
 ### 4. Database Initialization Script (optional)
 
-Create `init-scripts/01-init.sql` to set up the Telegraf user and schema:
+Create `init-scripts/01-init.sql` to set up the TimescaleDB extension:
 
 ```sql
 -- Enable TimescaleDB extension
 CREATE EXTENSION IF NOT EXISTS timescaledb;
+
+-- Enable additional useful extensions
+CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
+```
+
+## Test Scripts
+
+### 1. Send Test Metrics Script (test-scripts/send-metrics.sh)
+
+Create a script to send various types of metrics. Tags are attached with the DogStatsD-style `|#key:value` suffix, which the `datadog_extensions` setting in telegraf.conf enables:
+
+```bash
+#!/bin/bash
+
+# Send test metrics to StatsD/Telegraf
+
+echo "Sending test metrics to StatsD on localhost:8125..."
+
+# Counter metrics
+for i in {1..10}; do
+    echo "quickdid.http.request.count:1|c|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
+    echo "quickdid.http.request.count:1|c|#method:POST,path:/api,status:201" | nc -u -w0 localhost 8125
+    echo "quickdid.http.request.count:1|c|#method:GET,path:/resolve,status:404" | nc -u -w0 localhost 8125
+done
+
+# Gauge metrics
+echo "quickdid.resolver.rate_limit.available_permits:10|g" | nc -u -w0 localhost 8125
+echo "quickdid.resolver.rate_limit.available_permits:8|g" | nc -u -w0 localhost 8125
+echo "quickdid.resolver.rate_limit.available_permits:5|g" | nc -u -w0 localhost 8125
+
+# Timing metrics (in milliseconds)
+for i in {1..20}; do
+    duration=$((RANDOM % 100 + 10))
+    echo "quickdid.http.request.duration_ms:${duration}|ms|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
+done
+
+for i in {1..10}; do
+    duration=$((RANDOM % 200 + 50))
+    echo "quickdid.http.request.duration_ms:${duration}|ms|#method:POST,path:/api,status:201" | nc -u -w0 localhost 8125
+done
+
+# Histogram metrics
+for i in {1..15}; do
+    resolution_time=$((RANDOM % 500 + 50))
+    echo "quickdid.resolver.resolution_time:${resolution_time}|h|#resolver:redis" | nc -u -w0 localhost 8125
+    echo "quickdid.resolver.resolution_time:$((resolution_time * 2))|h|#resolver:base" | nc -u -w0 localhost 8125
+done
+
+# Cache metrics
+echo "quickdid.cache.hit.count:45|c|#cache_type:redis" | nc -u -w0 localhost 8125
+echo "quickdid.cache.miss.count:5|c|#cache_type:redis" | nc -u -w0 localhost 8125
+echo "quickdid.cache.size:1024|g|#cache_type:memory" | nc -u -w0 localhost 8125
+
+echo "Metrics sent! Wait 15 seconds for Telegraf to flush..."
+sleep 15
+echo "Done!"
+```
+
+### 2. Verify Queries Script (test-scripts/verify-queries.sql)
+
+Create a SQL script to verify all queries work correctly:
+
+```sql
+-- Test script to verify all metrics queries work correctly
+-- Run this after sending test metrics with send-metrics.sh
+
+\echo '===== CHECKING AVAILABLE TABLES ====='
+SELECT table_name
+FROM information_schema.tables
+WHERE table_schema = 'public'
+  AND table_name LIKE 'quickdid%'
+ORDER BY table_name;
+
+\echo ''
+\echo '===== CHECKING TABLE STRUCTURES ====='
+\echo 'Structure of quickdid.http.request.count table:'
+\d "quickdid.http.request.count"
+
+\echo ''
+\echo 'Structure of quickdid.http.request.duration_ms table:'
+\d "quickdid.http.request.duration_ms"
+
+\echo ''
+\echo '===== QUERY 1: Recent HTTP Request Counts ====='
+SELECT
+    time,
+    tags,
+    tags->>'method' as method,
+    tags->>'path' as path,
+    tags->>'status' as status,
+    value
+FROM "quickdid.http.request.count"
+WHERE time > NOW() - INTERVAL '1 hour'
+ORDER BY time DESC
+LIMIT 10;
+
+\echo ''
+\echo '===== QUERY 2: HTTP Request Duration Statistics by Endpoint ====='
+SELECT
+    time_bucket('1 minute', time) AS minute,
+    tags->>'method' as method,
+    tags->>'path' as path,
+    tags->>'status' as status,
+    COUNT(*) as request_count,
+    AVG(mean) as avg_duration_ms,
+    MAX("99_percentile") as p99_duration_ms,
+    MIN(mean) as min_duration_ms
+FROM "quickdid.http.request.duration_ms"
+WHERE time > NOW() - INTERVAL '1 hour'
+  AND tags IS NOT NULL
+GROUP BY minute, tags->>'method', tags->>'path', tags->>'status'
+ORDER BY minute DESC
+LIMIT 10;
+
+\echo ''
+\echo '===== QUERY 3: Rate Limiter Status Over Time ====='
+SELECT
+    time,
+    value as available_permits
+FROM "quickdid.resolver.rate_limit.available_permits"
+WHERE time > NOW() - INTERVAL '1 hour'
+ORDER BY time DESC
+LIMIT 10;
+
+\echo ''
+\echo '===== QUERY 4: Resolver Performance Comparison ====='
+SELECT
+    tags->>'resolver' as resolver_type,
+    COUNT(*) as sample_count,
+    AVG(mean) as avg_resolution_time_ms,
+    MAX("99_percentile") as p99_resolution_time_ms,
+    MIN(mean) as min_resolution_time_ms
+FROM "quickdid.resolver.resolution_time"
+WHERE time > NOW() - INTERVAL '1 hour'
+  AND tags->>'resolver' IS NOT NULL
+GROUP BY tags->>'resolver'
+ORDER BY avg_resolution_time_ms;
+
+\echo ''
+\echo '===== QUERY 5: Cache Hit Rate Analysis ====='
+WITH cache_stats AS (
+    SELECT
+        'hits' as metric_type,
+        SUM(value) as total_count
+    FROM "quickdid.cache.hit.count"
+    WHERE time > NOW() - INTERVAL '1 hour'
+    UNION ALL
+    SELECT
+        'misses' as metric_type,
+        SUM(value) as total_count
+    FROM "quickdid.cache.miss.count"
+    WHERE time > NOW() - INTERVAL '1 hour'
+)
+SELECT
+    SUM(CASE WHEN metric_type = 'hits' THEN total_count ELSE 0 END) as total_hits,
+    SUM(CASE WHEN metric_type = 'misses' THEN total_count ELSE 0 END) as total_misses,
+    CASE
+        WHEN SUM(total_count) > 0 THEN
+            ROUND(100.0 * SUM(CASE WHEN metric_type = 'hits' THEN total_count ELSE 0 END) / SUM(total_count), 2)
+        ELSE 0
+    END as hit_rate_percentage
+FROM cache_stats;
+
+\echo ''
+\echo '===== QUERY 6: Hypertable Information ====='
+SELECT
+    hypertable_schema,
+    hypertable_name,
+    owner,
+    num_dimensions,
+    num_chunks,
+    compression_enabled
+FROM timescaledb_information.hypertables
+WHERE hypertable_name LIKE 'quickdid%'
+ORDER BY hypertable_name;
+
+\echo ''
+\echo '===== QUERY 7: HTTP Error Rate by Endpoint ====='
+WITH status_counts AS (
+    SELECT
+        time_bucket('5 minutes', time) as period,
+        tags->>'path' as path,
+        CASE
+            WHEN (tags->>'status')::int >= 400 THEN 'error'
+            ELSE 'success'
+        END as status_category,
+        SUM(value) as request_count
+    FROM "quickdid.http.request.count"
+    WHERE time > NOW() - INTERVAL '1 hour'
+    GROUP BY period, path, status_category
+)
+SELECT
+    period,
+    path,
+    SUM(CASE WHEN status_category = 'error' THEN request_count ELSE 0 END) as error_count,
+    SUM(CASE WHEN status_category = 'success' THEN request_count ELSE 0 END) as success_count,
+    CASE
+        WHEN SUM(request_count) > 0 THEN
+            ROUND(100.0 * SUM(CASE WHEN status_category = 'error' THEN request_count ELSE 0 END) / SUM(request_count), 2)
+        ELSE 0
+    END as error_rate_percentage
+FROM status_counts
+GROUP BY period, path
+HAVING SUM(request_count) > 0
+ORDER BY period DESC, error_rate_percentage DESC;
+
+\echo ''
+\echo '===== TEST COMPLETED ====='
 ```
 
 ## Usage
···
 cd metrics-stack
 ```
 
-2. Start the services:
+2. Make the test scripts executable:
+```bash
+chmod +x test-scripts/send-metrics.sh
+```
+
+3. Start the services:
 ```bash
 docker-compose up -d
 ```
 
-3. Check the logs to ensure everything is running:
+4. Check the logs to ensure everything is running:
 ```bash
 docker-compose logs -f
 ```
 
-### Sending Metrics
+5. Wait for services to be fully ready (about 30 seconds)
 
-Send test metrics to Telegraf using StatsD protocol:
+### Running the Test Suite
 
+1. Send test metrics:
 ```bash
-# Send a counter metric
-echo "quickdid.http.request.count:1|c|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
-
-# Send a gauge metric
-echo "quickdid.resolver.rate_limit.available_permits:10|g" | nc -u -w0 localhost 8125
+./test-scripts/send-metrics.sh
+```
 
-# Send a timing metric
-echo "quickdid.http.request.duration_ms:42|ms|#method:POST,path:/api,status:201" | nc -u -w0 localhost 8125
-
-# Send a histogram
-echo "quickdid.resolver.resolution_time:100|h|#resolver:redis" | nc -u -w0 localhost 8125
+2. Verify all queries work:
+```bash
+docker exec -i timescaledb psql -U postgres -d metrics < test-scripts/verify-queries.sql
 ```
 
-### Querying Metrics
+### Manual Querying
 
-Connect to TimescaleDB to query your metrics:
+Connect to TimescaleDB to run queries manually:
 
 ```bash
 # Connect to the database
 docker exec -it timescaledb psql -U postgres -d metrics
 
-# Query recent metrics
+# List all metric tables
+\dt "quickdid*"
+
+# Describe a specific table structure
+\d "quickdid.http.request.duration_ms"
+
+# Query with JSONB tag filtering
 SELECT
     time,
-    tags,
-    value
-FROM "quickdid.http.request.count"
-WHERE time > NOW() - INTERVAL '1 hour'
-ORDER BY time DESC
-LIMIT 10;
-
-# Query specific metric with tags
-SELECT
-    time_bucket('1 minute', time) AS minute,
     tags->>'method' as method,
-    tags->>'path' as path,
-    tags->>'status' as status,
-    AVG(mean::numeric) as avg_duration
+    mean as avg_ms,
+    "99_percentile" as p99_ms
 FROM "quickdid.http.request.duration_ms"
-WHERE
-    time > NOW() - INTERVAL '1 hour'
-GROUP BY minute, method, path, status
-ORDER BY minute DESC;
+WHERE tags @> '{"method": "GET"}'::jsonb
+  AND time > NOW() - INTERVAL '1 hour'
+ORDER BY time DESC
+LIMIT 10;
 ```
 
 ## Advanced Configuration
 
-### Continuous Aggregates
+### Continuous Aggregates for Performance
 
-Create continuous aggregates for better query performance:
+Create continuous aggregates for frequently queried data:
 
 ```sql
--- Create a continuous aggregate for HTTP metrics
+-- Create hourly aggregates for HTTP metrics
 CREATE MATERIALIZED VIEW http_metrics_hourly
 WITH (timescaledb.continuous) AS
 SELECT
     time_bucket('1 hour', time) AS hour,
-    name,
     tags->>'method' as method,
     tags->>'path' as path,
     tags->>'status' as status,
     COUNT(*) as request_count,
-    AVG((fields->>'value')::numeric) as avg_value,
-    MAX((fields->>'value')::numeric) as max_value,
-    MIN((fields->>'value')::numeric) as min_value
-FROM telegraf.metrics
-WHERE name LIKE 'quickdid_http_%'
-GROUP BY hour, name, method, path, status
+    AVG(mean) as avg_duration_ms,
+    MAX("99_percentile") as p99_duration_ms,
+    MIN(mean) as min_duration_ms
+FROM "quickdid.http.request.duration_ms"
+WHERE tags IS NOT NULL
+GROUP BY hour, method, path, status
 WITH NO DATA;
 
 -- Add refresh policy
···
     start_offset => INTERVAL '3 hours',
     end_offset => INTERVAL '1 hour',
     schedule_interval => INTERVAL '1 hour');
+
+-- Manually refresh to populate initial data
+CALL refresh_continuous_aggregate('http_metrics_hourly', NULL, NULL);
+
+-- Query the aggregate
+SELECT * FROM http_metrics_hourly
+ORDER BY hour DESC, request_count DESC
+LIMIT 20;
 ```
 
-## Monitoring and Maintenance
+### Data Retention Policies
 
-### Health Checks
+Set up automatic data retention:
 
-Check the health of your TimescaleDB hypertables:
+```sql
+-- Add retention policy to drop data older than 30 days
+SELECT add_retention_policy('"quickdid.http.request.count"', INTERVAL '30 days');
+SELECT add_retention_policy('"quickdid.http.request.duration_ms"', INTERVAL '30 days');
+
+-- View retention policies
+SELECT js.* FROM timescaledb_information.job_stats js
+JOIN timescaledb_information.jobs j ON js.job_id = j.job_id
+WHERE j.proc_name LIKE '%retention%';
+```
+
+### Compression for Storage Optimization
+
+Enable compression for older data:
 
 ```sql
--- View hypertable information
-SELECT * FROM timescaledb_information.hypertables;
+-- Enable compression on a hypertable
+ALTER TABLE "quickdid.http.request.duration_ms" SET (
+    timescaledb.compress,
+    timescaledb.compress_segmentby = 'tags'
+);
 
--- Check chunk information
+-- Add compression policy (compress chunks older than 7 days)
+SELECT add_compression_policy('"quickdid.http.request.duration_ms"', INTERVAL '7 days');
+
+-- Manually compress old chunks
+SELECT compress_chunk(format('%I.%I', c.chunk_schema, c.chunk_name)::regclass)
+FROM timescaledb_information.chunks c
+WHERE c.hypertable_name = 'quickdid.http.request.duration_ms'
+  AND c.range_end < NOW() - INTERVAL '7 days'
+  AND NOT c.is_compressed;
+
+-- Check compression status (hypertable_compression_stats() is a set-returning function in TimescaleDB 2.x)
+SELECT
+    pg_size_pretty(before_compression_total_bytes) AS before_compression,
+    pg_size_pretty(after_compression_total_bytes) AS after_compression
+FROM hypertable_compression_stats('"quickdid.http.request.duration_ms"');
+```
+
+## Monitoring and Maintenance
+
+### Health Checks
+
+```sql
+-- Check chunk distribution
 SELECT
     hypertable_name,
     chunk_name,
     range_start,
     range_end,
-    is_compressed
+    is_compressed,
+    pg_size_pretty(pg_total_relation_size(format('%I.%I', chunk_schema, chunk_name)::regclass)) AS size
 FROM timescaledb_information.chunks
-WHERE hypertable_name = 'quickdid.http.request.duration_ms'
-ORDER BY range_start DESC
-LIMIT 10;
+WHERE hypertable_name LIKE 'quickdid%'
+ORDER BY hypertable_name, range_start DESC
+LIMIT 20;
 
--- Check compression status
+-- Check background jobs (job metadata lives in jobs, run history in job_stats)
+SELECT
+    j.job_id,
+    j.application_name,
+    j.schedule_interval,
+    js.last_run_started_at,
+    js.last_successful_finish,
+    js.next_start
+FROM timescaledb_information.jobs j
+JOIN timescaledb_information.job_stats js ON js.job_id = j.job_id
+ORDER BY j.job_id;
+
+-- Check table sizes
 SELECT
     hypertable_name,
-    uncompressed_total_bytes,
-    compressed_total_bytes,
-    compression_ratio
-FROM timescaledb_information.hypertable_compression_stats;
+    pg_size_pretty(hypertable_size(format('%I.%I', hypertable_schema, hypertable_name)::regclass)) AS total_size
+FROM timescaledb_information.hypertables
+WHERE hypertable_name LIKE 'quickdid%';
 ```
 
 ### Troubleshooting
 
-1. **Telegraf can't connect to TimescaleDB:**
-   - Check that TimescaleDB is healthy: `docker-compose ps`
-   - Verify credentials in `.env` file
-   - Check network connectivity between containers
+1. **Tables not being created:**
+   - Check Telegraf logs: `docker-compose logs telegraf | grep -i error`
+   - Check that the Telegraf config parses and inputs run: `docker exec telegraf telegraf --test`
+   - Ensure metrics are being received: `docker-compose logs telegraf | grep statsd`
 
-2. **Metrics not appearing in database:**
-   - Check Telegraf logs: `docker-compose logs telegraf`
-   - Verify StatsD metrics are being received: `docker-compose logs telegraf | grep statsd`
-   - Check table creation: Connect to database and run `\dt telegraf.*`
+2. **Queries returning no data:**
+   - Verify tables exist: `\dt "quickdid*"` in psql
+   - Check table contents: `SELECT COUNT(*) FROM "quickdid.http.request.count";`
+   - Verify time ranges in WHERE clauses
 
 3. **Performance issues:**
-   - Check chunk sizes and adjust chunk_time_interval if needed
-   - Verify compression is working
-   - Consider creating appropriate indexes on frequently queried columns
+   - Check if hypertables are created: Query `timescaledb_information.hypertables`
+   - Verify compression is working if enabled
+   - Consider creating appropriate indexes on JSONB paths:
+   ```sql
+   CREATE INDEX idx_http_method ON "quickdid.http.request.duration_ms" ((tags->>'method'));
+   CREATE INDEX idx_http_path ON "quickdid.http.request.duration_ms" ((tags->>'path'));
+   ```
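+
+   If tag filters vary too much for targeted expression indexes, a single GIN index over the whole `tags` column is an alternative (a sketch; `jsonb_path_ops` supports the `@>` containment operator used in the Manual Querying section):
+   ```sql
+   -- Optional: one GIN index serving arbitrary @> containment filters on tags
+   CREATE INDEX idx_http_tags ON "quickdid.http.request.duration_ms"
+       USING GIN (tags jsonb_path_ops);
+   ```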
 
 ## Integration with QuickDID
 
···
 
 QuickDID will automatically send metrics to Telegraf, which will store them in TimescaleDB for analysis.
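+
+A quick sanity check once QuickDID is running (table name assumed from the examples in this guide):
+
+```sql
+-- Confirm QuickDID metrics are arriving
+SELECT COUNT(*) AS rows_last_5min
+FROM "quickdid.http.request.count"
+WHERE time > NOW() - INTERVAL '5 minutes';
+```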
 
-## Backup and Restore
-
-### Backup
+## Key Differences from Generic Metrics Table Approach
 
-Create a backup of your metrics data:
-
-```bash
-# Backup entire database
-docker exec timescaledb pg_dump -U postgres metrics_db > metrics_backup.sql
-
-# Backup specific time range
-docker exec timescaledb pg_dump -U postgres \
-  --table='telegraf.metrics' \
-  --data-only \
-  --where="time >= '2024-01-01' AND time < '2024-02-01'" \
-  metrics_db > metrics_january.sql
-```
-
-### Restore
+This configuration creates **individual tables per metric** instead of a single generic metrics table. Benefits include:
 
-Restore from backup:
+1. **Better performance**: Each metric has its own optimized schema
+2. **Clearer data model**: Tables directly represent metrics
+3. **Easier querying**: No need to filter by metric name (illustrated below)
+4. **Type safety**: Each metric's fields have appropriate types
+5. **Efficient compression**: Per-metric compression strategies
 
-```bash
-# Restore database
-docker exec -i timescaledb psql -U postgres metrics_db < metrics_backup.sql
-```
+Trade-offs:
+- More tables to manage (mitigated by TimescaleDB automation)
+- Need to know metric names upfront for queries
+- Schema changes require table alterations
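+
+To make the querying benefit concrete, here is the same lookup against a hypothetical single generic `metrics` table versus the per-metric tables this setup creates (the generic schema is illustrative only, not something this configuration builds):
+
+```sql
+-- Hypothetical generic table: every query must filter by metric name
+SELECT time, tags->>'method' AS method, value
+FROM metrics
+WHERE name = 'quickdid.http.request.count'
+  AND time > NOW() - INTERVAL '1 hour';
+
+-- Per-metric table (this setup): the metric name is the table itself
+SELECT time, tags->>'method' AS method, value
+FROM "quickdid.http.request.count"
+WHERE time > NOW() - INTERVAL '1 hour';
+```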
 
 ## Security Considerations
 
 1. **Use strong passwords:** Update the default passwords in `.env`
 2. **Enable SSL:** Configure `sslmode=require` in production
 3. **Network isolation:** Use Docker networks to isolate services
-4. **Access control:** Limit database user permissions to minimum required
+4. **Access control:** Create separate database users with minimal permissions:
+   ```sql
+   CREATE USER metrics_reader WITH PASSWORD 'readonly_password';
+   GRANT CONNECT ON DATABASE metrics TO metrics_reader;
+   GRANT USAGE ON SCHEMA public TO metrics_reader;
+   GRANT SELECT ON ALL TABLES IN SCHEMA public TO metrics_reader;
+   ```
 5. **Regular updates:** Keep Docker images updated for security patches
 
 ## Performance Tuning
 
-### TimescaleDB Configuration
+### PostgreSQL/TimescaleDB Settings
 
-Add these settings to optimize performance:
+The docker-compose.yml includes optimized settings. Adjust based on your hardware (a worked example follows the list):
 
-```yaml
-# In docker-compose.yml under timescaledb service
-command:
-  - postgres
-  - -c
-  - shared_buffers=1GB
-  - -c
-  - effective_cache_size=3GB
-  - -c
-  - maintenance_work_mem=512MB
-  - -c
-  - work_mem=32MB
-  - -c
-  - timescaledb.max_background_workers=8
-  - -c
-  - max_parallel_workers_per_gather=2
-  - -c
-  - max_parallel_workers=8
-```
+- `shared_buffers`: 25% of system RAM
+- `effective_cache_size`: 75% of system RAM
+- `maintenance_work_mem`: 5% of system RAM
+- `work_mem`: RAM / max_connections / 2
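+
+As a worked example (a sketch; assumes a dedicated 16 GB host and the default `max_connections = 100`), these rules can also be applied at runtime with `ALTER SYSTEM`:
+
+```sql
+ALTER SYSTEM SET shared_buffers = '4GB';          -- 25% of 16 GB
+ALTER SYSTEM SET effective_cache_size = '12GB';   -- 75% of 16 GB
+ALTER SYSTEM SET maintenance_work_mem = '800MB';  -- ~5% of 16 GB
+ALTER SYSTEM SET work_mem = '80MB';               -- 16 GB / 100 connections / 2
+SELECT pg_reload_conf();  -- note: shared_buffers only takes effect after a restart
+```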
 
 ### Telegraf Buffer Settings
 
-Adjust buffer settings based on your metric volume:
+For high-volume metrics, adjust in telegraf.conf:
 
 ```toml
 [agent]
-  metric_batch_size = 5000 # Increase for high volume
+  metric_batch_size = 5000      # Increase for high volume
   metric_buffer_limit = 100000  # Increase buffer size
-  flush_interval = "5s" # Decrease for more frequent writes
+  flush_interval = "5s"         # Decrease for more frequent writes
 ```
 
 ## Conclusion
 
-This setup provides a robust metrics collection and storage solution using:
-- **Telegraf** for flexible metric collection via StatsD
-- **TimescaleDB** for efficient time-series data storage
-- **Docker Compose** for easy deployment and management
+This setup provides a robust metrics collection and storage solution with:
+- **Individual metric tables** for optimal performance and clarity
+- **JSONB tag storage** for flexible querying
+- **TimescaleDB hypertables** for efficient time-series storage
+- **Comprehensive test suite** to verify functionality
+- **Production-ready configuration** with compression and retention policies
 
-The system is production-ready with support for compression, retention policies, and continuous aggregates for optimal performance at scale.
+The system correctly handles StatsD metrics from QuickDID and provides powerful querying capabilities through PostgreSQL's JSONB support and TimescaleDB's time-series functions.
test-scripts/docker-test.sh (+364 -0)
···
+#!/bin/bash
+
+# Comprehensive test script for Telegraf/TimescaleDB metrics setup
+# This script validates the entire metrics pipeline
+
+set -e
+
+echo "========================================="
+echo "Telegraf/TimescaleDB Metrics Test Suite"
+echo "========================================="
+echo ""
+
+# Check if Docker is running
+if ! docker info > /dev/null 2>&1; then
+    echo "❌ Docker is not running. Please start Docker first."
+    exit 1
+fi
+
+# Function to wait for a service to be healthy
+wait_for_service() {
+    local service=$1
+    local max_attempts=30
+    local attempt=1
+
+    echo -n "Waiting for $service to be healthy"
+    while [ $attempt -le $max_attempts ]; do
+        if docker-compose ps $service | grep -q "healthy"; then
+            echo " ✅"
+            return 0
+        fi
+        echo -n "."
+        sleep 2
+        attempt=$((attempt + 1))
+    done
+    echo " ❌"
+    echo "Service $service failed to become healthy after $max_attempts attempts"
+    return 1
+}
+
+# Function to run SQL query
+run_query() {
+    docker exec -i timescaledb psql -U postgres -d metrics -t -c "$1" 2>/dev/null
+}
+
+# Function to check table exists
+check_table() {
+    local table=$1
+    local result=$(run_query "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = '$table');")
+    if [[ "$result" =~ "t" ]]; then
+        echo "✅ Table '$table' exists"
+        return 0
+    else
+        echo "❌ Table '$table' does not exist"
+        return 1
+    fi
+}
+
+# Navigate to the metrics-stack directory (create if needed)
+if [ ! -d "metrics-stack" ]; then
+    echo "Creating metrics-stack directory..."
+    mkdir -p metrics-stack/telegraf
+    mkdir -p metrics-stack/test-scripts
+    mkdir -p metrics-stack/init-scripts
+fi
+
+cd metrics-stack
+
+# Create .env file if it doesn't exist
+if [ ! -f ".env" ]; then
+    echo "Creating .env file..."
+    cat > .env << 'EOF'
+# PostgreSQL/TimescaleDB Configuration
+POSTGRES_DB=metrics
+POSTGRES_USER=postgres
+POSTGRES_PASSWORD=secretpassword
+
+# Telegraf Database User
+TELEGRAF_DB_USER=postgres
+TELEGRAF_DB_PASSWORD=secretpassword
+
+# TimescaleDB Settings
+TIMESCALE_TELEMETRY=off
+EOF
+fi
+
+# Copy configuration files if they don't exist
+if [ ! -f "telegraf/telegraf.conf" ]; then
+    echo "Creating telegraf.conf..."
+    cat > telegraf/telegraf.conf << 'EOF'
+[agent]
+  interval = "10s"
+  round_interval = true
+  metric_batch_size = 1000
+  metric_buffer_limit = 10000
+  collection_jitter = "0s"
+  flush_interval = "10s"
+  flush_jitter = "0s"
+  precision = ""
+  debug = false
+  quiet = false
+  hostname = "telegraf-agent"
+  omit_hostname = false
+
+[[inputs.statsd]]
+  service_address = ":8125"
+  protocol = "udp"
+  delete_gauges = true
+  delete_counters = true
+  delete_sets = true
+  delete_timings = true
+  percentiles = [50, 90, 95, 99]
+  metric_separator = "."
+  allowed_pending_messages = 10000
+  datadog_extensions = true
+  datadog_distributions = true
+
+[[outputs.postgresql]]
+  connection = "host=timescaledb user=${TELEGRAF_DB_USER} password=${TELEGRAF_DB_PASSWORD} dbname=${POSTGRES_DB} sslmode=disable"
+  schema = "public"
+  create_templates = [
+    '''CREATE TABLE IF NOT EXISTS {{.table}} ({{.columns}})''',
+    '''SELECT create_hypertable({{.table|quoteLiteral}}, 'time', if_not_exists => TRUE)''',
+  ]
+  tags_as_jsonb = true
+  fields_as_jsonb = false
+EOF
+fi
+
+# Copy docker-compose.yml if it doesn't exist
+if [ ! -f "docker-compose.yml" ]; then
+    echo "Creating docker-compose.yml..."
+    cat > docker-compose.yml << 'EOF'
+version: '3.8'
+
+services:
+  timescaledb:
+    image: timescale/timescaledb:latest-pg17
+    container_name: timescaledb
+    restart: unless-stopped
+    environment:
+      POSTGRES_DB: ${POSTGRES_DB}
+      POSTGRES_USER: ${POSTGRES_USER}
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
+      TIMESCALE_TELEMETRY: ${TIMESCALE_TELEMETRY}
+    ports:
+      - "5442:5432"
+    volumes:
+      - timescale_data:/var/lib/postgresql/data
+      - ./init-scripts:/docker-entrypoint-initdb.d:ro
+    command:
+      - postgres
+      - -c
+      - shared_buffers=256MB
+      - -c
+      - effective_cache_size=1GB
+      - -c
+      - maintenance_work_mem=64MB
+      - -c
+      - work_mem=8MB
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    networks:
+      - metrics_network
+
+  telegraf:
+    image: telegraf:1.35
+    container_name: telegraf
+    restart: unless-stopped
+    environment:
+      TELEGRAF_DB_USER: ${TELEGRAF_DB_USER}
+      TELEGRAF_DB_PASSWORD: ${TELEGRAF_DB_PASSWORD}
+      POSTGRES_DB: ${POSTGRES_DB}
+    ports:
+      - "8125:8125/udp"
+    volumes:
+      - ./telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro
+    depends_on:
+      timescaledb:
+        condition: service_healthy
+    networks:
+      - metrics_network
+    command: ["telegraf", "--config", "/etc/telegraf/telegraf.conf"]
+
+networks:
+  metrics_network:
+    driver: bridge
+
+volumes:
+  timescale_data:
+EOF
+fi
+
+# Create init script
+if [ ! -f "init-scripts/01-init.sql" ]; then
+    echo "Creating init script..."
+    cat > init-scripts/01-init.sql << 'EOF'
+-- Enable TimescaleDB extension
+CREATE EXTENSION IF NOT EXISTS timescaledb;
+CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
+EOF
+fi
+
+echo ""
+echo "Step 1: Starting Docker services..."
+echo "========================================="
+docker-compose down -v 2>/dev/null || true
+docker-compose up -d
+
+echo ""
+echo "Step 2: Waiting for services to be healthy..."
+echo "========================================="
+wait_for_service timescaledb
+sleep 5  # Extra time for Telegraf to connect
+
+echo ""
+echo "Step 3: Sending test metrics..."
+echo "========================================="
+
+# Send various types of metrics
+echo "Sending counter metrics..."
+for i in {1..5}; do
+    echo "quickdid.http.request.count:1|c|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
+    echo "quickdid.http.request.count:1|c|#method:POST,path:/api,status:201" | nc -u -w0 localhost 8125
+done
+
+echo "Sending gauge metrics..."
+echo "quickdid.resolver.rate_limit.available_permits:10|g" | nc -u -w0 localhost 8125
+sleep 1
+echo "quickdid.resolver.rate_limit.available_permits:5|g" | nc -u -w0 localhost 8125
+
+echo "Sending timing metrics..."
+for i in {1..10}; do
+    duration=$((RANDOM % 100 + 10))
+    echo "quickdid.http.request.duration_ms:${duration}|ms|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
+done
+
+echo "Sending histogram metrics..."
+for i in {1..5}; do
+    resolution_time=$((RANDOM % 500 + 50))
+    echo "quickdid.resolver.resolution_time:${resolution_time}|h|#resolver:redis" | nc -u -w0 localhost 8125
+done
+
+echo "Waiting 15 seconds for Telegraf to flush metrics..."
+sleep 15
+
+echo ""
+echo "Step 4: Verifying table creation..."
+echo "========================================="
+
+# Check if tables were created ( || true keeps `set -e` from aborting on a missing table)
+check_table "quickdid.http.request.count" || true
+check_table "quickdid.http.request.duration_ms" || true
+check_table "quickdid.resolver.rate_limit.available_permits" || true
+check_table "quickdid.resolver.resolution_time" || true
+
+echo ""
+echo "Step 5: Verifying data insertion..."
+echo "========================================="
+
+# Check row counts
+for table in "quickdid.http.request.count" "quickdid.http.request.duration_ms" "quickdid.resolver.rate_limit.available_permits" "quickdid.resolver.resolution_time"; do
+    count=$(run_query "SELECT COUNT(*) FROM \"$table\";" | tr -d ' ')
+    if [ "$count" -gt 0 ]; then
+        echo "✅ Table '$table' has $count rows"
+    else
+        echo "❌ Table '$table' is empty"
+    fi
+done
+
+echo ""
+echo "Step 6: Testing JSONB tag queries..."
+echo "========================================="
+
+# Test JSONB tag filtering
+result=$(run_query "SELECT COUNT(*) FROM \"quickdid.http.request.count\" WHERE tags->>'method' = 'GET';" | tr -d ' ')
+if [ "$result" -gt 0 ]; then
+    echo "✅ JSONB tag filtering works (found $result GET requests)"
+else
+    echo "❌ JSONB tag filtering failed"
+fi
+
+echo ""
+echo "Step 7: Testing TimescaleDB functions..."
+echo "========================================="
+
+# Test time_bucket function
+result=$(run_query "SELECT COUNT(*) FROM (SELECT time_bucket('1 minute', time) FROM \"quickdid.http.request.count\" GROUP BY 1) t;" | tr -d ' ')
+if [ "$result" -gt 0 ]; then
+    echo "✅ time_bucket function works"
+else
+    echo "❌ time_bucket function failed"
+fi
+
+# Check if hypertables were created
+hypertable_count=$(run_query "SELECT COUNT(*) FROM timescaledb_information.hypertables WHERE hypertable_name LIKE 'quickdid%';" | tr -d ' ')
+if [ "$hypertable_count" -gt 0 ]; then
+    echo "✅ Found $hypertable_count hypertables"
+else
+    echo "❌ No hypertables found"
+fi
+
+echo ""
+echo "Step 8: Running comprehensive query tests..."
+echo "========================================="
+
+# Run the verify-queries.sql script if it exists
+if [ -f "../test-scripts/verify-queries.sql" ]; then
+    echo "Running verify-queries.sql..."
+    # Run inside the if-condition so a failure doesn't trip `set -e`
+    if docker exec -i timescaledb psql -U postgres -d metrics < ../test-scripts/verify-queries.sql > query_results.txt 2>&1; then
+        echo "✅ All queries executed successfully"
+        echo "   Results saved to query_results.txt"
+    else
+        echo "❌ Some queries failed. Check query_results.txt for details"
+    fi
+else
+    echo "⚠️ verify-queries.sql not found, skipping comprehensive query tests"
+fi
+
+echo ""
+echo "========================================="
+echo "Test Summary"
+echo "========================================="
+
+# Generate summary
+failures=0
+successes=0
+
+# Count successes and failures from the output
+if check_table "quickdid.http.request.count" > /dev/null 2>&1; then
+    successes=$((successes + 1))
+else
+    failures=$((failures + 1))
+fi
+
+if [ "$hypertable_count" -gt 0 ]; then
+    successes=$((successes + 1))
+else
+    failures=$((failures + 1))
+fi
+
+echo ""
+if [ $failures -eq 0 ]; then
+    echo "✅ All tests passed successfully!"
+    echo ""
+    echo "You can now:"
+    echo "1. Connect to the database: docker exec -it timescaledb psql -U postgres -d metrics"
+    echo "2. View logs: docker-compose logs -f"
+    echo "3. Send more metrics: echo 'metric.name:value|type|#tag:value' | nc -u -w0 localhost 8125"
+    echo "4. Stop services: docker-compose down"
+else
+    echo "⚠️ Some tests failed. Please check the output above for details."
+    echo ""
+    echo "Troubleshooting tips:"
+    echo "1. Check Telegraf logs: docker-compose logs telegraf"
+    echo "2. Check TimescaleDB logs: docker-compose logs timescaledb"
+    echo "3. Verify connectivity: docker exec telegraf telegraf --test"
+fi
+
+echo ""
+echo "Test complete!"
test-scripts/send-metrics.sh (+44 -0)
···
+#!/bin/bash
+
+# Send test metrics to StatsD/Telegraf
+
+echo "Sending test metrics to StatsD on localhost:8125..."
+
+# Counter metrics
+for i in {1..10}; do
+    echo "quickdid.http.request.count:1|c|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
+    echo "quickdid.http.request.count:1|c|#method:POST,path:/api,status:201" | nc -u -w0 localhost 8125
+    echo "quickdid.http.request.count:1|c|#method:GET,path:/resolve,status:404" | nc -u -w0 localhost 8125
+done
+
+# Gauge metrics
+echo "quickdid.resolver.rate_limit.available_permits:10|g" | nc -u -w0 localhost 8125
+echo "quickdid.resolver.rate_limit.available_permits:8|g" | nc -u -w0 localhost 8125
+echo "quickdid.resolver.rate_limit.available_permits:5|g" | nc -u -w0 localhost 8125
+
+# Timing metrics (in milliseconds)
+for i in {1..20}; do
+    duration=$((RANDOM % 100 + 10))
+    echo "quickdid.http.request.duration_ms:${duration}|ms|#method:GET,path:/resolve,status:200" | nc -u -w0 localhost 8125
+done
+
+for i in {1..10}; do
+    duration=$((RANDOM % 200 + 50))
+    echo "quickdid.http.request.duration_ms:${duration}|ms|#method:POST,path:/api,status:201" | nc -u -w0 localhost 8125
+done
+
+# Histogram metrics
+for i in {1..15}; do
+    resolution_time=$((RANDOM % 500 + 50))
+    echo "quickdid.resolver.resolution_time:${resolution_time}|h|#resolver:redis" | nc -u -w0 localhost 8125
+    echo "quickdid.resolver.resolution_time:$((resolution_time * 2))|h|#resolver:base" | nc -u -w0 localhost 8125
+done
+
+# Cache metrics
+echo "quickdid.cache.hit.count:45|c|#cache_type:redis" | nc -u -w0 localhost 8125
+echo "quickdid.cache.miss.count:5|c|#cache_type:redis" | nc -u -w0 localhost 8125
+echo "quickdid.cache.size:1024|g|#cache_type:memory" | nc -u -w0 localhost 8125
+
+echo "Metrics sent! Wait 15 seconds for Telegraf to flush..."
+sleep 15
+echo "Done!"
test-scripts/verify-queries.sql (+145 -0)
···
+-- Test script to verify all metrics queries work correctly
+-- Run this after sending test metrics with send-metrics.sh
+
+\echo '===== CHECKING AVAILABLE TABLES ====='
+SELECT table_name
+FROM information_schema.tables
+WHERE table_schema = 'public'
+  AND table_name LIKE 'quickdid%'
+ORDER BY table_name;
+
+\echo ''
+\echo '===== CHECKING TABLE STRUCTURES ====='
+\echo 'Structure of quickdid.http.request.count table:'
+\d "quickdid.http.request.count"
+
+\echo ''
+\echo 'Structure of quickdid.http.request.duration_ms table:'
+\d "quickdid.http.request.duration_ms"
+
+\echo ''
+\echo '===== QUERY 1: Recent HTTP Request Counts ====='
+SELECT
+    time,
+    tags,
+    tags->>'method' as method,
+    tags->>'path' as path,
+    tags->>'status' as status,
+    value
+FROM "quickdid.http.request.count"
+WHERE time > NOW() - INTERVAL '1 hour'
+ORDER BY time DESC
+LIMIT 10;
+
+\echo ''
+\echo '===== QUERY 2: HTTP Request Duration Statistics by Endpoint ====='
+SELECT
+    time_bucket('1 minute', time) AS minute,
+    tags->>'method' as method,
+    tags->>'path' as path,
+    tags->>'status' as status,
+    COUNT(*) as request_count,
+    AVG(mean) as avg_duration_ms,
+    MAX("99_percentile") as p99_duration_ms,
+    MIN(mean) as min_duration_ms
+FROM "quickdid.http.request.duration_ms"
+WHERE time > NOW() - INTERVAL '1 hour'
+  AND tags IS NOT NULL
+GROUP BY minute, tags->>'method', tags->>'path', tags->>'status'
+ORDER BY minute DESC
+LIMIT 10;
+
+\echo ''
+\echo '===== QUERY 3: Rate Limiter Status Over Time ====='
+SELECT
+    time,
+    value as available_permits
+FROM "quickdid.resolver.rate_limit.available_permits"
+WHERE time > NOW() - INTERVAL '1 hour'
+ORDER BY time DESC
+LIMIT 10;
+
+\echo ''
+\echo '===== QUERY 4: Resolver Performance Comparison ====='
+SELECT
+    tags->>'resolver' as resolver_type,
+    COUNT(*) as sample_count,
+    AVG(mean) as avg_resolution_time_ms,
+    MAX("99_percentile") as p99_resolution_time_ms,
+    MIN(mean) as min_resolution_time_ms
+FROM "quickdid.resolver.resolution_time"
+WHERE time > NOW() - INTERVAL '1 hour'
+  AND tags->>'resolver' IS NOT NULL
+GROUP BY tags->>'resolver'
+ORDER BY avg_resolution_time_ms;
+
+\echo ''
+\echo '===== QUERY 5: Cache Hit Rate Analysis ====='
+WITH cache_stats AS (
+    SELECT
+        'hits' as metric_type,
+        SUM(value) as total_count
+    FROM "quickdid.cache.hit.count"
+    WHERE time > NOW() - INTERVAL '1 hour'
+    UNION ALL
+    SELECT
+        'misses' as metric_type,
+        SUM(value) as total_count
+    FROM "quickdid.cache.miss.count"
+    WHERE time > NOW() - INTERVAL '1 hour'
+)
+SELECT
+    SUM(CASE WHEN metric_type = 'hits' THEN total_count ELSE 0 END) as total_hits,
+    SUM(CASE WHEN metric_type = 'misses' THEN total_count ELSE 0 END) as total_misses,
+    CASE
+        WHEN SUM(total_count) > 0 THEN
+            ROUND(100.0 * SUM(CASE WHEN metric_type = 'hits' THEN total_count ELSE 0 END) / SUM(total_count), 2)
+        ELSE 0
+    END as hit_rate_percentage
+FROM cache_stats;
+
+\echo ''
+\echo '===== QUERY 6: Hypertable Information ====='
+SELECT
+    hypertable_schema,
+    hypertable_name,
+    owner,
+    num_dimensions,
+    num_chunks,
+    compression_enabled
+FROM timescaledb_information.hypertables
+WHERE hypertable_name LIKE 'quickdid%'
+ORDER BY hypertable_name;
+
+\echo ''
+\echo '===== QUERY 7: HTTP Error Rate by Endpoint ====='
+WITH status_counts AS (
+    SELECT
+        time_bucket('5 minutes', time) as period,
+        tags->>'path' as path,
+        CASE
+            WHEN (tags->>'status')::int >= 400 THEN 'error'
+            ELSE 'success'
+        END as status_category,
+        SUM(value) as request_count
+    FROM "quickdid.http.request.count"
+    WHERE time > NOW() - INTERVAL '1 hour'
+    GROUP BY period, path, status_category
+)
+SELECT
+    period,
+    path,
+    SUM(CASE WHEN status_category = 'error' THEN request_count ELSE 0 END) as error_count,
+    SUM(CASE WHEN status_category = 'success' THEN request_count ELSE 0 END) as success_count,
+    CASE
+        WHEN SUM(request_count) > 0 THEN
+            ROUND(100.0 * SUM(CASE WHEN status_category = 'error' THEN request_count ELSE 0 END) / SUM(request_count), 2)
+        ELSE 0
+    END as error_rate_percentage
+FROM status_counts
+GROUP BY period, path
+HAVING SUM(request_count) > 0
+ORDER BY period DESC, error_rate_percentage DESC;
+
+\echo ''
+\echo '===== TEST COMPLETED ====='