+25 -3 Cargo.lock

```diff
···
  "links",
  "mediatype",
  "metrics",
- "metrics-exporter-prometheus",
+ "metrics-exporter-prometheus 0.16.2",
  "metrics-process",
  "num-format",
  "ratelimit",
···
 [[package]]
 name = "metrics"
-version = "0.24.1"
+version = "0.24.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a7deb012b3b2767169ff203fadb4c6b0b82b947512e5eb9e0b78c2e186ad9e3"
+checksum = "25dea7ac8057892855ec285c440160265225438c3c45072613c25a4b26e98ef5"
 dependencies = [
  "ahash",
  "portable-atomic",
···
  "metrics-util",
  "quanta",
  "thiserror 1.0.69",
+ "tokio",
+ "tracing",
+]
+
+[[package]]
+name = "metrics-exporter-prometheus"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df88858cd28baaaf2cfc894e37789ed4184be0e1351157aec7bf3c2266c793fd"
+dependencies = [
+ "base64 0.22.1",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "indexmap 2.9.0",
+ "ipnet",
+ "metrics",
+ "metrics-util",
+ "quanta",
+ "thiserror 2.0.12",
  "tokio",
  "tracing",
 ]
···
  "jetstream",
  "log",
  "lsm-tree",
+ "metrics",
+ "metrics-exporter-prometheus 0.17.0",
  "schemars",
  "semver",
  "serde",
```
+1 -1 cozy-setup (move to another repo).md
+11 jetstream/src/events.rs

```diff
···
         let t: SystemTime = self.into();
         t.elapsed()
     }
+    /// Compute the age of the cursor vs the local clock
+    ///
+    /// Converts the resulting duration into an f64, which can be negative!
+    ///
+    /// Warning: this exploits the internal implementation detail of jetstream cursors
+    pub fn elapsed_micros_f64(&self) -> f64 {
+        match self.elapsed() {
+            Ok(d) => d.as_micros() as f64,
+            Err(e) => -(e.duration().as_micros() as f64),
+        }
+    }
     /// Get the immediate next cursor value
     ///
     /// This is possible for the implementation of jetstream cursors
```
+196 ufos ops (move to micro-ops).md

ufos ops

btrfs snapshots: snapper

```bash
sudo apt install snapper
sudo snapper -c ufos-db create-config /mnt/ufos-db

# edit /etc/snapper/configs/ufos-db
# change
TIMELINE_MIN_AGE="1800"
TIMELINE_LIMIT_HOURLY="10"
TIMELINE_LIMIT_DAILY="10"
TIMELINE_LIMIT_WEEKLY="0"
TIMELINE_LIMIT_MONTHLY="10"
TIMELINE_LIMIT_YEARLY="10"
# to
TIMELINE_MIN_AGE="1800"
TIMELINE_LIMIT_HOURLY="22"
TIMELINE_LIMIT_DAILY="4"
TIMELINE_LIMIT_WEEKLY="0"
TIMELINE_LIMIT_MONTHLY="0"
TIMELINE_LIMIT_YEARLY="0"
```

this should be enough?

list snapshots:

```bash
sudo snapper -c ufos-db list
```
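
a one-off snapshot can also be taken by hand, e.g. before risky changes (snapper's usual `create` subcommand; these land as type `single`, same as timeline snapshots):

```bash
# manual snapshot with a description so it's easy to spot in `list`
sudo snapper -c ufos-db create -d "pre-upgrade baseline"
```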

systemd

create file: `/etc/systemd/system/ufos.service`

```ini
[Unit]
Description=UFOs-API
After=network.target

[Service]
User=pi
WorkingDirectory=/home/pi/
ExecStart=/home/pi/ufos --jetstream us-west-2 --data /mnt/ufos-db/
Environment="RUST_LOG=info"
LimitNOFILE=16384
Restart=always

[Install]
WantedBy=multi-user.target
```

then

```bash
sudo systemctl daemon-reload
sudo systemctl enable ufos
sudo systemctl start ufos
```

monitor with

```bash
journalctl -u ufos -f
```

make sure a backup dir exists

```bash
mkdir /home/pi/backup
```

mount the NAS

```bash
sudo mount.cifs "//truenas.local/folks data" /home/pi/backup -o user=phil,uid=pi
```
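
to survive reboots this could go in `/etc/fstab` instead (untested sketch: `\040` escapes the space in the share name, and a root-only credentials file with `username=`/`password=` lines stands in for the interactive prompt; `/root/.smb-truenas` is a made-up path):

```bash
# append a cifs entry to fstab; _netdev delays mounting until the network is up
echo '//truenas.local/folks\040data /home/pi/backup cifs credentials=/root/.smb-truenas,uid=pi,_netdev 0 0' | sudo tee -a /etc/fstab
sudo mount -a
```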

manual rsync

```bash
sudo rsync -ahP --delete /mnt/ufos-db/.snapshots/1/snapshot/ backup/ufos/
```

backup script sketch

```bash
NUM=$(sudo snapper --csvout -c ufos-db list --type single --columns number | tail -n1)
sudo rsync -ahP --delete "/mnt/ufos-db/.snapshots/${NUM}/snapshot/" backup/ufos/
```

just crontab it?

`sudo crontab -e`
```bash
# hours 1,7,13,19 (vixie cron wants a range before the step, so 1-23/6 not 1/6);
# absolute backup path since root's cron won't run from /home/pi
0 1-23/6 * * * rsync -ahP --delete "/mnt/ufos-db/.snapshots/$(snapper --csvout -c ufos-db list --type single --columns number | tail -n1)/snapshot/" /home/pi/backup/ufos/
```

^^ try once initial backup is done

note: `--columns subvolume,number` would print both the snapshot's subvolume path and its number, which could replace the hardcoded `.snapshots/N/snapshot` path (sketch below)
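
a version of the script using both columns (untested; assumes the `subvolume` column prints the snapshot's path):

```bash
# newest single-type snapshot as a csv row like "/mnt/ufos-db/.snapshots/42/snapshot,42"
ROW=$(sudo snapper --csvout -c ufos-db list --type single --columns subvolume,number | tail -n1)
SUBVOL=${ROW%,*}   # path part, before the comma
NUM=${ROW##*,}     # snapshot number, after the comma
sudo rsync -ahP --delete "${SUBVOL}/" /home/pi/backup/ufos/
```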

gateway: follow constellation for nginx->prom thing

config at `/etc/prometheus-nginxlog-exporter.hcl`

before:

```hcl
listen {
  port = 4044
}

namespace "nginx" {
  source = {
    files = [
      "/var/log/nginx/constellation-access.log"
    ]
  }

  format = "$remote_addr - $remote_user [$time_local] \"$request\" $status $upstream_cache_status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$http_x_forwarded_for\""

  labels {
    app = "constellation"
  }

  relabel "cache_status" {
    from = "upstream_cache_status"
  }
}
```

after:

```hcl
listen {
  port = 4044
}

namespace "constellation" {
  source = {
    files = [
      "/var/log/nginx/constellation-access.log"
    ]
  }

  format = "$remote_addr - $remote_user [$time_local] \"$request\" $status $upstream_cache_status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$http_x_forwarded_for\""

  labels {
    app = "constellation"
  }

  relabel "cache_status" {
    from = "upstream_cache_status"
  }

  namespace_label = "vhost"
  metrics_override = { prefix = "nginx" }
}

namespace "ufos" {
  source = {
    files = [
      "/var/log/nginx/ufos-access.log"
    ]
  }

  format = "$remote_addr - $remote_user [$time_local] \"$request\" $status $upstream_cache_status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$http_x_forwarded_for\""

  labels {
    app = "ufos"
  }

  relabel "cache_status" {
    from = "upstream_cache_status"
  }

  namespace_label = "vhost"
  metrics_override = { prefix = "nginx" }
}
```

restart so the config change takes:

```bash
sudo systemctl restart prometheus-nginxlog-exporter.service
```
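
sanity-check the exporter afterwards (assuming its default `/metrics` path on the port from the config):

```bash
# both vhosts should show up with the nginx_ prefix and a vhost label
curl -s http://localhost:4044/metrics | grep -E '^nginx_' | head
```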
+2 ufos/Cargo.toml

```diff
···
 jetstream = { path = "../jetstream" }
 log = "0.4.26"
 lsm-tree = "2.6.6"
+metrics = "0.24.2"
+metrics-exporter-prometheus = { version = "0.17.0", default-features = false, features = ["http-listener"] }
 schemars = { version = "0.8.22", features = ["raw_value", "chrono"] }
 semver = "1.0.26"
 serde = "1.0.219"
```
+37 -4 ufos/src/consumer.rs

```diff
···
     DefaultJetstreamEndpoints, JetstreamCompression, JetstreamConfig, JetstreamConnector,
     JetstreamReceiver,
 };
+use metrics::{
+    counter, describe_counter, describe_gauge, describe_histogram, gauge, histogram, Unit,
+};
 use std::mem;
 use std::time::Duration;
 use tokio::sync::mpsc::{channel, Receiver, Sender};
···
         batch_sender: Sender<LimitedBatch>,
         sketch_secret: SketchSecretPrefix,
     ) -> Self {
+        describe_counter!(
+            "batcher_batches_sent",
+            Unit::Count,
+            "how many batches of events were sent from Batcher to storage"
+        );
+        describe_gauge!(
+            "batcher_batch_age",
+            Unit::Microseconds,
+            "how old the last-sent batch was"
+        );
+        describe_gauge!(
+            "batcher_send_queue_capacity",
+            Unit::Count,
+            "how many spaces are available for batches in the send queue"
+        );
+        describe_histogram!(
+            "batcher_total_collections",
+            Unit::Count,
+            "how many collections are in this batch"
+        );
         let mut rate_limit = tokio::time::interval(std::time::Duration::from_millis(3));
         rate_limit.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
         Self {
···
     // holds up all consumer progress until it can send to the channel
     // use this when the current batch is too full to add more to it
     async fn send_current_batch_now(&mut self, small: bool, referrer: &str) -> anyhow::Result<()> {
+        let size_label = if small { "small" } else { "full" };
+        let queue_cap = self.batch_sender.capacity();
+
+        if let Some(cursor) = self.current_batch.initial_cursor {
+            gauge!("batcher_batch_age", "size" => size_label).set(cursor.elapsed_micros_f64());
+        }
+        histogram!("batcher_total_collections", "size" => size_label)
+            .record(self.current_batch.batch.total_collections() as f64);
+        gauge!("batcher_send_queue_capacity").set(queue_cap as f64);
+
         let beginning = match self.current_batch.initial_cursor.map(|c| c.elapsed()) {
             None => "unknown".to_string(),
             Some(Ok(t)) => format!("{:?}", t),
             Some(Err(e)) => format!("+{:?}", e.duration()),
         };
-        log::info!(
-            "sending batch now from {beginning}, {}, queue capacity: {}, referrer: {referrer}",
-            if small { "small" } else { "full" },
-            self.batch_sender.capacity(),
+        log::trace!(
+            "sending batch now from {beginning}, {size_label}, queue capacity: {queue_cap}, referrer: {referrer}",
         );
         let current = mem::take(&mut self.current_batch);
         self.rate_limit.tick().await;
         self.batch_sender
             .send_timeout(current.batch, Duration::from_secs_f64(SEND_TIMEOUT_S))
             .await?;
+        counter!("batcher_batches_sent", "size" => size_label, "referrer" => referrer.to_string())
+            .increment(1);
         Ok(())
     }
 }
```
+43 -1 ufos/src/main.rs

```diff
···
 use clap::Parser;
 use jetstream::events::Cursor;
+use metrics::{describe_gauge, gauge, Unit};
+use metrics_exporter_prometheus::PrometheusBuilder;
 use std::path::PathBuf;
 use std::time::{Duration, SystemTime};
 use ufos::consumer;
···

     let stating = do_update_stuff(read_store);

+    install_metrics_server()?;
+
     tokio::select! {
         z = serving => log::warn!("serve task ended: {z:?}"),
         z = rolling => log::warn!("rollup task ended: {z:?}"),
···
     Ok(())
 }

+fn install_metrics_server() -> anyhow::Result<()> {
+    log::info!("installing metrics server...");
+    let host = [0, 0, 0, 0];
+    let port = 8765;
+    PrometheusBuilder::new()
+        .set_quantiles(&[0.5, 0.9, 0.99, 1.0])?
+        .set_bucket_duration(Duration::from_secs(60))?
+        .set_bucket_count(std::num::NonZero::new(10).unwrap()) // count * duration = 10 mins. stuff doesn't happen that fast here.
+        .set_enable_unit_suffix(false) // this seemed buggy for constellation (sometimes wouldn't engage)
+        .with_http_listener((host, port))
+        .install()?;
+    log::info!(
+        "metrics server installed! listening on http://{}.{}.{}.{}:{port}",
+        host[0],
+        host[1],
+        host[2],
+        host[3]
+    );
+    Ok(())
+}
+
 async fn do_update_stuff(read_store: impl StoreReader) {
+    describe_gauge!(
+        "persisted_cursor_age",
+        Unit::Microseconds,
+        "microseconds between our clock and the latest persisted event's cursor"
+    );
+    describe_gauge!(
+        "rollup_cursor_age",
+        Unit::Microseconds,
+        "microseconds between our clock and the latest rollup cursor"
+    );
     let started_at = std::time::SystemTime::now();
     let mut first_cursor = None;
     let mut first_rollup = None;
···
     started_at: SystemTime,
     now: SystemTime,
 ) {
+    if let Some(cursor) = latest_cursor {
+        gauge!("persisted_cursor_age").set(cursor.elapsed_micros_f64());
+    }
+    if let Some(cursor) = rollup_cursor {
+        gauge!("rollup_cursor_age").set(cursor.elapsed_micros_f64());
+    }
+
     let nice_dt_two_maybes = |earlier: Option<Cursor>, later: Option<Cursor>| match (earlier, later)
     {
         (Some(earlier), Some(later)) => match later.duration_since(&earlier) {
···
     let rollup_rate = rate(rollup_cursor, last_rollup, dt_real);
     let rollup_avg = rate(rollup_cursor, first_rollup, dt_real_total);

-    log::info!(
+    log::trace!(
         "cursor: {} behind (→{}, {cursor_rate}x, {cursor_avg}x avg). rollup: {} behind (→{}, {rollup_rate}x, {rollup_avg}x avg).",
         latest_cursor.map(|c| c.elapsed().map(nice_duration).unwrap_or("++".to_string())).unwrap_or("?".to_string()),
         nice_dt_two_maybes(last_cursor, latest_cursor),
```
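
once ufos is running with this, the new series can be spot-checked from the box (8765 is the port hardcoded in `install_metrics_server` above; `/metrics` path assumed):

```bash
# the cursor-age gauges and batcher counters should all be present
curl -s http://localhost:8765/metrics | grep -E 'cursor_age|batcher_' | head
```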
+41 -21 ufos/src/storage_fjall.rs

```diff
···
     Batch as FjallBatch, Config, Keyspace, PartitionCreateOptions, PartitionHandle, Snapshot,
 };
 use jetstream::events::Cursor;
+use metrics::{counter, describe_counter, describe_histogram, histogram, Unit};
 use std::collections::{HashMap, HashSet};
 use std::iter::Peekable;
 use std::ops::Bound;
···
 impl StoreWriter<FjallBackground> for FjallWriter {
     fn background_tasks(&mut self, reroll: bool) -> StorageResult<FjallBackground> {
         if self.bg_taken.swap(true, Ordering::SeqCst) {
-            Err(StorageError::BackgroundAlreadyStarted)
-        } else {
-            if reroll {
-                log::info!("reroll: resetting rollup cursor...");
-                insert_static_neu::<NewRollupCursorKey>(&self.global, Cursor::from_start())?;
-                log::info!("reroll: clearing trim cursors...");
-                let mut batch = self.keyspace.batch();
-                for kv in self
-                    .global
-                    .prefix(TrimCollectionCursorKey::from_prefix_to_db_bytes(
-                        &Default::default(),
-                    )?)
-                {
-                    let (k, _) = kv?;
-                    batch.remove(&self.global, k);
-                }
-                let n = batch.len();
-                batch.commit()?;
-                log::info!("reroll: cleared {n} trim cursors.");
+            return Err(StorageError::BackgroundAlreadyStarted);
+        }
+        describe_histogram!(
+            "storage_trim_dirty_nsids",
+            Unit::Count,
+            "number of NSIDs trimmed"
+        );
+        describe_histogram!(
+            "storage_trim_duration",
+            Unit::Microseconds,
+            "how long it took to trim the dirty NSIDs"
+        );
+        describe_counter!(
+            "storage_trim_removed",
+            Unit::Count,
+            "how many records were removed during trim"
+        );
+        if reroll {
+            log::info!("reroll: resetting rollup cursor...");
+            insert_static_neu::<NewRollupCursorKey>(&self.global, Cursor::from_start())?;
+            log::info!("reroll: clearing trim cursors...");
+            let mut batch = self.keyspace.batch();
+            for kv in self
+                .global
+                .prefix(TrimCollectionCursorKey::from_prefix_to_db_bytes(
+                    &Default::default(),
+                )?)
+            {
+                let (k, _) = kv?;
+                batch.remove(&self.global, k);
             }
-            Ok(FjallBackground(self.clone()))
+            let n = batch.len();
+            batch.commit()?;
+            log::info!("reroll: cleared {n} trim cursors.");
         }
+        Ok(FjallBackground(self.clone()))
     }

     fn insert_batch<const LIMIT: usize>(
···
                     break;
                 }
             }
+            let dt = t0.elapsed();
+            log::trace!("finished trimming {n} nsids in {:?}: {total_danglers} dangling and {total_deleted} total removed.", dt);
+            histogram!("storage_trim_dirty_nsids").record(completed.len() as f64);
+            histogram!("storage_trim_duration").record(dt.as_micros() as f64);
+            counter!("storage_trim_removed", "dangling" => "true").increment(total_danglers as u64);
+            counter!("storage_trim_removed", "dangling" => "false").increment((total_deleted - total_danglers) as u64);
             for c in completed {
                 dirty_nsids.remove(&c);
             }
-            log::info!("finished trimming {n} nsids in {:?}: {total_danglers} dangling and {total_deleted} total removed.", t0.elapsed());
         },
     };
 }
```