Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm

metrics! Add a metrics server and export previously-logged stats as metrics.

The logs are still there, but they've been moved to the `trace` level.
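To get the old output back temporarily, loosen the log filter for just this crate. A minimal sketch, assuming the usual env_logger-style `RUST_LOG` filter syntax (the systemd unit in the ops notes below ships with `RUST_LOG=info`):

```bash
# keep dependencies at info, but re-enable the moved ufos logs at trace
RUST_LOG=info,ufos=trace ./ufos --jetstream us-west-2 --data /mnt/ufos-db/
```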

+25 -3
Cargo.lock
```diff
···
  "links",
  "mediatype",
  "metrics",
- "metrics-exporter-prometheus",
+ "metrics-exporter-prometheus 0.16.2",
  "metrics-process",
  "num-format",
  "ratelimit",
···
 
 [[package]]
 name = "metrics"
-version = "0.24.1"
+version = "0.24.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a7deb012b3b2767169ff203fadb4c6b0b82b947512e5eb9e0b78c2e186ad9e3"
+checksum = "25dea7ac8057892855ec285c440160265225438c3c45072613c25a4b26e98ef5"
 dependencies = [
  "ahash",
  "portable-atomic",
···
  "metrics-util",
  "quanta",
  "thiserror 1.0.69",
+ "tokio",
+ "tracing",
+]
+
+[[package]]
+name = "metrics-exporter-prometheus"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df88858cd28baaaf2cfc894e37789ed4184be0e1351157aec7bf3c2266c793fd"
+dependencies = [
+ "base64 0.22.1",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "indexmap 2.9.0",
+ "ipnet",
+ "metrics",
+ "metrics-util",
+ "quanta",
+ "thiserror 2.0.12",
  "tokio",
  "tracing",
 ]
···
  "jetstream",
  "log",
  "lsm-tree",
+ "metrics",
+ "metrics-exporter-prometheus 0.17.0",
  "schemars",
  "semver",
  "serde",
```
+1 -1
cozy-setup (move to another repo).md
```diff
···
 
 - systemd unit for running: `sudo nano /etc/systemd/system/constellation.service`
 
-```toml
+```ini
 [Unit]
 Description=Constellation backlinks index
 After=network.target
```
+11
jetstream/src/events.rs
```diff
···
         let t: SystemTime = self.into();
         t.elapsed()
     }
+    /// Compute the age of the cursor vs the local clock
+    ///
+    /// Converts the resulting duration into an f64, which can be negative!
+    ///
+    /// Warning: this exploits the internal implementation detail of jetstream cursors
+    pub fn elapsed_micros_f64(&self) -> f64 {
+        match self.elapsed() {
+            Ok(d) => d.as_micros() as f64,
+            Err(e) => -(e.duration().as_micros() as f64),
+        }
+    }
     /// Get the immediate next cursor value
     ///
     /// This is possible for the implementation of jetstream cursors
```
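The doc comment hints at why this returns a raw `f64`: `SystemTime::elapsed()` fails when the timestamp is in the future, and negating that error's duration folds both cases into one signed lag value that can feed a gauge. A minimal usage sketch (hypothetical caller, not part of the diff):

```rust
use jetstream::events::Cursor;

// Positive = the cursor's event time is behind our clock (consumer lag);
// negative = the cursor is ahead of our clock (skew between hosts).
fn log_cursor_lag(cursor: &Cursor) {
    let age_us = cursor.elapsed_micros_f64();
    if age_us >= 0.0 {
        log::info!("cursor lags local clock by {age_us} µs");
    } else {
        log::info!("cursor is {} µs ahead of local clock", -age_us);
    }
}
```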
+196
ufos ops (move to micro-ops).md
```diff
+ufos ops
+
+btrfs snapshots: snapper
+
+```bash
+sudo apt install snapper
+sudo snapper -c ufos-db create-config /mnt/ufos-db
+
+# edit /etc/snapper/configs/ufos-db
+# change
+TIMELINE_MIN_AGE="1800"
+TIMELINE_LIMIT_HOURLY="10"
+TIMELINE_LIMIT_DAILY="10"
+TIMELINE_LIMIT_WEEKLY="0"
+TIMELINE_LIMIT_MONTHLY="10"
+TIMELINE_LIMIT_YEARLY="10"
+# to
+TIMELINE_MIN_AGE="1800"
+TIMELINE_LIMIT_HOURLY="22"
+TIMELINE_LIMIT_DAILY="4"
+TIMELINE_LIMIT_WEEKLY="0"
+TIMELINE_LIMIT_MONTHLY="0"
+TIMELINE_LIMIT_YEARLY="0"
+```
+
+this should be enough?
+
+list snapshots:
+
+```bash
+sudo snapper -c ufos-db list
+```
+
+systemd
+
+create file: `/etc/systemd/system/ufos.service`
+
+```ini
+[Unit]
+Description=UFOs-API
+After=network.target
+
+[Service]
+User=pi
+WorkingDirectory=/home/pi/
+ExecStart=/home/pi/ufos --jetstream us-west-2 --data /mnt/ufos-db/
+Environment="RUST_LOG=info"
+LimitNOFILE=16384
+Restart=always
+
+[Install]
+WantedBy=multi-user.target
+```
+
+then
+
+```bash
+sudo systemctl daemon-reload
+sudo systemctl enable ufos
+sudo systemctl start ufos
+```
+
+monitor with
+
+```bash
+journalctl -u ufos -f
+```
+
+make sure a backup dir exists
+
+```bash
+mkdir /home/pi/backup
+```
+
+mount the NAS
+
+```bash
+sudo mount.cifs "//truenas.local/folks data" /home/pi/backup -o user=phil,uid=pi
+```
+
+manual rsync
+
+```bash
+sudo rsync -ahP --delete /mnt/ufos-db/.snapshots/1/snapshot/ backup/ufos/
+```
+
+backup script sketch
+
+```bash
+NUM=$(sudo snapper --csvout -c ufos-db list --type single --columns number | tail -n1)
+sudo rsync -ahP --delete "/mnt/ufos-db/.snapshots/${NUM}/snapshot/" backup/ufos/
+```
+
+just crontab it?
+
+`sudo crontab -e`
+```bash
+0 1/6 * * * rsync -ahP --delete "/mnt/ufos-db/.snapshots/$(sudo snapper --csvout -c ufos-db list --columns number | tail -n1)/snapshot/" backup/ufos/
+```
+
+^^ try once initial backup is done
+
+
+--columns subvolume,number
+
+subvolume
+number
+
+
+
+
+gateway: follow constellation for nginx->prom thing
+
+config at `/etc/prometheus-nginxlog-exporter.hcl`
+
+before: `/etc/prometheus-nginxlog-exporter.hcl`
+
+```hcl
+listen {
+  port = 4044
+}
+
+namespace "nginx" {
+  source = {
+    files = [
+      "/var/log/nginx/constellation-access.log"
+    ]
+  }
+
+  format = "$remote_addr - $remote_user [$time_local] \"$request\" $status $upstream_cache_status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$http_x_forwarded_for\""
+
+  labels {
+    app = "constellation"
+  }
+
+  relabel "cache_status" {
+    from = "upstream_cache_status"
+  }
+}
+```
+
+after:
+
+```hcl
+listen {
+  port = 4044
+}
+
+namespace "constellation" {
+  source = {
+    files = [
+      "/var/log/nginx/constellation-access.log"
+    ]
+  }
+
+  format = "$remote_addr - $remote_user [$time_local] \"$request\" $status $upstream_cache_status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$http_x_forwarded_for\""
+
+  labels {
+    app = "constellation"
+  }
+
+  relabel "cache_status" {
+    from = "upstream_cache_status"
+  }
+
+  namespace_label = "vhost"
+  metrics_override = { prefix = "nginx" }
+}
+
+namespace "ufos" {
+  source = {
+    files = [
+      "/var/log/nginx/ufos-access.log"
+    ]
+  }
+
+  format = "$remote_addr - $remote_user [$time_local] \"$request\" $status $upstream_cache_status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$http_x_forwarded_for\""
+
+  labels {
+    app = "ufos"
+  }
+
+  relabel "cache_status" {
+    from = "upstream_cache_status"
+  }
+
+  namespace_label = "vhost"
+  metrics_override = { prefix = "nginx" }
+}
+```
+
+
+```bash
+systemctl start prometheus-nginxlog-exporter.service
+```
+
```
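Not part of the commit, but to tie the ops pieces together: a sketch of the Prometheus side, assuming one scrape job per exporter (the nginxlog exporter listens on 4044 per the hcl above; the ufos binary serves its own metrics on 8765, see main.rs below; hostnames are placeholders):

```yaml
# prometheus.yml fragment (hostnames are made up)
scrape_configs:
  - job_name: nginxlog
    static_configs:
      - targets: ["gateway.local:4044"]
  - job_name: ufos
    static_configs:
      - targets: ["ufos-host.local:8765"]
```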
+2
ufos/Cargo.toml
```diff
···
 jetstream = { path = "../jetstream" }
 log = "0.4.26"
 lsm-tree = "2.6.6"
+metrics = "0.24.2"
+metrics-exporter-prometheus = { version = "0.17.0", default-features = false, features = ["http-listener"] }
 schemars = { version = "0.8.22", features = ["raw_value", "chrono"] }
 semver = "1.0.26"
 serde = "1.0.219"
```
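`default-features = false` with only `http-listener` keeps the pull-style scrape endpoint and should drop the crate's push-gateway feature (and its HTTP-client dependencies), which ufos doesn't use.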
+37 -4
ufos/src/consumer.rs
```diff
···
     DefaultJetstreamEndpoints, JetstreamCompression, JetstreamConfig, JetstreamConnector,
     JetstreamReceiver,
 };
+use metrics::{
+    counter, describe_counter, describe_gauge, describe_histogram, gauge, histogram, Unit,
+};
 use std::mem;
 use std::time::Duration;
 use tokio::sync::mpsc::{channel, Receiver, Sender};
···
         batch_sender: Sender<LimitedBatch>,
         sketch_secret: SketchSecretPrefix,
     ) -> Self {
+        describe_counter!(
+            "batcher_batches_sent",
+            Unit::Count,
+            "how many batches of events were sent from Batcher to storage"
+        );
+        describe_gauge!(
+            "batcher_batch_age",
+            Unit::Microseconds,
+            "how old the last-sent batch was"
+        );
+        describe_gauge!(
+            "batcher_send_queue_capacity",
+            Unit::Count,
+            "how many spaces are available for batches in the send queue"
+        );
+        describe_histogram!(
+            "batcher_total_collections",
+            Unit::Count,
+            "how many collections are in this batch"
+        );
         let mut rate_limit = tokio::time::interval(std::time::Duration::from_millis(3));
         rate_limit.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
         Self {
···
     // holds up all consumer progress until it can send to the channel
     // use this when the current batch is too full to add more to it
     async fn send_current_batch_now(&mut self, small: bool, referrer: &str) -> anyhow::Result<()> {
+        let size_label = if small { "small" } else { "full" };
+        let queue_cap = self.batch_sender.capacity();
+
+        if let Some(cursor) = self.current_batch.initial_cursor {
+            gauge!("batcher_batch_age", "size" => size_label).set(cursor.elapsed_micros_f64());
+        }
+        histogram!("batcher_total_collections", "size" => size_label)
+            .record(self.current_batch.batch.total_collections() as f64);
+        gauge!("batcher_send_queue_capacity").set(queue_cap as f64);
+
         let beginning = match self.current_batch.initial_cursor.map(|c| c.elapsed()) {
             None => "unknown".to_string(),
             Some(Ok(t)) => format!("{:?}", t),
             Some(Err(e)) => format!("+{:?}", e.duration()),
         };
-        log::info!(
-            "sending batch now from {beginning}, {}, queue capacity: {}, referrer: {referrer}",
-            if small { "small" } else { "full" },
-            self.batch_sender.capacity(),
+        log::trace!(
+            "sending batch now from {beginning}, {size_label}, queue capacity: {queue_cap}, referrer: {referrer}",
         );
         let current = mem::take(&mut self.current_batch);
         self.rate_limit.tick().await;
         self.batch_sender
             .send_timeout(current.batch, Duration::from_secs_f64(SEND_TIMEOUT_S))
             .await?;
+        counter!("batcher_batches_sent", "size" => size_label, "referrer" => referrer.to_string())
+            .increment(1);
         Ok(())
     }
 }
```
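Once Prometheus scrapes these, a couple of illustrative queries (names come from the `describe_*` calls above; the histogram comes out as summary quantiles because of `set_quantiles` in main.rs below):

```bash
# spot-check the new batcher series on the exporter (port set in main.rs)
curl -s http://127.0.0.1:8765/metrics | grep '^batcher_'

# example PromQL once scraped:
#   sum by (size) (rate(batcher_batches_sent[5m]))   # batches/s, small vs full
#   batcher_send_queue_capacity                      # trending to 0 = storage is the bottleneck
```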
+43 -1
ufos/src/main.rs
```diff
···
 use clap::Parser;
 use jetstream::events::Cursor;
+use metrics::{describe_gauge, gauge, Unit};
+use metrics_exporter_prometheus::PrometheusBuilder;
 use std::path::PathBuf;
 use std::time::{Duration, SystemTime};
 use ufos::consumer;
···
 
     let stating = do_update_stuff(read_store);
 
+    install_metrics_server()?;
+
     tokio::select! {
         z = serving => log::warn!("serve task ended: {z:?}"),
         z = rolling => log::warn!("rollup task ended: {z:?}"),
···
     Ok(())
 }
 
+fn install_metrics_server() -> anyhow::Result<()> {
+    log::info!("installing metrics server...");
+    let host = [0, 0, 0, 0];
+    let port = 8765;
+    PrometheusBuilder::new()
+        .set_quantiles(&[0.5, 0.9, 0.99, 1.0])?
+        .set_bucket_duration(Duration::from_secs(60))?
+        .set_bucket_count(std::num::NonZero::new(10).unwrap()) // count * duration = 10 mins. stuff doesn't happen that fast here.
+        .set_enable_unit_suffix(false) // this seemed buggy for constellation (sometimes wouldn't engage)
+        .with_http_listener((host, port))
+        .install()?;
+    log::info!(
+        "metrics server installed! listening on http://{}.{}.{}.{}:{port}",
+        host[0],
+        host[1],
+        host[2],
+        host[3]
+    );
+    Ok(())
+}
+
 async fn do_update_stuff(read_store: impl StoreReader) {
+    describe_gauge!(
+        "persisted_cursor_age",
+        Unit::Microseconds,
+        "microseconds between our clock and the latest persisted event's cursor"
+    );
+    describe_gauge!(
+        "rollup_cursor_age",
+        Unit::Microseconds,
+        "microseconds between our clock and the latest rollup cursor"
+    );
     let started_at = std::time::SystemTime::now();
     let mut first_cursor = None;
     let mut first_rollup = None;
···
     started_at: SystemTime,
     now: SystemTime,
 ) {
+    if let Some(cursor) = latest_cursor {
+        gauge!("persisted_cursor_age").set(cursor.elapsed_micros_f64());
+    }
+    if let Some(cursor) = rollup_cursor {
+        gauge!("rollup_cursor_age").set(cursor.elapsed_micros_f64());
+    }
+
     let nice_dt_two_maybes = |earlier: Option<Cursor>, later: Option<Cursor>| match (earlier, later)
     {
         (Some(earlier), Some(later)) => match later.duration_since(&earlier) {
···
     let rollup_rate = rate(rollup_cursor, last_rollup, dt_real);
     let rollup_avg = rate(rollup_cursor, first_rollup, dt_real_total);
 
-    log::info!(
+    log::trace!(
         "cursor: {} behind (→{}, {cursor_rate}x, {cursor_avg}x avg). rollup: {} behind (→{}, {rollup_rate}x, {rollup_avg}x avg).",
         latest_cursor.map(|c| c.elapsed().map(nice_duration).unwrap_or("++".to_string())).unwrap_or("?".to_string()),
         nice_dt_two_maybes(last_cursor, latest_cursor),
```
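A quick smoke test of the new endpoint, assuming the defaults above (it binds 0.0.0.0:8765, so consider firewalling it if the box is reachable from outside):

```bash
# the cursor-age gauges are in microseconds; divide by 1e6 in PromQL
# for seconds, e.g. persisted_cursor_age / 1e6
curl -s http://127.0.0.1:8765/metrics | grep '_cursor_age'
```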
+41 -21
ufos/src/storage_fjall.rs
```diff
···
     Batch as FjallBatch, Config, Keyspace, PartitionCreateOptions, PartitionHandle, Snapshot,
 };
 use jetstream::events::Cursor;
+use metrics::{counter, describe_counter, describe_histogram, histogram, Unit};
 use std::collections::{HashMap, HashSet};
 use std::iter::Peekable;
 use std::ops::Bound;
···
 impl StoreWriter<FjallBackground> for FjallWriter {
     fn background_tasks(&mut self, reroll: bool) -> StorageResult<FjallBackground> {
         if self.bg_taken.swap(true, Ordering::SeqCst) {
-            Err(StorageError::BackgroundAlreadyStarted)
-        } else {
-            if reroll {
-                log::info!("reroll: resetting rollup cursor...");
-                insert_static_neu::<NewRollupCursorKey>(&self.global, Cursor::from_start())?;
-                log::info!("reroll: clearing trim cursors...");
-                let mut batch = self.keyspace.batch();
-                for kv in self
-                    .global
-                    .prefix(TrimCollectionCursorKey::from_prefix_to_db_bytes(
-                        &Default::default(),
-                    )?)
-                {
-                    let (k, _) = kv?;
-                    batch.remove(&self.global, k);
-                }
-                let n = batch.len();
-                batch.commit()?;
-                log::info!("reroll: cleared {n} trim cursors.");
-            }
-            Ok(FjallBackground(self.clone()))
-        }
+            return Err(StorageError::BackgroundAlreadyStarted);
+        }
+        describe_histogram!(
+            "storage_trim_dirty_nsids",
+            Unit::Count,
+            "number of NSIDs trimmed"
+        );
+        describe_histogram!(
+            "storage_trim_duration",
+            Unit::Microseconds,
+            "how long it took to trim the dirty NSIDs"
+        );
+        describe_counter!(
+            "storage_trim_removed",
+            Unit::Count,
+            "how many records were removed during trim"
+        );
+        if reroll {
+            log::info!("reroll: resetting rollup cursor...");
+            insert_static_neu::<NewRollupCursorKey>(&self.global, Cursor::from_start())?;
+            log::info!("reroll: clearing trim cursors...");
+            let mut batch = self.keyspace.batch();
+            for kv in self
+                .global
+                .prefix(TrimCollectionCursorKey::from_prefix_to_db_bytes(
+                    &Default::default(),
+                )?)
+            {
+                let (k, _) = kv?;
+                batch.remove(&self.global, k);
+            }
+            let n = batch.len();
+            batch.commit()?;
+            log::info!("reroll: cleared {n} trim cursors.");
+        }
+        Ok(FjallBackground(self.clone()))
     }
 
     fn insert_batch<const LIMIT: usize>(
···
                     break;
                 }
             }
+            let dt = t0.elapsed();
+            log::trace!("finished trimming {n} nsids in {:?}: {total_danglers} dangling and {total_deleted} total removed.", dt);
+            histogram!("storage_trim_dirty_nsids").record(completed.len() as f64);
+            histogram!("storage_trim_duration").record(dt.as_micros() as f64);
+            counter!("storage_trim_removed", "dangling" => "true").increment(total_danglers as u64);
+            counter!("storage_trim_removed", "dangling" => "false").increment((total_deleted - total_danglers) as u64);
             for c in completed {
                 dirty_nsids.remove(&c);
             }
-            log::info!("finished trimming {n} nsids in {:?}: {total_danglers} dangling and {total_deleted} total removed.", t0.elapsed());
         },
     };
 }
```
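And for the trim metrics, illustrative checks (assuming the same scrape setup; the `dangling` label split comes from the two `counter!` calls above):

```bash
curl -s http://127.0.0.1:8765/metrics | grep '^storage_trim'

# example PromQL:
#   sum by (dangling) (rate(storage_trim_removed[10m]))   # records removed/s
#   storage_trim_duration{quantile="0.99"} / 1e6          # p99 trim time in seconds
```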