Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm

cursor handling

Changed files
+156 -35
jetstream
src
events
+2 -3
jetstream/README.md
··· 1 - # jetstream-oxide 1 + # fork of the awesome jetstream-oxide 2 2 3 - [![Crate](https://img.shields.io/crates/v/jetstream-oxide.svg)](https://crates.io/crates/jetstream-oxide) 4 - [![docs.rs](https://docs.rs/jetstream-oxide/badge.svg)](https://docs.rs/jetstream-oxide/latest/jetstream_oxide) 3 + fork note: this readme is likely a bit out of date! i've been messing around with some apis. 5 4 6 5 A typed Rust library for easily interacting with and consuming the 7 6 Bluesky [Jetstream](https://github.com/bluesky-social/jetstream)
+76 -1
jetstream/src/events/mod.rs
··· 2 2 pub mod commit; 3 3 pub mod identity; 4 4 5 + use std::time::{ 6 + Duration, 7 + SystemTime, 8 + UNIX_EPOCH, 9 + }; 10 + 5 11 use serde::Deserialize; 6 12 7 13 use crate::exports; 8 14 15 + /// Opaque wrapper for the time_us cursor used by jetstream 16 + /// 17 + /// Generally, you should use a cursor 18 + #[derive(Deserialize, Debug, Clone)] 19 + pub struct Cursor(u64); 20 + 9 21 /// Basic data that is included with every event. 10 22 #[derive(Deserialize, Debug)] 11 23 pub struct EventInfo { 12 24 pub did: exports::Did, 13 - pub time_us: u64, 25 + pub time_us: Cursor, 14 26 pub kind: EventKind, 15 27 } 16 28 ··· 29 41 Identity, 30 42 Account, 31 43 } 44 + 45 + impl<R> JetstreamEvent<R> { 46 + pub fn cursor(&self) -> Cursor { 47 + match self { 48 + JetstreamEvent::Commit(commit::CommitEvent::Create { info, .. }) => { 49 + info.time_us.clone() 50 + } 51 + JetstreamEvent::Commit(commit::CommitEvent::Update { info, .. }) => { 52 + info.time_us.clone() 53 + } 54 + JetstreamEvent::Commit(commit::CommitEvent::Delete { info, .. }) => { 55 + info.time_us.clone() 56 + } 57 + JetstreamEvent::Identity(e) => e.info.time_us.clone(), 58 + JetstreamEvent::Account(e) => e.info.time_us.clone(), 59 + } 60 + } 61 + } 62 + 63 + impl Cursor { 64 + /// Get a cursor that will consume all available jetstream replay 65 + /// 66 + /// This sets the cursor to zero. 67 + /// 68 + /// Jetstream instances typically only have a few days of replay. 69 + pub fn from_start() -> Self { 70 + Self(0) 71 + } 72 + /// Get a cursor for a specific time 73 + /// 74 + /// Panics: if t is older than the unix epoch: Jan 1, 1970. 75 + /// 76 + /// If you want to receive all available jetstream replay (typically a few days), use 77 + /// .from_start() 78 + pub fn at(t: SystemTime) -> Self { 79 + let unix_dt = t 80 + .duration_since(UNIX_EPOCH) 81 + .expect("cannot set jetstream cursor earlier than unix epoch"); 82 + Self(unix_dt.as_micros() as u64) 83 + } 84 + /// Get a cursor rewound from now by this amount 85 + /// 86 + /// Panics: if d is greater than the time since the unix epoch: Jan 1, 1970. 87 + /// 88 + /// Jetstream instances typically only have a few days of replay. 89 + pub fn back_by(d: Duration) -> Self { 90 + Self::at(SystemTime::now() - d) 91 + } 92 + /// Get a Cursor from a raw u64 93 + /// 94 + /// For example, from a jetstream event's `time_us` field. 95 + pub fn from_raw_u64(time_us: u64) -> Self { 96 + Self(time_us) 97 + } 98 + /// Get the raw u64 value from this cursor. 99 + pub fn to_raw_u64(&self) -> u64 { 100 + self.0 101 + } 102 + /// Format the cursor value for use in a jetstream connection url querystring 103 + pub fn to_jetstream(&self) -> String { 104 + self.0.to_string() 105 + } 106 + }
+78 -31
jetstream/src/lib.rs
··· 4 4 5 5 use std::{ 6 6 io::{ 7 - Cursor, 7 + Cursor as IoCursor, 8 8 Read, 9 9 }, 10 10 marker::PhantomData, ··· 16 16 }; 17 17 18 18 use atrium_api::record::KnownRecord; 19 - use chrono::Utc; 20 19 use futures_util::{ 21 20 stream::StreamExt, 22 21 SinkExt, ··· 49 48 ConnectionError, 50 49 JetstreamEventError, 51 50 }, 52 - events::JetstreamEvent, 51 + events::{ 52 + Cursor, 53 + JetstreamEvent, 54 + }, 53 55 }; 54 56 55 57 /// The Jetstream endpoints officially provided by Bluesky themselves. ··· 167 169 pub wanted_dids: Vec<exports::Did>, 168 170 /// The compression algorithm to request and use for the WebSocket connection (if any). 169 171 pub compression: JetstreamCompression, 170 - /// An optional timestamp to begin playback from. 172 + /// Enable automatic cursor for auto-reconnect 171 173 /// 172 - /// An absent cursor or a cursor from the future will result in live-tail operation. 174 + /// By default, reconnects will never set a cursor for the connection, so a small number of 175 + /// events will always be dropped. 173 176 /// 174 - /// When reconnecting, use the time_us from your most recently processed event and maybe 175 - /// provide a negative buffer (i.e. subtract a few seconds) to ensure gapless playback. 176 - pub cursor: Option<chrono::DateTime<Utc>>, 177 + /// If you want gapless playback across reconnects, set this to `true`. If you always want 178 + /// the latest available events and can tolerate missing some: `false`. 179 + pub replay_on_reconnect: bool, 177 180 /// Maximum size of send channel for jetstream events. 178 181 /// 179 182 /// If your consuming task can't keep up with every new jetstream event in real-time, ··· 197 200 wanted_collections: Vec::new(), 198 201 wanted_dids: Vec::new(), 199 202 compression: JetstreamCompression::None, 200 - cursor: None, 203 + replay_on_reconnect: false, 201 204 channel_size: 4096, // a few seconds of firehose buffer 202 205 record_type: PhantomData, 203 206 } ··· 225 228 }, 226 229 ); 227 230 228 - let cursor = self 229 - .cursor 230 - .map(|c| ("cursor", c.timestamp_micros().to_string())); 231 - 232 231 let params = did_search_query 233 232 .chain(collection_search_query) 234 233 .chain(std::iter::once(compression)) 235 - .chain(cursor) 236 234 .collect::<Vec<(&str, String)>>(); 237 235 238 236 Url::parse_with_params(endpoint, params) ··· 276 274 /// A [JetstreamReceiver] is returned which can be used to respond to events. When all instances 277 275 /// of this receiver are dropped, the connection and task are automatically closed. 278 276 pub async fn connect(&self) -> Result<JetstreamReceiver<R>, ConnectionError> { 277 + self.base_connect(None).await 278 + } 279 + 280 + /// Connects to a Jetstream instance as defined in the [JetstreamConfig] with playback from a 281 + /// cursor 282 + /// 283 + /// A cursor from the future will result in live-tail operation. 284 + /// 285 + /// The cursor is only used for first successfull connection -- on auto-reconnect it will 286 + /// live-tail by default. Set `replay_on_reconnect: true` in the config if you need to 287 + /// receive every event, which will keep track of the last-seen cursor and reconnect from 288 + /// there. 289 + pub async fn connect_cursor( 290 + &self, 291 + cursor: Cursor, 292 + ) -> Result<JetstreamReceiver<R>, ConnectionError> { 293 + self.base_connect(Some(cursor)).await 294 + } 295 + 296 + async fn base_connect( 297 + &self, 298 + cursor: Option<Cursor>, 299 + ) -> Result<JetstreamReceiver<R>, ConnectionError> { 279 300 // We validate the config again for good measure. Probably not necessary but it can't hurt. 280 301 self.config 281 302 .validate() ··· 288 309 .construct_endpoint(&self.config.endpoint) 289 310 .map_err(ConnectionError::InvalidEndpoint)?; 290 311 312 + let replay_on_reconnect = self.config.replay_on_reconnect; 313 + 291 314 tokio::task::spawn(async move { 292 315 let max_retries = 30; 293 316 let base_delay_ms = 1_000; // 1 second ··· 295 318 let success_threshold_s = 15; // 15 seconds, retry count is reset if we were connected at least this long 296 319 297 320 let mut retry_attempt = 0; 321 + let mut connect_cursor = cursor; 298 322 loop { 299 323 let dict = DecoderDictionary::copy(JETSTREAM_ZSTD_DICTIONARY); 300 324 325 + let mut configured_endpoint = configured_endpoint.clone(); 326 + if let Some(ref cursor) = connect_cursor { 327 + configured_endpoint 328 + .query_pairs_mut() 329 + .append_pair("cursor", &cursor.to_jetstream()); 330 + } 331 + 332 + let mut last_cursor = connect_cursor.clone(); 333 + 301 334 retry_attempt += 1; 302 335 if let Ok((ws_stream, _)) = connect_async(&configured_endpoint).await { 303 336 let t_connected = Instant::now(); 304 - if let Err(e) = websocket_task(dict, ws_stream, send_channel.clone()).await { 337 + if let Err(e) = 338 + websocket_task(dict, ws_stream, send_channel.clone(), &mut last_cursor) 339 + .await 340 + { 305 341 log::error!("Jetstream closed after encountering error: {e:?}"); 306 342 } else { 307 343 log::error!("Jetstream connection closed cleanly"); 308 344 } 309 345 if t_connected.elapsed() > Duration::from_secs(success_threshold_s) { 310 346 retry_attempt = 0; 311 - continue; 312 347 } 313 348 } 314 349 315 350 if retry_attempt >= max_retries { 316 - eprintln!("max retries, bye"); 351 + log::error!("hit max retries, bye"); 317 352 break; 318 353 } 319 354 320 - eprintln!("will try to reconnect"); 355 + connect_cursor = if replay_on_reconnect { 356 + last_cursor 357 + } else { 358 + None 359 + }; 321 360 322 - // Exponential backoff 323 - let delay_ms = base_delay_ms * (2_u64.pow(retry_attempt)); 324 - 325 - log::error!("Connection failed, retrying in {delay_ms}ms..."); 326 - tokio::time::sleep(Duration::from_millis(delay_ms.min(max_delay_ms))).await; 327 - log::info!("Attempting to reconnect...") 361 + if retry_attempt > 0 { 362 + // Exponential backoff 363 + let delay_ms = base_delay_ms * (2_u64.pow(retry_attempt)); 364 + log::error!("Connection failed, retrying in {delay_ms}ms..."); 365 + tokio::time::sleep(Duration::from_millis(delay_ms.min(max_delay_ms))).await; 366 + log::info!("Attempting to reconnect..."); 367 + } 328 368 } 329 369 log::error!("Connection retries exhausted. Jetstream is disconnected."); 330 370 }); ··· 339 379 dictionary: DecoderDictionary<'_>, 340 380 ws: WebSocketStream<MaybeTlsStream<TcpStream>>, 341 381 send_channel: JetstreamSender<R>, 382 + last_cursor: &mut Option<Cursor>, 342 383 ) -> Result<(), JetstreamEventError> { 343 384 // TODO: Use the write half to allow the user to change configuration settings on the fly. 344 385 let (socket_write, mut socket_read) = ws.split(); ··· 373 414 Some(Ok(message)) => { 374 415 match message { 375 416 Message::Text(json) => { 376 - let event = serde_json::from_str(&json) 417 + let event: JetstreamEvent<R> = serde_json::from_str(&json) 377 418 .map_err(JetstreamEventError::ReceivedMalformedJSON)?; 419 + let event_cursor = event.cursor(); 378 420 379 421 if send_channel.send(event).await.is_err() { 380 422 // We can assume that all receivers have been dropped, so we can close 381 423 // the connection and exit the task. 382 424 log::info!( 383 - "All receivers for the Jetstream connection have been dropped, closing connection." 384 - ); 425 + "All receivers for the Jetstream connection have been dropped, closing connection." 426 + ); 385 427 closing_connection = true; 428 + } else if let Some(v) = last_cursor.as_mut() { 429 + *v = event_cursor; 386 430 } 387 431 } 388 432 Message::Binary(zstd_json) => { 389 - let mut cursor = Cursor::new(zstd_json); 433 + let mut cursor = IoCursor::new(zstd_json); 390 434 let mut decoder = zstd::stream::Decoder::with_prepared_dictionary( 391 435 &mut cursor, 392 436 &dictionary, ··· 398 442 .read_to_string(&mut json) 399 443 .map_err(JetstreamEventError::CompressionDecoderError)?; 400 444 401 - let event = serde_json::from_str(&json) 445 + let event: JetstreamEvent<R> = serde_json::from_str(&json) 402 446 .map_err(JetstreamEventError::ReceivedMalformedJSON)?; 447 + let event_cursor = event.cursor(); 403 448 404 449 if send_channel.send(event).await.is_err() { 405 450 // We can assume that all receivers have been dropped, so we can close 406 451 // the connection and exit the task. 407 452 log::info!( 408 - "All receivers for the Jetstream connection have been dropped, closing connection..." 409 - ); 453 + "All receivers for the Jetstream connection have been dropped, closing connection..." 454 + ); 410 455 closing_connection = true; 456 + } else if let Some(v) = last_cursor.as_mut() { 457 + *v = event_cursor; 411 458 } 412 459 } 413 460 Message::Ping(vec) => {