Server tools to backfill, tail, mirror, and verify PLC logs
52
fork

Configure Feed

Select the types of activity you want to include in your feed.

pg copy in (deadlocks at end)

(or something)

+83 -273
+21 -10
src/bin/allegedly.rs
··· 1 - use allegedly::{Dt, FolderSource, HttpSource, backfill, bin_init, pages_to_weeks, poll_upstream}; 1 + use allegedly::{Db, Dt, ExportPage, FolderSource, HttpSource, backfill, bin_init, pages_to_weeks, poll_upstream, write_bulk as pages_to_pg}; 2 2 use clap::{Parser, Subcommand}; 3 3 use std::path::PathBuf; 4 4 use url::Url; ··· 28 28 #[arg(long)] 29 29 #[clap(default_value = "4")] 30 30 source_workers: usize, 31 + /// Bulk load into did-method-plc-compatible postgres instead of stdout 32 + /// 33 + /// Pass a postgres connection url like "postgresql://localhost:5432" 34 + #[arg(long)] 35 + to_postgres: Option<Url>, 31 36 }, 32 37 /// Scrape a PLC server, collecting ops into weekly bundles 33 38 /// ··· 58 63 }, 59 64 } 60 65 66 + async fn pages_to_stdout(rx: flume::Receiver<ExportPage>) -> Result<(), flume::RecvError> { 67 + loop { 68 + for op in rx.recv_async().await?.ops { 69 + println!("{op}") 70 + } 71 + } 72 + } 73 + 61 74 #[tokio::main] 62 75 async fn main() { 63 76 bin_init("main"); ··· 69 82 http, 70 83 dir, 71 84 source_workers, 85 + to_postgres, 72 86 } => { 73 87 let (tx, rx) = flume::bounded(1024); // big pages 74 88 tokio::task::spawn(async move { ··· 84 98 .unwrap(); 85 99 } 86 100 }); 87 - loop { 88 - for op in rx.recv_async().await.unwrap().ops { 89 - println!("{op}") 90 - } 101 + if let Some(url) = to_postgres { 102 + let db = Db::new(url.as_str()); 103 + pages_to_pg(db, rx).await.unwrap(); 104 + } else { 105 + pages_to_stdout(rx).await.unwrap(); 91 106 } 92 107 } 93 108 Commands::Bundle { ··· 109 124 let start_at = after.or_else(|| Some(chrono::Utc::now())); 110 125 let (tx, rx) = flume::bounded(0); // rendezvous, don't read ahead 111 126 tokio::task::spawn(async move { poll_upstream(start_at, url, tx).await.unwrap() }); 112 - loop { 113 - for op in rx.recv_async().await.unwrap().ops { 114 - println!("{op}") 115 - } 116 - } 127 + pages_to_stdout(rx).await.unwrap(); 117 128 } 118 129 } 119 130 }
-214
src/bin/backfill.rs
··· 1 - use clap::Parser; 2 - use std::time::Duration; 3 - use url::Url; 4 - 5 - use allegedly::{Db, Dt, ExportPage, Op, bin_init, poll_upstream}; 6 - 7 - const EXPORT_PAGE_QUEUE_SIZE: usize = 0; // rendezvous for now 8 - const WEEK_IN_SECONDS: u64 = 7 * 86400; 9 - 10 - #[derive(Parser)] 11 - struct Args { 12 - /// Upstream PLC server to mirror 13 - /// 14 - /// default: https://plc.directory 15 - #[arg(long, env)] 16 - #[clap(default_value = "https://plc.directory")] 17 - upstream: Url, 18 - /// Bulk export source prefix 19 - /// 20 - /// Must be a prefix for urls ending with {WEEK_TIMESTAMP}.jsonl.gz 21 - /// 22 - /// default: https://plc.t3.storage.dev/plc.directory/ 23 - /// 24 - /// pass "off" to skip fast bulk backfilling 25 - #[arg(long, env)] 26 - #[clap(default_value = "https://plc.t3.storage.dev/plc.directory/")] 27 - upstream_bulk: Url, 28 - /// The oldest available bulk upstream export timestamp 29 - /// 30 - /// Must be a week-truncated unix timestamp 31 - /// 32 - /// plc.directory's oldest week is `1668643200`; you probably don't want to change this. 33 - #[arg(long, env)] 34 - #[clap(default_value = "1668643200")] 35 - bulk_epoch: u64, 36 - /// Mirror PLC's postgres database 37 - /// 38 - /// URI string with credentials etc 39 - #[arg(long, env)] 40 - postgres: String, 41 - } 42 - 43 - async fn bulk_backfill((_upstream, epoch): (Url, u64), _tx: flume::Sender<ExportPage>) { 44 - let immutable_cutoff = std::time::SystemTime::now() - Duration::from_secs((7 + 4) * 86400); 45 - let immutable_ts = (immutable_cutoff.duration_since(std::time::SystemTime::UNIX_EPOCH)) 46 - .unwrap() 47 - .as_secs(); 48 - let _immutable_week = (immutable_ts / WEEK_IN_SECONDS) * WEEK_IN_SECONDS; 49 - let _week = epoch; 50 - let _week_n = 0; 51 - todo!(); 52 - // while week < immutable_week { 53 - // log::info!("backfilling week {week_n} ({week})"); 54 - // let url = upstream.join(&format!("{week}.jsonl.gz")).unwrap(); 55 - // week_to_pages(url, tx.clone()).await.unwrap(); 56 - // week_n += 1; 57 - // week += WEEK_IN_SECONDS; 58 - // } 59 - } 60 - 61 - async fn export_upstream( 62 - upstream: Url, 63 - bulk: (Url, u64), 64 - tx: flume::Sender<ExportPage>, 65 - pg_client: tokio_postgres::Client, 66 - ) { 67 - let latest = get_latest(&pg_client).await; 68 - 69 - if latest.is_none() { 70 - bulk_backfill(bulk, tx.clone()).await; 71 - } 72 - let mut upstream = upstream; 73 - upstream.set_path("/export"); 74 - poll_upstream(latest, upstream, tx).await.unwrap(); 75 - } 76 - 77 - async fn write_pages( 78 - rx: flume::Receiver<ExportPage>, 79 - mut pg_client: tokio_postgres::Client, 80 - ) -> Result<(), anyhow::Error> { 81 - // TODO: one big upsert at the end from select distinct on the other table 82 - 83 - // let upsert_did = &pg_client 84 - // .prepare( 85 - // r#" 86 - // INSERT INTO dids (did) VALUES ($1) 87 - // ON CONFLICT DO NOTHING"#, 88 - // ) 89 - // .await 90 - // .unwrap(); 91 - 92 - let insert_op = &pg_client 93 - .prepare( 94 - r#" 95 - INSERT INTO operations (did, operation, cid, nullified, "createdAt") 96 - VALUES ($1, $2, $3, $4, $5) 97 - ON CONFLICT (did, cid) DO UPDATE 98 - SET nullified = excluded.nullified, 99 - "createdAt" = excluded."createdAt" 100 - WHERE operations.nullified = excluded.nullified 101 - OR operations."createdAt" = excluded."createdAt""#, 102 - ) // idea: op is provable via cid, so leave it out. after did/cid (pk) that leaves nullified and createdAt 103 - // that we want to notice changing. 104 - // normal insert: no conflict, rows changed = 1 105 - // conflict (exact match): where clause passes, rows changed = 1 106 - // conflict (mismatch): where clause fails, rows changed = 0 (detect this and warn!) 107 - .await 108 - .unwrap(); 109 - 110 - while let Ok(page) = rx.recv_async().await { 111 - log::trace!("got a page..."); 112 - 113 - let tx = pg_client.transaction().await.unwrap(); 114 - 115 - // TODO: probably figure out postgres COPY IN 116 - // for now just write everything into a transaction 117 - 118 - log::trace!("setting up inserts..."); 119 - for op_line in page 120 - .ops 121 - .into_iter() 122 - .flat_map(|s| { 123 - s.replace("}{", "}\n{") 124 - .split('\n') 125 - .map(|s| s.trim()) 126 - .map(Into::into) 127 - .collect::<Vec<String>>() 128 - }) 129 - .filter(|s| !s.is_empty()) 130 - { 131 - let Ok(op) = serde_json::from_str::<Op>(&op_line) 132 - .inspect_err(|e| log::error!("failing! at the {op_line}! {e}")) 133 - else { 134 - log::error!("ayeeeee just ignoring this error for now......"); 135 - continue; 136 - }; 137 - // let client = &tx; 138 - 139 - // client.execute(upsert_did, &[&op.did]).await.unwrap(); 140 - 141 - // let sp = tx.savepoint("op").await.unwrap(); 142 - let inserted = tx 143 - .execute( 144 - insert_op, 145 - &[ 146 - &op.did, 147 - &tokio_postgres::types::Json(op.operation), 148 - &op.cid, 149 - &op.nullified, 150 - &op.created_at, 151 - ], 152 - ) 153 - .await 154 - .unwrap(); 155 - if inserted != 1 { 156 - log::warn!( 157 - "possible log modification: {inserted} rows changed after upserting {op:?}" 158 - ); 159 - } 160 - // { 161 - // if e.code() != Some(&tokio_postgres::error::SqlState::UNIQUE_VIOLATION) { 162 - // anyhow::bail!(e); 163 - // } 164 - // // TODO: assert that the row has not changed 165 - // log::warn!("ignoring dup"); 166 - // } 167 - } 168 - 169 - tx.commit().await.unwrap(); 170 - } 171 - Ok(()) 172 - } 173 - 174 - async fn get_latest(pg_client: &tokio_postgres::Client) -> Option<Dt> { 175 - pg_client 176 - .query_opt( 177 - r#"SELECT "createdAt" FROM operations 178 - ORDER BY "createdAt" DESC LIMIT 1"#, 179 - &[], 180 - ) 181 - .await 182 - .unwrap() 183 - .map(|r| r.get(0)) 184 - } 185 - 186 - #[tokio::main] 187 - async fn main() -> anyhow::Result<()> { 188 - bin_init("main"); 189 - let args = Args::parse(); 190 - let db = Db::new(&args.postgres); 191 - let (tx, rx) = flume::bounded(EXPORT_PAGE_QUEUE_SIZE); 192 - 193 - log::trace!("connecting postgres for export task..."); 194 - let pg_client = db.connect().await?; 195 - let export_task = tokio::task::spawn(export_upstream( 196 - args.upstream, 197 - (args.upstream_bulk, args.bulk_epoch), 198 - tx, 199 - pg_client, 200 - )); 201 - 202 - log::trace!("connecting postgres for writer task..."); 203 - let pg_client = db.connect().await?; 204 - let writer_task = tokio::task::spawn(write_pages(rx, pg_client)); 205 - 206 - tokio::select! { 207 - z = export_task => log::warn!("export task ended: {z:?}"), 208 - z = writer_task => log::warn!("writer task ended: {z:?}"), 209 - }; 210 - 211 - log::error!("todo: shutdown"); 212 - 213 - Ok(()) 214 - }
-47
src/bin/get_backfill_chunk_adsf.rs
··· 1 - use allegedly::{HttpSource, Week, week_to_pages}; 2 - use std::io::Write; 3 - 4 - #[tokio::main] 5 - async fn main() { 6 - let url: url::Url = "https://plc.t3.storage.dev/plc.directory/".parse().unwrap(); 7 - let source = HttpSource(url); 8 - // let source = FolderSource("./weekly/".into()); 9 - let week = Week::from_n(1699488000); 10 - 11 - let (tx, rx) = flume::bounded(32); 12 - 13 - tokio::task::spawn(async move { 14 - week_to_pages(source, week, tx).await.unwrap(); 15 - }); 16 - 17 - let mut n = 0; 18 - 19 - print!("receiving"); 20 - while let Ok(page) = rx.recv_async().await { 21 - print!("."); 22 - std::io::stdout().flush().unwrap(); 23 - n += page.ops.len(); 24 - } 25 - println!(); 26 - 27 - println!("bye ({n})"); 28 - 29 - // let reader = CLIENT 30 - // .get("https://plc.t3.storage.dev/plc.directory/1699488000.jsonl.gz") 31 - // // .get("https://plc.t3.storage.dev/plc.directory/1669248000.jsonl.gz") 32 - // .send() 33 - // .await 34 - // .unwrap() 35 - // .error_for_status() 36 - // .unwrap() 37 - // .bytes_stream() 38 - // .map_err(io::Error::other) 39 - // .into_async_read(); 40 - 41 - // let decoder = GzipDecoder::new(io::BufReader::new(reader)); 42 - // let mut chunks = io::BufReader::new(decoder).lines().chunks(1000); 43 - // while let Some(ref _chunk) = chunks.next().await { 44 - // print!("."); 45 - // } 46 - // println!(); 47 - }
+1 -1
src/lib.rs
··· 8 8 9 9 pub use backfill::backfill; 10 10 pub use client::CLIENT; 11 - pub use plc_pg::Db; 11 + pub use plc_pg::{Db, write_bulk}; 12 12 pub use poll::{get_page, poll_upstream}; 13 13 pub use weekly::{BundleSource, FolderSource, HttpSource, Week, pages_to_weeks, week_to_pages}; 14 14
+61 -1
src/plc_pg.rs
··· 1 - use tokio_postgres::{Client, Error as PgError, NoTls, connect}; 1 + use crate::{ExportPage, Op}; 2 + use tokio_postgres::{Client, types::{Type, Json}, Error as PgError, NoTls, connect, binary_copy::BinaryCopyInWriter}; 3 + use std::pin::pin; 4 + 2 5 3 6 /// a little tokio-postgres helper 7 + /// 8 + /// it's clone for easiness. it doesn't share any resources underneath after 9 + /// cloning at all so it's not meant for 4 10 #[derive(Debug, Clone)] 5 11 pub struct Db { 6 12 pg_uri: String, ··· 29 35 Ok(client) 30 36 } 31 37 } 38 + 39 + pub async fn write_bulk( 40 + db: Db, 41 + pages: flume::Receiver<ExportPage>, 42 + ) -> Result<(), PgError> { 43 + let mut client = db.connect().await?; 44 + let tx = client.transaction().await?; 45 + 46 + tx 47 + .execute(r#" 48 + CREATE TABLE backfill ( 49 + did text not null, 50 + cid text not null, 51 + operation jsonb not null, 52 + nullified boolean not null, 53 + createdAt timestamptz not null 54 + )"#, &[]) 55 + .await?; 56 + 57 + 58 + let types = &[ 59 + Type::TEXT, 60 + Type::TEXT, 61 + Type::JSONB, 62 + Type::BOOL, 63 + Type::TIMESTAMPTZ, 64 + ]; 65 + 66 + let sync = tx.copy_in("COPY backfill FROM STDIN BINARY").await?; 67 + let mut writer = pin!(BinaryCopyInWriter::new(sync, types)); 68 + 69 + while let Ok(page) = pages.recv_async().await { 70 + for s in page.ops { 71 + let Ok(op) = serde_json::from_str::<Op>(&s) else { 72 + log::warn!("ignoring unparseable op: {s:?}"); 73 + continue; 74 + }; 75 + writer.as_mut().write(&[ 76 + &op.did, 77 + &op.cid, 78 + &Json(op.operation), 79 + &op.nullified, 80 + &op.created_at, 81 + ]).await?; 82 + } 83 + } 84 + 85 + let n = writer.as_mut().finish().await?; 86 + log::info!("copied in {n} rows"); 87 + 88 + tx.commit().await?; 89 + 90 + Ok(()) 91 + }