Fast and robust atproto CAR file processing in rust
at main 4.5 kB view raw
1//! Low-level types for parsing raw atproto MST CARs 2//! 3//! The primary aim is to work through the **tree** structure. Non-node blocks 4//! are left as raw bytes, for upper levels to parse into DAG-CBOR or whatever. 5 6use ipld_core::cid::Cid; 7use serde::Deserialize; 8 9/// The top-level data object in a repository's tree is a signed commit. 10#[derive(Debug, Deserialize)] 11// #[serde(deny_unknown_fields)] 12pub struct Commit { 13 /// the account DID associated with the repo, in strictly normalized form 14 /// (eg, lowercase as appropriate) 15 pub did: String, 16 /// fixed value of 3 for this repo format version 17 pub version: u64, 18 /// pointer to the top of the repo contents tree structure (MST) 19 pub data: Cid, 20 /// revision of the repo, used as a logical clock. 21 /// 22 /// TID format. Must increase monotonically. Recommend using current 23 /// timestamp as TID; rev values in the "future" (beyond a fudge factor) 24 /// should be ignored and not processed 25 pub rev: String, 26 /// pointer (by hash) to a previous commit object for this repository. 27 /// 28 /// Could be used to create a chain of history, but largely unused (included 29 /// for v2 backwards compatibility). In version 3 repos, this field must 30 /// exist in the CBOR object, but is virtually always null. NOTE: previously 31 /// specified as nullable and optional, but this caused interoperability 32 /// issues. 33 pub prev: Option<Cid>, 34 /// cryptographic signature of this commit, as raw bytes 35 #[serde(with = "serde_bytes")] 36 pub sig: Vec<u8>, 37} 38 39/// MST node data schema 40#[derive(Debug, Deserialize, PartialEq)] 41#[serde(deny_unknown_fields)] 42pub(crate) struct Node { 43 /// link to sub-tree Node on a lower level and with all keys sorting before 44 /// keys at this node 45 #[serde(rename = "l")] 46 pub left: Option<Cid>, 47 /// ordered list of TreeEntry objects 48 /// 49 /// atproto MSTs have a fanout of 4, so there can be max 4 entries. 50 #[serde(rename = "e")] 51 pub entries: Vec<Entry>, // maybe we can do [Option<Entry>; 4]? 52} 53 54impl Node { 55 /// test if a block could possibly be a node 56 /// 57 /// we can't eagerly decode records except where we're *sure* they cannot be 58 /// an mst node (and even then we can only attempt) because you can't know 59 /// with certainty what a block is supposed to be without actually walking 60 /// the tree. 61 /// 62 /// so if a block *could be* a node, any record converter must postpone 63 /// processing. if it turns out it happens to be a very node-looking record, 64 /// well, sorry, it just has to only be processed later when that's known. 65 pub(crate) fn could_be(bytes: impl AsRef<[u8]>) -> bool { 66 const NODE_FINGERPRINT: [u8; 3] = [ 67 0xA2, // map length 2 (for "l" and "e" keys) 68 0x61, // text length 1 69 b'e', // "e" before "l" because map keys have to be lex-sorted 70 // 0x8?: "e" has array (0x100 upper 3 bits) of some length 71 ]; 72 let bytes = bytes.as_ref(); 73 bytes.starts_with(&NODE_FINGERPRINT) 74 && bytes 75 .get(3) 76 .map(|b| b & 0b1110_0000 == 0x80) 77 .unwrap_or(false) 78 } 79 80 /// Check if a node has any entries 81 /// 82 /// An empty repository with no records is represented as a single MST node 83 /// with an empty array of entries. This is the only situation in which a 84 /// tree may contain an empty leaf node which does not either contain keys 85 /// ("entries") or point to a sub-tree containing entries. 86 pub(crate) fn is_empty(&self) -> bool { 87 self.left.is_none() && self.entries.is_empty() 88 } 89} 90 91/// TreeEntry object 92#[derive(Debug, Deserialize, PartialEq)] 93#[serde(deny_unknown_fields)] 94pub(crate) struct Entry { 95 /// count of bytes shared with previous TreeEntry in this Node (if any) 96 #[serde(rename = "p")] 97 pub prefix_len: usize, 98 /// remainder of key for this TreeEntry, after "prefixlen" have been removed 99 #[serde(rename = "k", with = "serde_bytes")] 100 pub keysuffix: Vec<u8>, // can we String this here? 101 /// link to the record data (CBOR) for this entry 102 #[serde(rename = "v")] 103 pub value: Cid, 104 /// link to a sub-tree Node at a lower level 105 /// 106 /// the lower level must have keys sorting after this TreeEntry's key (to 107 /// the "right"), but before the next TreeEntry's key in this Node (if any) 108 #[serde(rename = "t")] 109 pub tree: Option<Cid>, 110}