···87878888 /// DEPRECATED: Unused
8989 pub rebase: bool,
9090-9191- /// Debug: block sources for validation analysis
9292- #[cfg(debug_assertions)]
9393- pub block_sources: BTreeMap<IpldCid, String>,
9494-9595- #[cfg(debug_assertions)]
9696- pub excluded_blocks: BTreeMap<IpldCid, Vec<u8>>, // blocks we skipped
9797-9898- #[cfg(debug_assertions)]
9999- pub excluded_metadata: BTreeMap<IpldCid, Vec<String>>, // context about excluded blocks
10090}
1019110292/// A repository operation (mutation of a single record)
···203193 blobs: self.blobs.into_iter().map(|b| b.into_static()).collect(),
204194 too_big: self.too_big,
205195 rebase: self.rebase,
206206- #[cfg(debug_assertions)]
207207- block_sources: self.block_sources,
208208- #[cfg(debug_assertions)]
209209- excluded_blocks: self.excluded_blocks,
210210- #[cfg(debug_assertions)]
211211- excluded_metadata: self.excluded_metadata,
212196 }
213197 }
214198}
···234218use crate::mst::{Mst, VerifiedWriteOp};
235219use crate::storage::{BlockStore, LayeredBlockStore, MemoryBlockStore};
236220use cid::Cid as IpldCid;
237237-use std::collections::BTreeMap;
238221use std::sync::Arc;
239222240223impl<'a> FirehoseCommit<'a> {
···360343 // 2. Parse CAR blocks from the firehose message into temporary storage
361344 let parsed = parse_car_bytes(&self.blocks).await?;
362345363363- #[cfg(debug_assertions)]
364364- let provided_blocks = parsed
365365- .blocks
366366- .keys()
367367- .cloned()
368368- .collect::<std::collections::HashSet<_>>();
369369- #[cfg(debug_assertions)]
370370- let accessed_blocks = Arc::new(std::sync::RwLock::new(std::collections::HashSet::new()));
371371-372372- #[cfg(debug_assertions)]
373373- let missing_blocks = Arc::new(std::sync::RwLock::new(Vec::new()));
374374-375375- #[cfg(debug_assertions)]
376376- let block_categories = self.block_sources.clone();
377377-378378- #[cfg(debug_assertions)]
379379- let excluded_blocks_ref = self.excluded_blocks.clone();
380380-381346 let temp_storage = Arc::new(MemoryBlockStore::new_from_blocks(parsed.blocks));
382347383383- #[cfg(debug_assertions)]
384384- let tracking_storage = {
385385- use crate::storage::BlockStore;
386386- #[derive(Clone)]
387387- struct TrackingStorage {
388388- inner: Arc<MemoryBlockStore>,
389389- accessed: Arc<std::sync::RwLock<std::collections::HashSet<IpldCid>>>,
390390- missing: Arc<std::sync::RwLock<Vec<IpldCid>>>,
391391- excluded: BTreeMap<IpldCid, Vec<u8>>,
392392- }
393393- impl BlockStore for TrackingStorage {
394394- async fn get(&self, cid: &IpldCid) -> Result<Option<Bytes>> {
395395- self.accessed.write().unwrap().insert(*cid);
396396- let result = self.inner.get(cid).await?;
397397-398398- if result.is_none() {
399399- // Check if this block was excluded
400400- if let Some(excluded_block) = self.excluded.get(cid) {
401401- self.missing.write().unwrap().push(*cid);
402402- eprintln!(
403403- "[MISS] Block {} was EXCLUDED but needed during validation",
404404- cid
405405- );
406406- // Return the excluded block so validation can continue
407407- return Ok(Some(Bytes::copy_from_slice(&excluded_block)));
408408- } else {
409409- self.missing.write().unwrap().push(*cid);
410410- eprintln!(
411411- "[MISS] Block {} not found (never seen during commit creation)",
412412- cid
413413- );
414414- }
415415- }
416416-417417- Ok(result)
418418- }
419419- async fn put(&self, data: &[u8]) -> Result<IpldCid> {
420420- self.inner.put(data).await
421421- }
422422- async fn has(&self, cid: &IpldCid) -> Result<bool> {
423423- self.inner.has(cid).await
424424- }
425425- async fn put_many(
426426- &self,
427427- blocks: impl IntoIterator<Item = (IpldCid, Bytes)> + Send,
428428- ) -> Result<()> {
429429- self.inner.put_many(blocks).await
430430- }
431431- async fn get_many(&self, cids: &[IpldCid]) -> Result<Vec<Option<Bytes>>> {
432432- self.inner.get_many(cids).await
433433- }
434434- async fn apply_commit(&self, commit: crate::repo::CommitData) -> Result<()> {
435435- self.inner.apply_commit(commit).await
436436- }
437437- }
438438- Arc::new(TrackingStorage {
439439- inner: temp_storage.clone(),
440440- accessed: accessed_blocks.clone(),
441441- missing: missing_blocks.clone(),
442442- excluded: excluded_blocks_ref,
443443- })
444444- };
445445-446446- #[cfg(not(debug_assertions))]
447447- let tracking_storage = temp_storage.clone();
448448-449348 // 3. Extract and verify commit object from temporary storage
450349 let commit_cid: IpldCid = self
451350 .commit
452351 .to_ipld()
453352 .map_err(|e| RepoError::invalid_cid_conversion(e, "commit CID"))?;
454454- let commit_bytes = tracking_storage
353353+ let commit_bytes = temp_storage
455354 .get(&commit_cid)
456355 .await?
457356 .ok_or_else(|| RepoError::not_found("commit block", &commit_cid))?;
···474373 // 5. Load new MST from commit.data (claimed result)
475374 let expected_root = *commit.data();
476375477477- let mut new_mst = Mst::load(tracking_storage, expected_root, None);
376376+ let mut new_mst = Mst::load(temp_storage, expected_root, None);
478377479378 let verified_ops = self
480379 .ops
···507406 "MST root mismatch: expected {}, got {}",
508407 prev_data_cid, computed_root
509408 )));
510510- }
511511-512512- #[cfg(debug_assertions)]
513513- {
514514- let accessed = accessed_blocks.read().unwrap();
515515- let missing = missing_blocks.read().unwrap();
516516- let unused: Vec<_> = provided_blocks.difference(&*accessed).copied().collect();
517517-518518- println!("[validation stats]");
519519- println!(" provided: {} blocks", provided_blocks.len());
520520- println!(" accessed: {} blocks", accessed.len());
521521-522522- if !missing.is_empty() {
523523- println!(
524524- " MISSING: {} blocks NEEDED but not provided!",
525525- missing.len()
526526- );
527527-528528- // Show operation breakdown for this commit
529529- let mut op_counts: BTreeMap<&str, usize> = BTreeMap::new();
530530- for op in &self.ops {
531531- *op_counts.entry(op.action.as_ref()).or_insert(0) += 1;
532532- }
533533- println!(" operations in this commit:");
534534- for (action, count) in op_counts {
535535- println!(" {}: {}", action, count);
536536- }
537537-538538- println!(" missing block CIDs:");
539539- for cid in missing.iter() {
540540- if self.excluded_blocks.contains_key(cid) {
541541- println!(" {} (was excluded)", cid);
542542- if let Some(metadata) = self.excluded_metadata.get(cid) {
543543- for context in metadata {
544544- println!(" -> {}", context);
545545- }
546546- }
547547- } else {
548548- println!(" {} (never seen)", cid);
549549- }
550550- }
551551- }
552552-553553- if !unused.is_empty() {
554554- println!(
555555- " UNUSED: {} blocks ({}%)",
556556- unused.len(),
557557- (unused.len() * 100) / provided_blocks.len()
558558- );
559559-560560- // Show breakdown by category
561561- let mut category_stats: BTreeMap<&str, (usize, usize)> = BTreeMap::new();
562562- for (cid, category) in &block_categories {
563563- let stats = category_stats.entry(category).or_insert((0, 0));
564564- stats.0 += 1; // total provided
565565- if accessed.contains(cid) {
566566- stats.1 += 1; // accessed
567567- }
568568- }
569569-570570- println!("\n breakdown by category:");
571571- for (category, (total, accessed_count)) in category_stats {
572572- let unused_count = total - accessed_count;
573573- let unused_pct = if total > 0 {
574574- (unused_count * 100) / total
575575- } else {
576576- 0
577577- };
578578- println!(
579579- " {}: {} total, {} accessed, {} unused ({}%)",
580580- category, total, accessed_count, unused_count, unused_pct
581581- );
582582- }
583583- } else {
584584- println!(" ✓ all blocks were used");
585585- }
586409 }
587410588411 Ok(expected_root)
-47
crates/jacquard-repo/src/mst/cursor.rs
···77use cid::Cid as IpldCid;
88use smol_str::SmolStr;
991010-#[cfg(debug_assertions)]
1111-use std::collections::HashSet;
1212-#[cfg(debug_assertions)]
1313-use std::sync::{Arc, RwLock};
1414-1510/// Position within an MST traversal
1611#[derive(Debug, Clone)]
1712pub enum CursorPosition<S: BlockStore> {
···70657166 /// Current position in traversal
7267 current: CursorPosition<S>,
7373-7474- /// Track CIDs accessed during traversal (debug only)
7575- #[cfg(debug_assertions)]
7676- accessed_cids: Option<Arc<RwLock<HashSet<IpldCid>>>>,
7768}
78697970impl<S: BlockStore + Sync + 'static> MstCursor<S> {
···8576 Self {
8677 path: Vec::new(),
8778 current: CursorPosition::Tree { mst: root },
8888- #[cfg(debug_assertions)]
8989- accessed_cids: None,
9090- }
9191- }
9292-9393- /// Create new cursor with dirty tracking enabled
9494- ///
9595- /// Records all CIDs accessed during traversal in the provided set.
9696- #[cfg(debug_assertions)]
9797- pub fn new_with_tracking(root: Mst<S>, tracking: Arc<RwLock<HashSet<IpldCid>>>) -> Self {
9898- Self {
9999- path: Vec::new(),
100100- current: CursorPosition::Tree { mst: root },
101101- accessed_cids: Some(tracking),
10279 }
10380 }
10481···126103 /// If at the root level (before stepping in), returns root's layer + 1.
127104 pub async fn layer(&self) -> Result<usize> {
128105 if let Some((walking_node, _, _)) = self.path.last() {
129129- // Track CID access
130130- #[cfg(debug_assertions)]
131131- if let Some(ref tracking) = self.accessed_cids {
132132- if let Ok(cid) = walking_node.get_pointer().await {
133133- tracking.write().unwrap().insert(cid);
134134- }
135135- }
136136-137106 // We're inside a node - return its layer
138107 walking_node.get_layer().await
139108 } else {
···142111 // is one layer higher than being "inside" the root
143112 match &self.current {
144113 CursorPosition::Tree { mst } => {
145145- // Track CID access
146146- #[cfg(debug_assertions)]
147147- if let Some(ref tracking) = self.accessed_cids {
148148- if let Ok(cid) = mst.get_pointer().await {
149149- tracking.write().unwrap().insert(cid);
150150- }
151151- }
152152-153114 let root_layer = mst.get_layer().await?;
154115 Ok(root_layer + 1)
155116 }
···225186226187 /// Descend into a tree node
227188 async fn step_into(&mut self, mst: Mst<S>) -> Result<()> {
228228- // Track CID access
229229- #[cfg(debug_assertions)]
230230- if let Some(ref tracking) = self.accessed_cids {
231231- if let Ok(cid) = mst.get_pointer().await {
232232- tracking.write().unwrap().insert(cid);
233233- }
234234- }
235235-236189 let entries = mst.get_entries().await?;
237190238191 if entries.is_empty() {
+1-40
crates/jacquard-repo/src/mst/diff.rs
···44use std::future::Future;
55use std::pin::Pin;
6677-#[cfg(debug_assertions)]
88-use std::collections::HashSet;
99-#[cfg(debug_assertions)]
1010-use std::sync::{Arc, RwLock};
1111-127use super::cursor::{CursorPosition, MstCursor};
138use super::tree::Mst;
149use super::util::serialize_node_data;
···6560 /// When modifying a tree, old MST nodes along changed paths become unreachable.
6661 /// This tracks those nodes for garbage collection.
6762 pub removed_mst_blocks: Vec<IpldCid>,
6868-6969- /// CIDs accessed from old tree during diff (debug only)
7070- ///
7171- /// Tracks all blocks touched when walking the old tree during diff.
7272- /// This is the precise set of blocks needed for validation.
7373- #[cfg(debug_assertions)]
7474- pub old_tree_accessed: Vec<IpldCid>,
7575-7676- /// CIDs accessed from new tree during diff (debug only)
7777- #[cfg(debug_assertions)]
7878- pub new_tree_accessed: Vec<IpldCid>,
7963}
80648165use super::tree::VerifiedWriteOp;
···9175 removed_cids: Vec::new(),
9276 new_mst_blocks: BTreeMap::new(),
9377 removed_mst_blocks: Vec::new(),
9494- #[cfg(debug_assertions)]
9595- old_tree_accessed: Vec::new(),
9696- #[cfg(debug_assertions)]
9797- new_tree_accessed: Vec::new(),
9878 }
9979 }
10080···254234 return Ok(());
255235 }
256236257257- // CIDs differ - use cursors to walk both trees with tracking
258258- #[cfg(debug_assertions)]
259259- let old_tracking = Arc::new(RwLock::new(HashSet::new()));
260260- #[cfg(debug_assertions)]
261261- let new_tracking = Arc::new(RwLock::new(HashSet::new()));
262262-263263- #[cfg(debug_assertions)]
264264- let mut old_cursor = MstCursor::new_with_tracking(old.clone(), old_tracking.clone());
265265- #[cfg(debug_assertions)]
266266- let mut new_cursor = MstCursor::new_with_tracking(new.clone(), new_tracking.clone());
267267-268268- #[cfg(not(debug_assertions))]
237237+ // CIDs differ - use cursors to walk both trees
269238 let mut old_cursor = MstCursor::new(old.clone());
270270- #[cfg(not(debug_assertions))]
271239 let mut new_cursor = MstCursor::new(new.clone());
272240273241 // Don't advance yet - let loop handle roots like any other tree comparison
···425393 }
426394 }
427395 }
428428- }
429429-430430- // Collect tracking data
431431- #[cfg(debug_assertions)]
432432- {
433433- diff.old_tree_accessed = old_tracking.read().unwrap().iter().copied().collect();
434434- diff.new_tree_accessed = new_tracking.read().unwrap().iter().copied().collect();
435396 }
436397437398 Ok(())
+13-129
crates/jacquard-repo/src/repo.rs
···66use crate::commit::firehose::{FirehoseCommit, RepoOp};
77use crate::commit::{Commit, SigningKey};
88use crate::error::{RepoError, Result};
99-use crate::mst::{Mst, MstDiff, RecordWriteOp};
99+use crate::mst::{Mst, RecordWriteOp};
1010use crate::storage::BlockStore;
1111use bytes::Bytes;
1212use cid::Cid as IpldCid;
···58585959 /// CIDs of blocks to delete
6060 pub deleted_cids: Vec<IpldCid>,
6161-6262- /// Debug: block sources for validation analysis
6363- #[cfg(debug_assertions)]
6464- pub block_sources: BTreeMap<IpldCid, String>,
6565- #[cfg(debug_assertions)]
6666- pub excluded_blocks: BTreeMap<IpldCid, Bytes>, // blocks we skipped
6767- #[cfg(debug_assertions)]
6868- pub excluded_metadata: BTreeMap<IpldCid, Vec<String>>, // context about excluded blocks
6961}
70627163impl CommitData {
···9991 blobs,
10092 too_big: false,
10193 rebase: false,
102102- #[cfg(debug_assertions)]
103103- block_sources: self.block_sources.clone(),
104104- #[cfg(debug_assertions)]
105105- excluded_blocks: self
106106- .excluded_blocks
107107- .iter()
108108- .map(|(cid, b)| (cid.clone(), b.to_vec()))
109109- .collect(),
110110- #[cfg(debug_assertions)]
111111- excluded_metadata: self.excluded_metadata.clone(),
11294 })
11395 }
11496}
···251233 blocks: blocks.clone(),
252234 relevant_blocks: blocks,
253235 deleted_cids: Vec::new(),
254254- #[cfg(debug_assertions)]
255255- block_sources: BTreeMap::new(),
256256- #[cfg(debug_assertions)]
257257- excluded_blocks: BTreeMap::new(),
258258- #[cfg(debug_assertions)]
259259- excluded_metadata: BTreeMap::new(),
260236 })
261237 }
262238···490466 .into_iter()
491467 .map(|op| op.into_static())
492468 .collect();
493493- let deleted_cids = diff.removed_cids;
494469495495- // Step 4: Build blocks and relevant_blocks collections
470470+ // Step 4: Build blocks and relevant_blocks collections using diff tracking
496471 let mut blocks = diff.new_mst_blocks;
497472 let mut relevant_blocks = BTreeMap::new();
498473499499- #[cfg(debug_assertions)]
500500- let mut block_sources: BTreeMap<IpldCid, &str> = BTreeMap::new();
501501-502502- // // Add the previous MST root block (needed to load prev_data in validation)
503503- // if let Some(prev_root_block) = self.storage.get(&prev_data).await? {
504504- // #[cfg(debug_assertions)]
505505- // block_sources.insert(prev_data, "prev_root");
506506- // relevant_blocks.insert(prev_data, prev_root_block);
507507- // }
508508-509509- let mut new_tree_cids = std::collections::HashSet::new();
510474 for op in ops {
511475 let key = format_smolstr!("{}/{}", op.collection().as_ref(), op.rkey().as_ref());
512476 updated_tree
513477 .blocks_for_path(&key, &mut relevant_blocks)
514478 .await?;
515479516516- let new_path_cids = updated_tree.cids_for_path(&key).await?;
517517- for cid in &new_path_cids {
518518- new_tree_cids.insert(*cid);
519519- #[cfg(debug_assertions)]
520520- block_sources.insert(*cid, "new_tree_path");
521521- }
522522- }
523523- let mut old_path_blocks = BTreeMap::new();
524524- let mut old_path_cids = std::collections::HashSet::new();
525525-526526- // Step 2: Walk OLD tree, only add blocks NOT in new tree (changed nodes)
527527- for op in ops {
528528- let key = format_smolstr!("{}/{}", op.collection().as_ref(), op.rkey().as_ref());
529529-530530- self.mst.blocks_for_path(&key, &mut old_path_blocks).await?;
531531- for cid in updated_tree.cids_for_path(&key).await? {
532532- old_path_cids.insert(cid);
533533- }
534534- }
535535-536536- let mut excluded_blocks = BTreeMap::new();
537537- #[cfg(debug_assertions)]
538538- let mut excluded_metadata: BTreeMap<IpldCid, Vec<String>> = BTreeMap::new();
539539-540540- // Re-walk old tree paths to collect metadata about excluded blocks
541541- #[cfg(debug_assertions)]
542542- for (op_idx, op) in ops.iter().enumerate() {
543543- let key = format_smolstr!("{}/{}", op.collection().as_ref(), op.rkey().as_ref());
544544- let old_path_cids = self.mst.cids_for_path(&key).await?;
545545-546546- for (depth, cid) in old_path_cids.iter().enumerate() {
547547- if !new_tree_cids.contains(cid) {
548548- let metadata = format!("op#{} ({}) path depth {}", op_idx, key, depth);
549549- excluded_metadata.entry(*cid).or_insert_with(Vec::new).push(metadata);
550550- }
551551- }
552552- }
553553-554554- for (cid, block) in old_path_blocks.into_iter() {
555555- // Only include if this block CHANGED (different CID in new tree)
556556- if !new_tree_cids.contains(&cid) {
557557- //relevant_blocks.insert(cid, block);
558558- excluded_blocks.insert(cid, block);
559559- #[cfg(debug_assertions)]
560560- block_sources.insert(cid, "old_tree_changed");
480480+ // For CREATE ops in multi-op commits, include old tree paths.
481481+ // Empirically necessary: tree restructuring from multiple creates
482482+ // can access old MST nodes during inversion (reason TBD).
483483+ if let RecordWriteOp::Create { .. } = op
484484+ && ops.len() > 1
485485+ {
486486+ self.mst.blocks_for_path(&key, &mut relevant_blocks).await?;
561487 }
562488 }
563489564564- // // Add new leaf blocks to both collections (single iteration)
565565- // for (cid, block) in &leaf_blocks {
566566- // if diff.new_leaf_cids.contains(cid) {
567567- // blocks.insert(*cid, block.clone());
568568- // #[cfg(debug_assertions)]
569569- // block_sources.insert(*cid, "new_leaf");
570570- // relevant_blocks.insert(*cid, block.clone());
571571- // }
572572- // }
490490+ let deleted_cids = diff.removed_cids;
573491574574- // For DELETE operations, we need the deleted record blocks for inversion
575575- // (when inverting a delete, we insert the prev CID back)
576576- // for cid in &deleted_cids {
577577- // if let Some(block) = self.storage.get(cid).await? {
578578- // #[cfg(debug_assertions)]
579579- // block_sources.insert(*cid, "deleted_leaf");
580580- // relevant_blocks.insert(*cid, block);
581581- // }
582582- // }
583583-584584- // Step 6: Create and sign commit
492492+ // Step 5: Create and sign commit
585493 let rev = Ticker::new().next(Some(self.commit.rev.clone()));
586494 let commit = Commit::new_unsigned(did.clone().into_static(), data, rev.clone(), prev)
587495 .sign(signing_key)?;
···590498 let commit_cid = crate::mst::util::compute_cid(&commit_cbor)?;
591499 let commit_bytes = bytes::Bytes::from(commit_cbor);
592500593593- // Step 7: Add commit block to both collections
501501+ // Step 6: Add commit block to both collections
594502 blocks.insert(commit_cid, commit_bytes.clone());
595595- #[cfg(debug_assertions)]
596596- block_sources.insert(commit_cid, "commit");
597503 relevant_blocks.insert(commit_cid, commit_bytes);
598504599599- #[cfg(debug_assertions)]
600600- {
601601- let mut by_source: BTreeMap<&str, usize> = BTreeMap::new();
602602- for source in block_sources.values() {
603603- *by_source.entry(source).or_insert(0) += 1;
604604- }
605605- println!("[commit creation] relevant_blocks by source:");
606606- for (source, count) in by_source {
607607- println!(" {}: {}", source, count);
608608- }
609609- println!(" TOTAL: {}", relevant_blocks.len());
610610- }
611611-612612- // Step 8: Update internal MST state
505505+ // Step 7: Update internal MST state
613506 self.mst = updated_tree;
614507615508 Ok((
···624517 blocks,
625518 relevant_blocks,
626519 deleted_cids,
627627- #[cfg(debug_assertions)]
628628- block_sources: block_sources
629629- .into_iter()
630630- .map(|(k, v)| (k, v.to_string()))
631631- .collect(),
632632- #[cfg(debug_assertions)]
633633- excluded_blocks,
634634- #[cfg(debug_assertions)]
635635- excluded_metadata,
636520 },
637521 ))
638522 }