Persistent store with Git semantics: lazy reads, delayed writes, content-addressing
at benchs+cache 456 lines 16 kB view raw
1module type S = sig 2 type node 3 type hash 4 5 type entry = 6 [ `Node of hash | `Contents of hash | `Contents_inlined of string ] 7 8 val inline_threshold : int 9 val hash_node : node -> hash 10 val hash_contents : string -> hash 11 val node_of_bytes : string -> (node, [> `Msg of string ]) result 12 val bytes_of_node : node -> string 13 val empty_node : node 14 val find : node -> string -> entry option 15 val add : node -> string -> entry -> node 16 val remove : node -> string -> node 17 val list : node -> (string * entry) list 18 val is_empty : node -> bool 19 20 (* Hash operations *) 21 val hash_to_bytes : hash -> string 22 val hash_to_hex : hash -> string 23 val hash_of_hex : string -> (hash, [> `Msg of string ]) result 24 val hash_equal : hash -> hash -> bool 25 val hash_compare : hash -> hash -> int 26 27 (* Commit operations *) 28 type commit 29 30 val commit_make : 31 tree:hash -> 32 parents:hash list -> 33 author:string -> 34 committer:string -> 35 message:string -> 36 timestamp:int64 -> 37 commit 38 39 val commit_tree : commit -> hash 40 val commit_parents : commit -> hash list 41 val commit_author : commit -> string 42 val commit_committer : commit -> string 43 val commit_message : commit -> string 44 val commit_timestamp : commit -> int64 45 val commit_of_bytes : string -> (commit, [> `Msg of string ]) result 46 val commit_to_bytes : commit -> string 47 val commit_hash : commit -> hash 48end 49 50module type SHA1 = S with type hash = Hash.sha1 51module type SHA256 = S with type hash = Hash.sha256 52 53(** Git tree object format using ocaml-git. *) 54module Git : SHA1 = struct 55 type hash = Hash.sha1 56 57 type entry = 58 [ `Node of hash | `Contents of hash | `Contents_inlined of string ] 59 60 (* Node wraps a Git tree plus a map of inlined small contents *) 61 type node = { 62 tree : Git.Tree.t; 63 inlined : (string * string) list; (* name -> inlined content *) 64 } 65 66 (* Inline contents up to 48 bytes directly in tree nodes, avoiding a 67 separate content-addressable store lookup. This breaks Git interop 68 for inlined entries but improves performance on small values. *) 69 let inline_threshold = 48 70 71 (* Convert between irmin Hash.sha1 and Git.Hash.t *) 72 let git_hash_of_sha1 (h : hash) : Git.Hash.t = 73 Git.Hash.of_raw_string (Hash.to_bytes h) 74 75 let sha1_of_git_hash (h : Git.Hash.t) : hash = 76 Hash.sha1_of_bytes (Git.Hash.to_raw_string h) 77 78 let empty_node = { tree = Git.Tree.empty; inlined = [] } 79 let is_empty node = Git.Tree.is_empty node.tree && node.inlined = [] 80 81 let find node name = 82 match List.assoc_opt name node.inlined with 83 | Some data -> Some (`Contents_inlined data) 84 | None -> ( 85 match Git.Tree.find ~name node.tree with 86 | None -> None 87 | Some entry -> ( 88 let h = sha1_of_git_hash entry.hash in 89 match entry.perm with 90 | `Dir -> Some (`Node h) 91 | _ -> Some (`Contents h))) 92 93 let add node name kind = 94 match kind with 95 | `Contents_inlined data -> 96 let tree = Git.Tree.remove ~name node.tree in 97 let inlined = 98 (name, data) :: List.filter (fun (n, _) -> n <> name) node.inlined 99 in 100 { tree; inlined } 101 | `Node h -> 102 let inlined = List.filter (fun (n, _) -> n <> name) node.inlined in 103 let entry = Git.Tree.entry ~perm:`Dir ~name (git_hash_of_sha1 h) in 104 { tree = Git.Tree.add entry node.tree; inlined } 105 | `Contents h -> 106 let inlined = List.filter (fun (n, _) -> n <> name) node.inlined in 107 let entry = 108 Git.Tree.entry ~perm:`Normal ~name (git_hash_of_sha1 h) 109 in 110 { tree = Git.Tree.add entry node.tree; inlined } 111 112 let remove node name = 113 let inlined = List.filter (fun (n, _) -> n <> name) node.inlined in 114 { tree = Git.Tree.remove ~name node.tree; inlined } 115 116 let list node = 117 let tree_entries = 118 Git.Tree.to_list node.tree 119 |> List.map (fun (entry : Git.Tree.entry) -> 120 let h = sha1_of_git_hash entry.hash in 121 let kind = 122 match entry.perm with `Dir -> `Node h | _ -> `Contents h 123 in 124 (entry.name, kind)) 125 in 126 let inlined_entries = 127 List.map 128 (fun (name, data) -> (name, `Contents_inlined data)) 129 node.inlined 130 in 131 List.sort 132 (fun (a, _) (b, _) -> String.compare a b) 133 (tree_entries @ inlined_entries) 134 135 (* Serialization format: 136 - Version 0 (backward compat): raw Git tree bytes (starts with ASCII digit) 137 - Version 1: \x01 + 4-byte Git tree length + Git tree bytes + inlined entries 138 Each inlined entry: 2-byte name length + name + 4-byte data length + data 139 Standard Git trees always start with a mode digit (0-9), never \x01. *) 140 141 let bytes_of_node node = 142 let tree_bytes = Git.Tree.to_string node.tree in 143 if node.inlined = [] then tree_bytes 144 else 145 let buf = Buffer.create (String.length tree_bytes + 64) in 146 Buffer.add_char buf '\x01'; 147 Buffer.add_int32_be buf (Int32.of_int (String.length tree_bytes)); 148 Buffer.add_string buf tree_bytes; 149 let sorted = 150 List.sort (fun (a, _) (b, _) -> String.compare a b) node.inlined 151 in 152 List.iter 153 (fun (name, data) -> 154 let nlen = String.length name in 155 let dlen = String.length data in 156 Buffer.add_uint16_be buf nlen; 157 Buffer.add_string buf name; 158 Buffer.add_int32_be buf (Int32.of_int dlen); 159 Buffer.add_string buf data) 160 sorted; 161 Buffer.contents buf 162 163 let parse_inlined_entries s offset = 164 let len = String.length s in 165 let rec loop pos acc = 166 if pos >= len then List.rev acc 167 else 168 let nlen = Char.code s.[pos] lsl 8 lor Char.code s.[pos + 1] in 169 let name = String.sub s (pos + 2) nlen in 170 let dpos = pos + 2 + nlen in 171 let dlen = 172 Char.code s.[dpos] lsl 24 173 lor (Char.code s.[dpos + 1] lsl 16) 174 lor (Char.code s.[dpos + 2] lsl 8) 175 lor Char.code s.[dpos + 3] 176 in 177 let data = String.sub s (dpos + 4) dlen in 178 loop (dpos + 4 + dlen) ((name, data) :: acc) 179 in 180 loop offset [] 181 182 let node_of_bytes s : (node, [> `Msg of string ]) result = 183 if String.length s = 0 then Ok { tree = Git.Tree.empty; inlined = [] } 184 else if Char.code s.[0] = 0x01 then begin 185 (* Version 1: has inlined entries *) 186 let tree_len = 187 Char.code s.[1] lsl 24 188 lor (Char.code s.[2] lsl 16) 189 lor (Char.code s.[3] lsl 8) 190 lor Char.code s.[4] 191 in 192 let tree_bytes = String.sub s 5 tree_len in 193 let inlined_start = 5 + tree_len in 194 let inlined = parse_inlined_entries s inlined_start in 195 match Git.Tree.of_string tree_bytes with 196 | Ok tree -> Ok { tree; inlined } 197 | Error (`Msg m) -> Error (`Msg m) 198 end 199 else 200 (* Version 0: standard Git tree, no inlined entries *) 201 match Git.Tree.of_string s with 202 | Ok tree -> Ok { tree; inlined = [] } 203 | Error (`Msg m) -> Error (`Msg m) 204 205 let hash_node node = 206 let data = bytes_of_node node in 207 sha1_of_git_hash (Git.Hash.digest_string ~kind:`Tree data) 208 209 let hash_contents data = 210 sha1_of_git_hash (Git.Hash.digest_string ~kind:`Blob data) 211 212 let hash_to_bytes = Hash.to_bytes 213 let hash_to_hex = Hash.to_hex 214 let hash_of_hex s : (hash, [> `Msg of string ]) result = Hash.sha1_of_hex s 215 let hash_equal = Hash.equal 216 let hash_compare = Hash.compare 217 218 (* Commit operations using ocaml-git *) 219 type commit = Git.Commit.t 220 221 let commit_make ~tree ~parents ~author ~committer ~message ~timestamp = 222 let user_of_string s = 223 (* Parse "Name <email>" format *) 224 match String.index_opt s '<' with 225 | None -> Git.User.v ~name:s ~email:"" ~date:timestamp () 226 | Some i -> 227 let name = String.trim (String.sub s 0 i) in 228 let rest = String.sub s (i + 1) (String.length s - i - 1) in 229 let email = 230 match String.index_opt rest '>' with 231 | None -> rest 232 | Some j -> String.sub rest 0 j 233 in 234 Git.User.v ~name ~email ~date:timestamp () 235 in 236 Git.Commit.v ~tree:(git_hash_of_sha1 tree) 237 ~parents:(List.map git_hash_of_sha1 parents) 238 ~author:(user_of_string author) ~committer:(user_of_string committer) 239 (Some message) 240 241 let commit_tree c = sha1_of_git_hash (Git.Commit.tree c) 242 let commit_parents c = List.map sha1_of_git_hash (Git.Commit.parents c) 243 244 let user_to_string u = 245 let name = Git.User.name u in 246 let email = Git.User.email u in 247 if email = "" then name else Printf.sprintf "%s <%s>" name email 248 249 let commit_author c = user_to_string (Git.Commit.author c) 250 let commit_committer c = user_to_string (Git.Commit.committer c) 251 let commit_message c = Option.value ~default:"" (Git.Commit.message c) 252 let commit_timestamp c = Git.User.date (Git.Commit.author c) 253 254 let commit_of_bytes s : (commit, [> `Msg of string ]) result = 255 match Git.Commit.of_string s with 256 | Ok c -> Ok c 257 | Error (`Msg m) -> Error (`Msg m) 258 259 let commit_to_bytes = Git.Commit.to_string 260 let commit_hash c = sha1_of_git_hash (Git.Commit.digest c) 261end 262 263(** ATProto Merkle Search Tree format using ocaml-atp. 264 265 MST uses SHA-256 with 2-bit prefix counting for tree depth. Keys are stored 266 sorted with common prefix compression. Encoded as DAG-CBOR. *) 267module Mst : SHA256 = struct 268 type hash = Hash.sha256 269 270 type entry = 271 [ `Node of hash | `Contents of hash | `Contents_inlined of string ] 272 273 let inline_threshold = 48 274 275 (* Convert between irmin Hash.sha256 and Atp.Cid.t *) 276 let cid_of_sha256 (h : hash) : Atp.Cid.t = 277 Atp.Cid.of_digest `Dag_cbor (Hash.to_bytes h) 278 279 let sha256_of_cid (cid : Atp.Cid.t) : hash = 280 Hash.sha256_of_bytes (Atp.Cid.digest cid) 281 282 (* Our node wraps Atp.Mst.Raw.node for serialization *) 283 type node = Atp.Mst.Raw.node 284 285 let empty_node : node = { l = None; e = [] } 286 let is_empty (node : node) = node.l = None && node.e = [] 287 288 (* Decompress key from entry list *) 289 let decompress_keys (entries : Atp.Mst.Raw.entry list) : 290 (string * Atp.Mst.Raw.entry) list = 291 let rec loop prev_key acc = function 292 | [] -> List.rev acc 293 | (e : Atp.Mst.Raw.entry) :: rest -> 294 let key = String.sub prev_key 0 e.p ^ e.k in 295 loop key ((key, e) :: acc) rest 296 in 297 loop "" [] entries 298 299 let find (node : node) name = 300 let entries = decompress_keys node.e in 301 match List.find_opt (fun (k, _) -> k = name) entries with 302 | None -> None 303 | Some (_, e) -> 304 (* In MST, all values are content CIDs, subtrees are in 't' field *) 305 Some (`Contents (sha256_of_cid e.v)) 306 307 (* Compress keys for serialization *) 308 let compress_keys entries = 309 let sorted = 310 List.sort (fun (k1, _) (k2, _) -> String.compare k1 k2) entries 311 in 312 let rec loop prev_key acc = function 313 | [] -> List.rev acc 314 | (key, (v, t)) :: rest -> 315 let p = 316 let rec shared i = 317 if i >= String.length prev_key || i >= String.length key then i 318 else if prev_key.[i] = key.[i] then shared (i + 1) 319 else i 320 in 321 shared 0 322 in 323 let k = String.sub key p (String.length key - p) in 324 let entry : Atp.Mst.Raw.entry = { p; k; v; t } in 325 loop key (entry :: acc) rest 326 in 327 loop "" [] sorted 328 329 let add (node : node) name kind = 330 let entries = decompress_keys node.e in 331 let v = 332 match kind with 333 | `Contents h -> cid_of_sha256 h 334 | `Node h -> cid_of_sha256 h 335 | `Contents_inlined s -> 336 (* Fallback: hash the content and store as CID *) 337 cid_of_sha256 (Hash.sha256 s) 338 in 339 let entries = List.filter (fun (k, _) -> k <> name) entries in 340 let entries = 341 (name, (v, None)) 342 :: List.map (fun (k, (e : Atp.Mst.Raw.entry)) -> (k, (e.v, e.t))) entries 343 in 344 let compressed = compress_keys entries in 345 { node with e = compressed } 346 347 let remove (node : node) name = 348 let entries = decompress_keys node.e in 349 let entries = List.filter (fun (k, _) -> k <> name) entries in 350 let entries = 351 List.map (fun (k, (e : Atp.Mst.Raw.entry)) -> (k, (e.v, e.t))) entries 352 in 353 let compressed = compress_keys entries in 354 { node with e = compressed } 355 356 let list (node : node) = 357 let entries = decompress_keys node.e in 358 List.map 359 (fun (key, (e : Atp.Mst.Raw.entry)) -> 360 (key, `Contents (sha256_of_cid e.v))) 361 entries 362 363 let bytes_of_node node = Atp.Mst.Raw.encode_bytes node 364 365 let node_of_bytes data : (node, [> `Msg of string ]) result = 366 try Ok (Atp.Mst.Raw.decode_bytes data) 367 with _ -> Error (`Msg "failed to decode MST node") 368 369 let hash_node node = 370 let data = bytes_of_node node in 371 Hash.sha256 data 372 373 let hash_contents data = Hash.sha256 data 374 let hash_to_bytes = Hash.to_bytes 375 let hash_to_hex = Hash.to_hex 376 let hash_of_hex s : (hash, [> `Msg of string ]) result = Hash.sha256_of_hex s 377 let hash_equal = Hash.equal 378 let hash_compare = Hash.compare 379 380 (* Commit operations for MST format using DAG-CBOR *) 381 type commit = { 382 tree : hash; 383 parents : hash list; 384 author : string; 385 committer : string; 386 message : string; 387 timestamp : int64; 388 } 389 390 let commit_make ~tree ~parents ~author ~committer ~message ~timestamp = 391 { tree; parents; author; committer; message; timestamp } 392 393 let commit_tree c = c.tree 394 let commit_parents c = c.parents 395 let commit_author c = c.author 396 let commit_committer c = c.committer 397 let commit_message c = c.message 398 let commit_timestamp c = c.timestamp 399 400 let commit_of_bytes s : (commit, [> `Msg of string ]) result = 401 try 402 let v = Atp.Dagcbor.decode_string ~cid_format:`Atproto s in 403 match v with 404 | `Map fields -> 405 let get_string key = 406 match List.assoc_opt key fields with 407 | Some (`String s) -> s 408 | _ -> "" 409 in 410 let get_int64 key = 411 match List.assoc_opt key fields with Some (`Int i) -> i | _ -> 0L 412 in 413 let get_link key = 414 match List.assoc_opt key fields with 415 | Some (`Link cid) -> sha256_of_cid cid 416 | _ -> Hash.sha256 "" 417 in 418 let get_links key = 419 match List.assoc_opt key fields with 420 | Some (`List links) -> 421 List.filter_map 422 (function `Link cid -> Some (sha256_of_cid cid) | _ -> None) 423 links 424 | _ -> [] 425 in 426 Ok 427 { 428 tree = get_link "tree"; 429 parents = get_links "parents"; 430 author = get_string "author"; 431 committer = get_string "committer"; 432 message = get_string "message"; 433 timestamp = get_int64 "timestamp"; 434 } 435 | _ -> Error (`Msg "expected map for commit") 436 with Eio.Io _ as e -> Error (`Msg (Printexc.to_string e)) 437 438 let commit_to_bytes c = 439 let v : Atp.Dagcbor.value = 440 `Map 441 [ 442 ("author", `String c.author); 443 ("committer", `String c.committer); 444 ("message", `String c.message); 445 ( "parents", 446 `List (List.map (fun h -> `Link (cid_of_sha256 h)) c.parents) ); 447 ("timestamp", `Int c.timestamp); 448 ("tree", `Link (cid_of_sha256 c.tree)); 449 ] 450 in 451 Atp.Dagcbor.encode_string ~cid_format:`Atproto v 452 453 let commit_hash c = 454 let data = commit_to_bytes c in 455 Hash.sha256 data 456end