Persistent store with Git semantics: lazy reads, delayed writes, content-addressing
at perf 465 lines 16 kB view raw
1module type S = sig 2 type node 3 type hash 4 5 type entry = 6 [ `Node of hash | `Contents of hash | `Contents_inlined of string ] 7 8 val inline_threshold : int 9 val hash_node : node -> hash 10 val hash_contents : string -> hash 11 val node_of_bytes : string -> (node, [> `Msg of string ]) result 12 val bytes_of_node : node -> string 13 val empty_node : node 14 val find : node -> string -> entry option 15 val add : node -> string -> entry -> node 16 val remove : node -> string -> node 17 val list : node -> (string * entry) list 18 val is_empty : node -> bool 19 20 (* Hash operations *) 21 val hash_to_bytes : hash -> string 22 val hash_to_hex : hash -> string 23 val hash_of_hex : string -> (hash, [> `Msg of string ]) result 24 val hash_of_raw_bytes : string -> hash 25 val hash_equal : hash -> hash -> bool 26 val hash_compare : hash -> hash -> int 27 28 (* Commit operations *) 29 type commit 30 31 val commit_make : 32 tree:hash -> 33 parents:hash list -> 34 author:string -> 35 committer:string -> 36 message:string -> 37 timestamp:int64 -> 38 commit 39 40 val commit_tree : commit -> hash 41 val commit_parents : commit -> hash list 42 val commit_author : commit -> string 43 val commit_committer : commit -> string 44 val commit_message : commit -> string 45 val commit_timestamp : commit -> int64 46 val commit_of_bytes : string -> (commit, [> `Msg of string ]) result 47 val commit_to_bytes : commit -> string 48 val commit_hash : commit -> hash 49end 50 51module type SHA1 = S with type hash = Hash.sha1 52module type SHA256 = S with type hash = Hash.sha256 53 54(** Git tree object format using ocaml-git. *) 55module Git : SHA1 = struct 56 type hash = Hash.sha1 57 58 type entry = 59 [ `Node of hash | `Contents of hash | `Contents_inlined of string ] 60 61 (* Node wraps a Git tree plus a map of inlined small contents *) 62 type node = { 63 tree : Git.Tree.t; 64 inlined : (string * string) list; (* name -> inlined content *) 65 } 66 67 (* Inline contents up to 48 bytes directly in tree nodes, avoiding a 68 separate content-addressable store lookup. This breaks Git interop 69 for inlined entries but improves performance on small values. *) 70 let inline_threshold = 48 71 72 (* Convert between irmin Hash.sha1 and Git.Hash.t *) 73 let git_hash_of_sha1 (h : hash) : Git.Hash.t = 74 Git.Hash.of_raw_string (Hash.to_bytes h) 75 76 let sha1_of_git_hash (h : Git.Hash.t) : hash = 77 Hash.sha1_of_bytes (Git.Hash.to_raw_string h) 78 79 let empty_node = { tree = Git.Tree.empty; inlined = [] } 80 let is_empty node = Git.Tree.is_empty node.tree && node.inlined = [] 81 82 let find node name = 83 match List.assoc_opt name node.inlined with 84 | Some data -> Some (`Contents_inlined data) 85 | None -> ( 86 match Git.Tree.find ~name node.tree with 87 | None -> None 88 | Some entry -> ( 89 let h = sha1_of_git_hash entry.hash in 90 match entry.perm with 91 | `Dir -> Some (`Node h) 92 | _ -> Some (`Contents h))) 93 94 let add node name kind = 95 match kind with 96 | `Contents_inlined data -> 97 let tree = Git.Tree.remove ~name node.tree in 98 let inlined = 99 (name, data) :: List.filter (fun (n, _) -> n <> name) node.inlined 100 in 101 { tree; inlined } 102 | `Node h -> 103 let inlined = List.filter (fun (n, _) -> n <> name) node.inlined in 104 let tree = Git.Tree.remove ~name node.tree in 105 let entry = Git.Tree.entry ~perm:`Dir ~name (git_hash_of_sha1 h) in 106 { tree = Git.Tree.add entry tree; inlined } 107 | `Contents h -> 108 let inlined = List.filter (fun (n, _) -> n <> name) node.inlined in 109 let tree = Git.Tree.remove ~name node.tree in 110 let entry = 111 Git.Tree.entry ~perm:`Normal ~name (git_hash_of_sha1 h) 112 in 113 { tree = Git.Tree.add entry tree; inlined } 114 115 let remove node name = 116 let inlined = List.filter (fun (n, _) -> n <> name) node.inlined in 117 { tree = Git.Tree.remove ~name node.tree; inlined } 118 119 let list node = 120 let tree_entries = 121 Git.Tree.to_list node.tree 122 |> List.map (fun (entry : Git.Tree.entry) -> 123 let h = sha1_of_git_hash entry.hash in 124 let kind = 125 match entry.perm with `Dir -> `Node h | _ -> `Contents h 126 in 127 (entry.name, kind)) 128 in 129 let inlined_entries = 130 List.map 131 (fun (name, data) -> (name, `Contents_inlined data)) 132 node.inlined 133 in 134 List.sort 135 (fun (a, _) (b, _) -> String.compare a b) 136 (tree_entries @ inlined_entries) 137 138 (* Serialization format: 139 - Version 0 (backward compat): raw Git tree bytes (starts with ASCII digit) 140 - Version 1: \x01 + 4-byte Git tree length + Git tree bytes + inlined entries 141 Each inlined entry: 2-byte name length + name + 4-byte data length + data 142 Standard Git trees always start with a mode digit (0-9), never \x01. *) 143 144 let bytes_of_node node = 145 let tree_bytes = Git.Tree.to_string node.tree in 146 if node.inlined = [] then tree_bytes 147 else 148 let buf = Buffer.create (String.length tree_bytes + 64) in 149 Buffer.add_char buf '\x01'; 150 Buffer.add_int32_be buf (Int32.of_int (String.length tree_bytes)); 151 Buffer.add_string buf tree_bytes; 152 let sorted = 153 List.sort (fun (a, _) (b, _) -> String.compare a b) node.inlined 154 in 155 List.iter 156 (fun (name, data) -> 157 let nlen = String.length name in 158 let dlen = String.length data in 159 Buffer.add_uint16_be buf nlen; 160 Buffer.add_string buf name; 161 Buffer.add_int32_be buf (Int32.of_int dlen); 162 Buffer.add_string buf data) 163 sorted; 164 Buffer.contents buf 165 166 let parse_inlined_entries s offset = 167 let len = String.length s in 168 let rec loop pos acc = 169 if pos >= len then List.rev acc 170 else 171 let nlen = Char.code s.[pos] lsl 8 lor Char.code s.[pos + 1] in 172 let name = String.sub s (pos + 2) nlen in 173 let dpos = pos + 2 + nlen in 174 let dlen = 175 Char.code s.[dpos] lsl 24 176 lor (Char.code s.[dpos + 1] lsl 16) 177 lor (Char.code s.[dpos + 2] lsl 8) 178 lor Char.code s.[dpos + 3] 179 in 180 let data = String.sub s (dpos + 4) dlen in 181 loop (dpos + 4 + dlen) ((name, data) :: acc) 182 in 183 loop offset [] 184 185 let node_of_bytes s : (node, [> `Msg of string ]) result = 186 if String.length s = 0 then Ok { tree = Git.Tree.empty; inlined = [] } 187 else if Char.code s.[0] = 0x01 then begin 188 (* Version 1: has inlined entries *) 189 let tree_len = 190 Char.code s.[1] lsl 24 191 lor (Char.code s.[2] lsl 16) 192 lor (Char.code s.[3] lsl 8) 193 lor Char.code s.[4] 194 in 195 let tree_bytes = String.sub s 5 tree_len in 196 let inlined_start = 5 + tree_len in 197 let inlined = parse_inlined_entries s inlined_start in 198 match Git.Tree.of_string tree_bytes with 199 | Ok tree -> Ok { tree; inlined } 200 | Error (`Msg m) -> Error (`Msg m) 201 end 202 else 203 (* Version 0: standard Git tree, no inlined entries *) 204 match Git.Tree.of_string s with 205 | Ok tree -> Ok { tree; inlined = [] } 206 | Error (`Msg m) -> Error (`Msg m) 207 208 let hash_node node = 209 let data = bytes_of_node node in 210 sha1_of_git_hash (Git.Hash.digest_string ~kind:`Tree data) 211 212 let hash_contents data = 213 sha1_of_git_hash (Git.Hash.digest_string ~kind:`Blob data) 214 215 let hash_to_bytes = Hash.to_bytes 216 let hash_to_hex = Hash.to_hex 217 let hash_of_hex s : (hash, [> `Msg of string ]) result = Hash.sha1_of_hex s 218 let hash_of_raw_bytes = Hash.sha1_of_bytes 219 let hash_equal = Hash.equal 220 let hash_compare = Hash.compare 221 222 (* Commit operations using ocaml-git *) 223 type commit = Git.Commit.t 224 225 let commit_make ~tree ~parents ~author ~committer ~message ~timestamp = 226 let user_of_string s = 227 (* Parse "Name <email>" format *) 228 match String.index_opt s '<' with 229 | None -> Git.User.v ~name:s ~email:"" ~date:timestamp () 230 | Some i -> 231 let name = String.trim (String.sub s 0 i) in 232 let rest = String.sub s (i + 1) (String.length s - i - 1) in 233 let email = 234 match String.index_opt rest '>' with 235 | None -> rest 236 | Some j -> String.sub rest 0 j 237 in 238 Git.User.v ~name ~email ~date:timestamp () 239 in 240 Git.Commit.v ~tree:(git_hash_of_sha1 tree) 241 ~parents:(List.map git_hash_of_sha1 parents) 242 ~author:(user_of_string author) ~committer:(user_of_string committer) 243 (Some message) 244 245 let commit_tree c = sha1_of_git_hash (Git.Commit.tree c) 246 let commit_parents c = List.map sha1_of_git_hash (Git.Commit.parents c) 247 248 let user_to_string u = 249 let name = Git.User.name u in 250 let email = Git.User.email u in 251 if email = "" then name else Fmt.str "%s <%s>" name email 252 253 let commit_author c = user_to_string (Git.Commit.author c) 254 let commit_committer c = user_to_string (Git.Commit.committer c) 255 let commit_message c = Option.value ~default:"" (Git.Commit.message c) 256 let commit_timestamp c = Git.User.date (Git.Commit.author c) 257 258 let commit_of_bytes s : (commit, [> `Msg of string ]) result = 259 match Git.Commit.of_string s with 260 | Ok c -> Ok c 261 | Error (`Msg m) -> Error (`Msg m) 262 263 let commit_to_bytes = Git.Commit.to_string 264 let commit_hash c = sha1_of_git_hash (Git.Commit.digest c) 265end 266 267(** ATProto Merkle Search Tree format using ocaml-atp. 268 269 MST uses SHA-256 with 2-bit prefix counting for tree depth. Keys are stored 270 sorted with common prefix compression. Encoded as DAG-CBOR. *) 271module Mst : SHA256 = struct 272 type hash = Hash.sha256 273 274 type entry = 275 [ `Node of hash | `Contents of hash | `Contents_inlined of string ] 276 277 (* MST/DAG-CBOR stores all values as CID references. Inlining is not 278 supported: Contents_inlined is converted to a hash in [add], so the 279 data would be lost if not written separately to the backend. *) 280 let inline_threshold = 0 281 282 (* Convert between irmin Hash.sha256 and Atp.Cid.t *) 283 let cid_of_sha256 (h : hash) : Atp.Cid.t = 284 Atp.Cid.of_digest `Dag_cbor (Hash.to_bytes h) 285 286 let sha256_of_cid (cid : Atp.Cid.t) : hash = 287 Hash.sha256_of_bytes (Atp.Cid.digest cid) 288 289 (* Our node wraps Atp.Mst.Raw.node for serialization *) 290 type node = Atp.Mst.Raw.node 291 292 let empty_node : node = { l = None; e = [] } 293 let is_empty (node : node) = node.l = None && node.e = [] 294 295 (* Decompress key from entry list *) 296 let decompress_keys (entries : Atp.Mst.Raw.entry list) : 297 (string * Atp.Mst.Raw.entry) list = 298 let rec loop prev_key acc = function 299 | [] -> List.rev acc 300 | (e : Atp.Mst.Raw.entry) :: rest -> 301 let key = String.sub prev_key 0 e.p ^ e.k in 302 loop key ((key, e) :: acc) rest 303 in 304 loop "" [] entries 305 306 let find (node : node) name = 307 let entries = decompress_keys node.e in 308 match List.find_opt (fun (k, _) -> k = name) entries with 309 | None -> None 310 | Some (_, e) -> 311 (* In MST, all values are content CIDs, subtrees are in 't' field *) 312 Some (`Contents (sha256_of_cid e.v)) 313 314 (* Compress keys for serialization *) 315 let compress_keys entries = 316 let sorted = 317 List.sort (fun (k1, _) (k2, _) -> String.compare k1 k2) entries 318 in 319 let rec loop prev_key acc = function 320 | [] -> List.rev acc 321 | (key, (v, t)) :: rest -> 322 let p = 323 let rec shared i = 324 if i >= String.length prev_key || i >= String.length key then i 325 else if prev_key.[i] = key.[i] then shared (i + 1) 326 else i 327 in 328 shared 0 329 in 330 let k = String.sub key p (String.length key - p) in 331 let entry : Atp.Mst.Raw.entry = { p; k; v; t } in 332 loop key (entry :: acc) rest 333 in 334 loop "" [] sorted 335 336 let add (node : node) name kind = 337 let entries = decompress_keys node.e in 338 let v = 339 match kind with 340 | `Contents h -> cid_of_sha256 h 341 | `Node h -> cid_of_sha256 h 342 | `Contents_inlined s -> 343 (* Fallback: hash the content and store as CID *) 344 cid_of_sha256 (Hash.sha256 s) 345 in 346 let entries = List.filter (fun (k, _) -> k <> name) entries in 347 let entries = 348 (name, (v, None)) 349 :: List.map (fun (k, (e : Atp.Mst.Raw.entry)) -> (k, (e.v, e.t))) entries 350 in 351 let compressed = compress_keys entries in 352 { node with e = compressed } 353 354 let remove (node : node) name = 355 let entries = decompress_keys node.e in 356 let entries = List.filter (fun (k, _) -> k <> name) entries in 357 let entries = 358 List.map (fun (k, (e : Atp.Mst.Raw.entry)) -> (k, (e.v, e.t))) entries 359 in 360 let compressed = compress_keys entries in 361 { node with e = compressed } 362 363 let list (node : node) = 364 let entries = decompress_keys node.e in 365 List.map 366 (fun (key, (e : Atp.Mst.Raw.entry)) -> 367 (key, `Contents (sha256_of_cid e.v))) 368 entries 369 370 let bytes_of_node node = Atp.Mst.Raw.encode_bytes node 371 372 let node_of_bytes data : (node, [> `Msg of string ]) result = 373 try Ok (Atp.Mst.Raw.decode_bytes data) 374 with exn -> 375 Error (`Msg ("failed to decode MST node: " ^ Printexc.to_string exn)) 376 377 let hash_node node = 378 let data = bytes_of_node node in 379 Hash.sha256 data 380 381 let hash_contents data = Hash.sha256 data 382 let hash_to_bytes = Hash.to_bytes 383 let hash_to_hex = Hash.to_hex 384 let hash_of_hex s : (hash, [> `Msg of string ]) result = Hash.sha256_of_hex s 385 let hash_of_raw_bytes = Hash.sha256_of_bytes 386 let hash_equal = Hash.equal 387 let hash_compare = Hash.compare 388 389 (* Commit operations for MST format using DAG-CBOR *) 390 type commit = { 391 tree : hash; 392 parents : hash list; 393 author : string; 394 committer : string; 395 message : string; 396 timestamp : int64; 397 } 398 399 let commit_make ~tree ~parents ~author ~committer ~message ~timestamp = 400 { tree; parents; author; committer; message; timestamp } 401 402 let commit_tree c = c.tree 403 let commit_parents c = c.parents 404 let commit_author c = c.author 405 let commit_committer c = c.committer 406 let commit_message c = c.message 407 let commit_timestamp c = c.timestamp 408 409 let commit_of_bytes s : (commit, [> `Msg of string ]) result = 410 try 411 let v = Atp.Dagcbor.decode_string ~cid_format:`Atproto s in 412 match v with 413 | `Map fields -> 414 let get_string key = 415 match List.assoc_opt key fields with 416 | Some (`String s) -> s 417 | _ -> "" 418 in 419 let get_int64 key = 420 match List.assoc_opt key fields with Some (`Int i) -> i | _ -> 0L 421 in 422 let get_link key = 423 match List.assoc_opt key fields with 424 | Some (`Link cid) -> sha256_of_cid cid 425 | _ -> Hash.sha256 "" 426 in 427 let get_links key = 428 match List.assoc_opt key fields with 429 | Some (`List links) -> 430 List.filter_map 431 (function `Link cid -> Some (sha256_of_cid cid) | _ -> None) 432 links 433 | _ -> [] 434 in 435 Ok 436 { 437 tree = get_link "tree"; 438 parents = get_links "parents"; 439 author = get_string "author"; 440 committer = get_string "committer"; 441 message = get_string "message"; 442 timestamp = get_int64 "timestamp"; 443 } 444 | _ -> Error (`Msg "expected map for commit") 445 with Eio.Io _ as e -> Error (`Msg (Printexc.to_string e)) 446 447 let commit_to_bytes c = 448 let v : Atp.Dagcbor.value = 449 `Map 450 [ 451 ("author", `String c.author); 452 ("committer", `String c.committer); 453 ("message", `String c.message); 454 ( "parents", 455 `List (List.map (fun h -> `Link (cid_of_sha256 h)) c.parents) ); 456 ("timestamp", `Int c.timestamp); 457 ("tree", `Link (cid_of_sha256 c.tree)); 458 ] 459 in 460 Atp.Dagcbor.encode_string ~cid_format:`Atproto v 461 462 let commit_hash c = 463 let data = commit_to_bytes c in 464 Hash.sha256 data 465end