forked from
gazagnaire.org/irmin
Persistent store with Git semantics: lazy reads, delayed writes, content-addressing
1module type S = sig
2 type node
3 type hash
4
5 type entry =
6 [ `Node of hash | `Contents of hash | `Contents_inlined of string ]
7
8 val inline_threshold : int
9 val hash_node : node -> hash
10 val hash_contents : string -> hash
11 val node_of_bytes : string -> (node, [> `Msg of string ]) result
12 val bytes_of_node : node -> string
13 val empty_node : node
14 val find : node -> string -> entry option
15 val add : node -> string -> entry -> node
16 val remove : node -> string -> node
17 val list : node -> (string * entry) list
18 val is_empty : node -> bool
19
20 (* Hash operations *)
21 val hash_to_bytes : hash -> string
22 val hash_to_hex : hash -> string
23 val hash_of_hex : string -> (hash, [> `Msg of string ]) result
24 val hash_of_raw_bytes : string -> hash
25 val hash_equal : hash -> hash -> bool
26 val hash_compare : hash -> hash -> int
27
28 (* Commit operations *)
29 type commit
30
31 val commit_make :
32 tree:hash ->
33 parents:hash list ->
34 author:string ->
35 committer:string ->
36 message:string ->
37 timestamp:int64 ->
38 commit
39
40 val commit_tree : commit -> hash
41 val commit_parents : commit -> hash list
42 val commit_author : commit -> string
43 val commit_committer : commit -> string
44 val commit_message : commit -> string
45 val commit_timestamp : commit -> int64
46 val commit_of_bytes : string -> (commit, [> `Msg of string ]) result
47 val commit_to_bytes : commit -> string
48 val commit_hash : commit -> hash
49end
50
51module type SHA1 = S with type hash = Hash.sha1
52module type SHA256 = S with type hash = Hash.sha256
53
54(** Git tree object format using ocaml-git. *)
55module Git : SHA1 = struct
56 type hash = Hash.sha1
57
58 type entry =
59 [ `Node of hash | `Contents of hash | `Contents_inlined of string ]
60
61 (* Node wraps a Git tree plus a map of inlined small contents *)
62 type node = {
63 tree : Git.Tree.t;
64 inlined : (string * string) list; (* name -> inlined content *)
65 }
66
67 (* Inline contents up to 48 bytes directly in tree nodes, avoiding a
68 separate content-addressable store lookup. This breaks Git interop
69 for inlined entries but improves performance on small values. *)
70 let inline_threshold = 48
71
72 (* Convert between irmin Hash.sha1 and Git.Hash.t *)
73 let git_hash_of_sha1 (h : hash) : Git.Hash.t =
74 Git.Hash.of_raw_string (Hash.to_bytes h)
75
76 let sha1_of_git_hash (h : Git.Hash.t) : hash =
77 Hash.sha1_of_bytes (Git.Hash.to_raw_string h)
78
79 let empty_node = { tree = Git.Tree.empty; inlined = [] }
80 let is_empty node = Git.Tree.is_empty node.tree && node.inlined = []
81
82 let find node name =
83 match List.assoc_opt name node.inlined with
84 | Some data -> Some (`Contents_inlined data)
85 | None -> (
86 match Git.Tree.find ~name node.tree with
87 | None -> None
88 | Some entry -> (
89 let h = sha1_of_git_hash entry.hash in
90 match entry.perm with
91 | `Dir -> Some (`Node h)
92 | _ -> Some (`Contents h)))
93
94 let add node name kind =
95 match kind with
96 | `Contents_inlined data ->
97 let tree = Git.Tree.remove ~name node.tree in
98 let inlined =
99 (name, data) :: List.filter (fun (n, _) -> n <> name) node.inlined
100 in
101 { tree; inlined }
102 | `Node h ->
103 let inlined = List.filter (fun (n, _) -> n <> name) node.inlined in
104 let tree = Git.Tree.remove ~name node.tree in
105 let entry = Git.Tree.entry ~perm:`Dir ~name (git_hash_of_sha1 h) in
106 { tree = Git.Tree.add entry tree; inlined }
107 | `Contents h ->
108 let inlined = List.filter (fun (n, _) -> n <> name) node.inlined in
109 let tree = Git.Tree.remove ~name node.tree in
110 let entry =
111 Git.Tree.entry ~perm:`Normal ~name (git_hash_of_sha1 h)
112 in
113 { tree = Git.Tree.add entry tree; inlined }
114
115 let remove node name =
116 let inlined = List.filter (fun (n, _) -> n <> name) node.inlined in
117 { tree = Git.Tree.remove ~name node.tree; inlined }
118
119 let list node =
120 let tree_entries =
121 Git.Tree.to_list node.tree
122 |> List.map (fun (entry : Git.Tree.entry) ->
123 let h = sha1_of_git_hash entry.hash in
124 let kind =
125 match entry.perm with `Dir -> `Node h | _ -> `Contents h
126 in
127 (entry.name, kind))
128 in
129 let inlined_entries =
130 List.map
131 (fun (name, data) -> (name, `Contents_inlined data))
132 node.inlined
133 in
134 List.sort
135 (fun (a, _) (b, _) -> String.compare a b)
136 (tree_entries @ inlined_entries)
137
138 (* Serialization format:
139 - Version 0 (backward compat): raw Git tree bytes (starts with ASCII digit)
140 - Version 1: \x01 + 4-byte Git tree length + Git tree bytes + inlined entries
141 Each inlined entry: 2-byte name length + name + 4-byte data length + data
142 Standard Git trees always start with a mode digit (0-9), never \x01. *)
143
144 let bytes_of_node node =
145 let tree_bytes = Git.Tree.to_string node.tree in
146 if node.inlined = [] then tree_bytes
147 else
148 let buf = Buffer.create (String.length tree_bytes + 64) in
149 Buffer.add_char buf '\x01';
150 Buffer.add_int32_be buf (Int32.of_int (String.length tree_bytes));
151 Buffer.add_string buf tree_bytes;
152 let sorted =
153 List.sort (fun (a, _) (b, _) -> String.compare a b) node.inlined
154 in
155 List.iter
156 (fun (name, data) ->
157 let nlen = String.length name in
158 let dlen = String.length data in
159 Buffer.add_uint16_be buf nlen;
160 Buffer.add_string buf name;
161 Buffer.add_int32_be buf (Int32.of_int dlen);
162 Buffer.add_string buf data)
163 sorted;
164 Buffer.contents buf
165
166 let parse_inlined_entries s offset =
167 let len = String.length s in
168 let rec loop pos acc =
169 if pos >= len then List.rev acc
170 else
171 let nlen = Char.code s.[pos] lsl 8 lor Char.code s.[pos + 1] in
172 let name = String.sub s (pos + 2) nlen in
173 let dpos = pos + 2 + nlen in
174 let dlen =
175 Char.code s.[dpos] lsl 24
176 lor (Char.code s.[dpos + 1] lsl 16)
177 lor (Char.code s.[dpos + 2] lsl 8)
178 lor Char.code s.[dpos + 3]
179 in
180 let data = String.sub s (dpos + 4) dlen in
181 loop (dpos + 4 + dlen) ((name, data) :: acc)
182 in
183 loop offset []
184
185 let node_of_bytes s : (node, [> `Msg of string ]) result =
186 if String.length s = 0 then Ok { tree = Git.Tree.empty; inlined = [] }
187 else if Char.code s.[0] = 0x01 then begin
188 (* Version 1: has inlined entries *)
189 let tree_len =
190 Char.code s.[1] lsl 24
191 lor (Char.code s.[2] lsl 16)
192 lor (Char.code s.[3] lsl 8)
193 lor Char.code s.[4]
194 in
195 let tree_bytes = String.sub s 5 tree_len in
196 let inlined_start = 5 + tree_len in
197 let inlined = parse_inlined_entries s inlined_start in
198 match Git.Tree.of_string tree_bytes with
199 | Ok tree -> Ok { tree; inlined }
200 | Error (`Msg m) -> Error (`Msg m)
201 end
202 else
203 (* Version 0: standard Git tree, no inlined entries *)
204 match Git.Tree.of_string s with
205 | Ok tree -> Ok { tree; inlined = [] }
206 | Error (`Msg m) -> Error (`Msg m)
207
208 let hash_node node =
209 let data = bytes_of_node node in
210 sha1_of_git_hash (Git.Hash.digest_string ~kind:`Tree data)
211
212 let hash_contents data =
213 sha1_of_git_hash (Git.Hash.digest_string ~kind:`Blob data)
214
215 let hash_to_bytes = Hash.to_bytes
216 let hash_to_hex = Hash.to_hex
217 let hash_of_hex s : (hash, [> `Msg of string ]) result = Hash.sha1_of_hex s
218 let hash_of_raw_bytes = Hash.sha1_of_bytes
219 let hash_equal = Hash.equal
220 let hash_compare = Hash.compare
221
222 (* Commit operations using ocaml-git *)
223 type commit = Git.Commit.t
224
225 let commit_make ~tree ~parents ~author ~committer ~message ~timestamp =
226 let user_of_string s =
227 (* Parse "Name <email>" format *)
228 match String.index_opt s '<' with
229 | None -> Git.User.v ~name:s ~email:"" ~date:timestamp ()
230 | Some i ->
231 let name = String.trim (String.sub s 0 i) in
232 let rest = String.sub s (i + 1) (String.length s - i - 1) in
233 let email =
234 match String.index_opt rest '>' with
235 | None -> rest
236 | Some j -> String.sub rest 0 j
237 in
238 Git.User.v ~name ~email ~date:timestamp ()
239 in
240 Git.Commit.v ~tree:(git_hash_of_sha1 tree)
241 ~parents:(List.map git_hash_of_sha1 parents)
242 ~author:(user_of_string author) ~committer:(user_of_string committer)
243 (Some message)
244
245 let commit_tree c = sha1_of_git_hash (Git.Commit.tree c)
246 let commit_parents c = List.map sha1_of_git_hash (Git.Commit.parents c)
247
248 let user_to_string u =
249 let name = Git.User.name u in
250 let email = Git.User.email u in
251 if email = "" then name else Fmt.str "%s <%s>" name email
252
253 let commit_author c = user_to_string (Git.Commit.author c)
254 let commit_committer c = user_to_string (Git.Commit.committer c)
255 let commit_message c = Option.value ~default:"" (Git.Commit.message c)
256 let commit_timestamp c = Git.User.date (Git.Commit.author c)
257
258 let commit_of_bytes s : (commit, [> `Msg of string ]) result =
259 match Git.Commit.of_string s with
260 | Ok c -> Ok c
261 | Error (`Msg m) -> Error (`Msg m)
262
263 let commit_to_bytes = Git.Commit.to_string
264 let commit_hash c = sha1_of_git_hash (Git.Commit.digest c)
265end
266
267(** ATProto Merkle Search Tree format using ocaml-atp.
268
269 MST uses SHA-256 with 2-bit prefix counting for tree depth. Keys are stored
270 sorted with common prefix compression. Encoded as DAG-CBOR. *)
271module Mst : SHA256 = struct
272 type hash = Hash.sha256
273
274 type entry =
275 [ `Node of hash | `Contents of hash | `Contents_inlined of string ]
276
277 (* MST/DAG-CBOR stores all values as CID references. Inlining is not
278 supported: Contents_inlined is converted to a hash in [add], so the
279 data would be lost if not written separately to the backend. *)
280 let inline_threshold = 0
281
282 (* Convert between irmin Hash.sha256 and Atp.Cid.t *)
283 let cid_of_sha256 (h : hash) : Atp.Cid.t =
284 Atp.Cid.of_digest `Dag_cbor (Hash.to_bytes h)
285
286 let sha256_of_cid (cid : Atp.Cid.t) : hash =
287 Hash.sha256_of_bytes (Atp.Cid.digest cid)
288
289 (* Our node wraps Atp.Mst.Raw.node for serialization *)
290 type node = Atp.Mst.Raw.node
291
292 let empty_node : node = { l = None; e = [] }
293 let is_empty (node : node) = node.l = None && node.e = []
294
295 (* Decompress key from entry list *)
296 let decompress_keys (entries : Atp.Mst.Raw.entry list) :
297 (string * Atp.Mst.Raw.entry) list =
298 let rec loop prev_key acc = function
299 | [] -> List.rev acc
300 | (e : Atp.Mst.Raw.entry) :: rest ->
301 let key = String.sub prev_key 0 e.p ^ e.k in
302 loop key ((key, e) :: acc) rest
303 in
304 loop "" [] entries
305
306 let find (node : node) name =
307 let entries = decompress_keys node.e in
308 match List.find_opt (fun (k, _) -> k = name) entries with
309 | None -> None
310 | Some (_, e) ->
311 (* In MST, all values are content CIDs, subtrees are in 't' field *)
312 Some (`Contents (sha256_of_cid e.v))
313
314 (* Compress keys for serialization *)
315 let compress_keys entries =
316 let sorted =
317 List.sort (fun (k1, _) (k2, _) -> String.compare k1 k2) entries
318 in
319 let rec loop prev_key acc = function
320 | [] -> List.rev acc
321 | (key, (v, t)) :: rest ->
322 let p =
323 let rec shared i =
324 if i >= String.length prev_key || i >= String.length key then i
325 else if prev_key.[i] = key.[i] then shared (i + 1)
326 else i
327 in
328 shared 0
329 in
330 let k = String.sub key p (String.length key - p) in
331 let entry : Atp.Mst.Raw.entry = { p; k; v; t } in
332 loop key (entry :: acc) rest
333 in
334 loop "" [] sorted
335
336 let add (node : node) name kind =
337 let entries = decompress_keys node.e in
338 let v =
339 match kind with
340 | `Contents h -> cid_of_sha256 h
341 | `Node h -> cid_of_sha256 h
342 | `Contents_inlined s ->
343 (* Fallback: hash the content and store as CID *)
344 cid_of_sha256 (Hash.sha256 s)
345 in
346 let entries = List.filter (fun (k, _) -> k <> name) entries in
347 let entries =
348 (name, (v, None))
349 :: List.map (fun (k, (e : Atp.Mst.Raw.entry)) -> (k, (e.v, e.t))) entries
350 in
351 let compressed = compress_keys entries in
352 { node with e = compressed }
353
354 let remove (node : node) name =
355 let entries = decompress_keys node.e in
356 let entries = List.filter (fun (k, _) -> k <> name) entries in
357 let entries =
358 List.map (fun (k, (e : Atp.Mst.Raw.entry)) -> (k, (e.v, e.t))) entries
359 in
360 let compressed = compress_keys entries in
361 { node with e = compressed }
362
363 let list (node : node) =
364 let entries = decompress_keys node.e in
365 List.map
366 (fun (key, (e : Atp.Mst.Raw.entry)) ->
367 (key, `Contents (sha256_of_cid e.v)))
368 entries
369
370 let bytes_of_node node = Atp.Mst.Raw.encode_bytes node
371
372 let node_of_bytes data : (node, [> `Msg of string ]) result =
373 try Ok (Atp.Mst.Raw.decode_bytes data)
374 with exn ->
375 Error (`Msg ("failed to decode MST node: " ^ Printexc.to_string exn))
376
377 let hash_node node =
378 let data = bytes_of_node node in
379 Hash.sha256 data
380
381 let hash_contents data = Hash.sha256 data
382 let hash_to_bytes = Hash.to_bytes
383 let hash_to_hex = Hash.to_hex
384 let hash_of_hex s : (hash, [> `Msg of string ]) result = Hash.sha256_of_hex s
385 let hash_of_raw_bytes = Hash.sha256_of_bytes
386 let hash_equal = Hash.equal
387 let hash_compare = Hash.compare
388
389 (* Commit operations for MST format using DAG-CBOR *)
390 type commit = {
391 tree : hash;
392 parents : hash list;
393 author : string;
394 committer : string;
395 message : string;
396 timestamp : int64;
397 }
398
399 let commit_make ~tree ~parents ~author ~committer ~message ~timestamp =
400 { tree; parents; author; committer; message; timestamp }
401
402 let commit_tree c = c.tree
403 let commit_parents c = c.parents
404 let commit_author c = c.author
405 let commit_committer c = c.committer
406 let commit_message c = c.message
407 let commit_timestamp c = c.timestamp
408
409 let commit_of_bytes s : (commit, [> `Msg of string ]) result =
410 try
411 let v = Atp.Dagcbor.decode_string ~cid_format:`Atproto s in
412 match v with
413 | `Map fields ->
414 let get_string key =
415 match List.assoc_opt key fields with
416 | Some (`String s) -> s
417 | _ -> ""
418 in
419 let get_int64 key =
420 match List.assoc_opt key fields with Some (`Int i) -> i | _ -> 0L
421 in
422 let get_link key =
423 match List.assoc_opt key fields with
424 | Some (`Link cid) -> sha256_of_cid cid
425 | _ -> Hash.sha256 ""
426 in
427 let get_links key =
428 match List.assoc_opt key fields with
429 | Some (`List links) ->
430 List.filter_map
431 (function `Link cid -> Some (sha256_of_cid cid) | _ -> None)
432 links
433 | _ -> []
434 in
435 Ok
436 {
437 tree = get_link "tree";
438 parents = get_links "parents";
439 author = get_string "author";
440 committer = get_string "committer";
441 message = get_string "message";
442 timestamp = get_int64 "timestamp";
443 }
444 | _ -> Error (`Msg "expected map for commit")
445 with Eio.Io _ as e -> Error (`Msg (Printexc.to_string e))
446
447 let commit_to_bytes c =
448 let v : Atp.Dagcbor.value =
449 `Map
450 [
451 ("author", `String c.author);
452 ("committer", `String c.committer);
453 ("message", `String c.message);
454 ( "parents",
455 `List (List.map (fun h -> `Link (cid_of_sha256 h)) c.parents) );
456 ("timestamp", `Int c.timestamp);
457 ("tree", `Link (cid_of_sha256 c.tree));
458 ]
459 in
460 Atp.Dagcbor.encode_string ~cid_format:`Atproto v
461
462 let commit_hash c =
463 let data = commit_to_bytes c in
464 Hash.sha256 data
465end