forked from
gazagnaire.org/irmin
Persistent store with Git semantics: lazy reads, delayed writes, content-addressing
1module type S = sig
2 type node
3 type hash
4
5 type entry =
6 [ `Node of hash | `Contents of hash | `Contents_inlined of string ]
7
8 val inline_threshold : int
9 val hash_node : node -> hash
10 val hash_contents : string -> hash
11 val node_of_bytes : string -> (node, [> `Msg of string ]) result
12 val bytes_of_node : node -> string
13 val empty_node : node
14 val find : node -> string -> entry option
15 val add : node -> string -> entry -> node
16 val remove : node -> string -> node
17 val list : node -> (string * entry) list
18 val is_empty : node -> bool
19
20 (* Hash operations *)
21 val hash_to_bytes : hash -> string
22 val hash_to_hex : hash -> string
23 val hash_of_hex : string -> (hash, [> `Msg of string ]) result
24 val hash_equal : hash -> hash -> bool
25 val hash_compare : hash -> hash -> int
26
27 (* Commit operations *)
28 type commit
29
30 val commit_make :
31 tree:hash ->
32 parents:hash list ->
33 author:string ->
34 committer:string ->
35 message:string ->
36 timestamp:int64 ->
37 commit
38
39 val commit_tree : commit -> hash
40 val commit_parents : commit -> hash list
41 val commit_author : commit -> string
42 val commit_committer : commit -> string
43 val commit_message : commit -> string
44 val commit_timestamp : commit -> int64
45 val commit_of_bytes : string -> (commit, [> `Msg of string ]) result
46 val commit_to_bytes : commit -> string
47 val commit_hash : commit -> hash
48end
49
50module type SHA1 = S with type hash = Hash.sha1
51module type SHA256 = S with type hash = Hash.sha256
52
53(** Git tree object format using ocaml-git. *)
54module Git : SHA1 = struct
55 type hash = Hash.sha1
56
57 type entry =
58 [ `Node of hash | `Contents of hash | `Contents_inlined of string ]
59
60 (* Node wraps a Git tree plus a map of inlined small contents *)
61 type node = {
62 tree : Git.Tree.t;
63 inlined : (string * string) list; (* name -> inlined content *)
64 }
65
66 (* Inline contents up to 48 bytes directly in tree nodes, avoiding a
67 separate content-addressable store lookup. This breaks Git interop
68 for inlined entries but improves performance on small values. *)
69 let inline_threshold = 48
70
71 (* Convert between irmin Hash.sha1 and Git.Hash.t *)
72 let git_hash_of_sha1 (h : hash) : Git.Hash.t =
73 Git.Hash.of_raw_string (Hash.to_bytes h)
74
75 let sha1_of_git_hash (h : Git.Hash.t) : hash =
76 Hash.sha1_of_bytes (Git.Hash.to_raw_string h)
77
78 let empty_node = { tree = Git.Tree.empty; inlined = [] }
79 let is_empty node = Git.Tree.is_empty node.tree && node.inlined = []
80
81 let find node name =
82 match List.assoc_opt name node.inlined with
83 | Some data -> Some (`Contents_inlined data)
84 | None -> (
85 match Git.Tree.find ~name node.tree with
86 | None -> None
87 | Some entry -> (
88 let h = sha1_of_git_hash entry.hash in
89 match entry.perm with
90 | `Dir -> Some (`Node h)
91 | _ -> Some (`Contents h)))
92
93 let add node name kind =
94 match kind with
95 | `Contents_inlined data ->
96 let tree = Git.Tree.remove ~name node.tree in
97 let inlined =
98 (name, data) :: List.filter (fun (n, _) -> n <> name) node.inlined
99 in
100 { tree; inlined }
101 | `Node h ->
102 let inlined = List.filter (fun (n, _) -> n <> name) node.inlined in
103 let entry = Git.Tree.entry ~perm:`Dir ~name (git_hash_of_sha1 h) in
104 { tree = Git.Tree.add entry node.tree; inlined }
105 | `Contents h ->
106 let inlined = List.filter (fun (n, _) -> n <> name) node.inlined in
107 let entry =
108 Git.Tree.entry ~perm:`Normal ~name (git_hash_of_sha1 h)
109 in
110 { tree = Git.Tree.add entry node.tree; inlined }
111
112 let remove node name =
113 let inlined = List.filter (fun (n, _) -> n <> name) node.inlined in
114 { tree = Git.Tree.remove ~name node.tree; inlined }
115
116 let list node =
117 let tree_entries =
118 Git.Tree.to_list node.tree
119 |> List.map (fun (entry : Git.Tree.entry) ->
120 let h = sha1_of_git_hash entry.hash in
121 let kind =
122 match entry.perm with `Dir -> `Node h | _ -> `Contents h
123 in
124 (entry.name, kind))
125 in
126 let inlined_entries =
127 List.map
128 (fun (name, data) -> (name, `Contents_inlined data))
129 node.inlined
130 in
131 List.sort
132 (fun (a, _) (b, _) -> String.compare a b)
133 (tree_entries @ inlined_entries)
134
135 (* Serialization format:
136 - Version 0 (backward compat): raw Git tree bytes (starts with ASCII digit)
137 - Version 1: \x01 + 4-byte Git tree length + Git tree bytes + inlined entries
138 Each inlined entry: 2-byte name length + name + 4-byte data length + data
139 Standard Git trees always start with a mode digit (0-9), never \x01. *)
140
141 let bytes_of_node node =
142 let tree_bytes = Git.Tree.to_string node.tree in
143 if node.inlined = [] then tree_bytes
144 else
145 let buf = Buffer.create (String.length tree_bytes + 64) in
146 Buffer.add_char buf '\x01';
147 Buffer.add_int32_be buf (Int32.of_int (String.length tree_bytes));
148 Buffer.add_string buf tree_bytes;
149 let sorted =
150 List.sort (fun (a, _) (b, _) -> String.compare a b) node.inlined
151 in
152 List.iter
153 (fun (name, data) ->
154 let nlen = String.length name in
155 let dlen = String.length data in
156 Buffer.add_uint16_be buf nlen;
157 Buffer.add_string buf name;
158 Buffer.add_int32_be buf (Int32.of_int dlen);
159 Buffer.add_string buf data)
160 sorted;
161 Buffer.contents buf
162
163 let parse_inlined_entries s offset =
164 let len = String.length s in
165 let rec loop pos acc =
166 if pos >= len then List.rev acc
167 else
168 let nlen = Char.code s.[pos] lsl 8 lor Char.code s.[pos + 1] in
169 let name = String.sub s (pos + 2) nlen in
170 let dpos = pos + 2 + nlen in
171 let dlen =
172 Char.code s.[dpos] lsl 24
173 lor (Char.code s.[dpos + 1] lsl 16)
174 lor (Char.code s.[dpos + 2] lsl 8)
175 lor Char.code s.[dpos + 3]
176 in
177 let data = String.sub s (dpos + 4) dlen in
178 loop (dpos + 4 + dlen) ((name, data) :: acc)
179 in
180 loop offset []
181
182 let node_of_bytes s : (node, [> `Msg of string ]) result =
183 if String.length s = 0 then Ok { tree = Git.Tree.empty; inlined = [] }
184 else if Char.code s.[0] = 0x01 then begin
185 (* Version 1: has inlined entries *)
186 let tree_len =
187 Char.code s.[1] lsl 24
188 lor (Char.code s.[2] lsl 16)
189 lor (Char.code s.[3] lsl 8)
190 lor Char.code s.[4]
191 in
192 let tree_bytes = String.sub s 5 tree_len in
193 let inlined_start = 5 + tree_len in
194 let inlined = parse_inlined_entries s inlined_start in
195 match Git.Tree.of_string tree_bytes with
196 | Ok tree -> Ok { tree; inlined }
197 | Error (`Msg m) -> Error (`Msg m)
198 end
199 else
200 (* Version 0: standard Git tree, no inlined entries *)
201 match Git.Tree.of_string s with
202 | Ok tree -> Ok { tree; inlined = [] }
203 | Error (`Msg m) -> Error (`Msg m)
204
205 let hash_node node =
206 let data = bytes_of_node node in
207 sha1_of_git_hash (Git.Hash.digest_string ~kind:`Tree data)
208
209 let hash_contents data =
210 sha1_of_git_hash (Git.Hash.digest_string ~kind:`Blob data)
211
212 let hash_to_bytes = Hash.to_bytes
213 let hash_to_hex = Hash.to_hex
214 let hash_of_hex s : (hash, [> `Msg of string ]) result = Hash.sha1_of_hex s
215 let hash_equal = Hash.equal
216 let hash_compare = Hash.compare
217
218 (* Commit operations using ocaml-git *)
219 type commit = Git.Commit.t
220
221 let commit_make ~tree ~parents ~author ~committer ~message ~timestamp =
222 let user_of_string s =
223 (* Parse "Name <email>" format *)
224 match String.index_opt s '<' with
225 | None -> Git.User.v ~name:s ~email:"" ~date:timestamp ()
226 | Some i ->
227 let name = String.trim (String.sub s 0 i) in
228 let rest = String.sub s (i + 1) (String.length s - i - 1) in
229 let email =
230 match String.index_opt rest '>' with
231 | None -> rest
232 | Some j -> String.sub rest 0 j
233 in
234 Git.User.v ~name ~email ~date:timestamp ()
235 in
236 Git.Commit.v ~tree:(git_hash_of_sha1 tree)
237 ~parents:(List.map git_hash_of_sha1 parents)
238 ~author:(user_of_string author) ~committer:(user_of_string committer)
239 (Some message)
240
241 let commit_tree c = sha1_of_git_hash (Git.Commit.tree c)
242 let commit_parents c = List.map sha1_of_git_hash (Git.Commit.parents c)
243
244 let user_to_string u =
245 let name = Git.User.name u in
246 let email = Git.User.email u in
247 if email = "" then name else Printf.sprintf "%s <%s>" name email
248
249 let commit_author c = user_to_string (Git.Commit.author c)
250 let commit_committer c = user_to_string (Git.Commit.committer c)
251 let commit_message c = Option.value ~default:"" (Git.Commit.message c)
252 let commit_timestamp c = Git.User.date (Git.Commit.author c)
253
254 let commit_of_bytes s : (commit, [> `Msg of string ]) result =
255 match Git.Commit.of_string s with
256 | Ok c -> Ok c
257 | Error (`Msg m) -> Error (`Msg m)
258
259 let commit_to_bytes = Git.Commit.to_string
260 let commit_hash c = sha1_of_git_hash (Git.Commit.digest c)
261end
262
263(** ATProto Merkle Search Tree format using ocaml-atp.
264
265 MST uses SHA-256 with 2-bit prefix counting for tree depth. Keys are stored
266 sorted with common prefix compression. Encoded as DAG-CBOR. *)
267module Mst : SHA256 = struct
268 type hash = Hash.sha256
269
270 type entry =
271 [ `Node of hash | `Contents of hash | `Contents_inlined of string ]
272
273 let inline_threshold = 48
274
275 (* Convert between irmin Hash.sha256 and Atp.Cid.t *)
276 let cid_of_sha256 (h : hash) : Atp.Cid.t =
277 Atp.Cid.of_digest `Dag_cbor (Hash.to_bytes h)
278
279 let sha256_of_cid (cid : Atp.Cid.t) : hash =
280 Hash.sha256_of_bytes (Atp.Cid.digest cid)
281
282 (* Our node wraps Atp.Mst.Raw.node for serialization *)
283 type node = Atp.Mst.Raw.node
284
285 let empty_node : node = { l = None; e = [] }
286 let is_empty (node : node) = node.l = None && node.e = []
287
288 (* Decompress key from entry list *)
289 let decompress_keys (entries : Atp.Mst.Raw.entry list) :
290 (string * Atp.Mst.Raw.entry) list =
291 let rec loop prev_key acc = function
292 | [] -> List.rev acc
293 | (e : Atp.Mst.Raw.entry) :: rest ->
294 let key = String.sub prev_key 0 e.p ^ e.k in
295 loop key ((key, e) :: acc) rest
296 in
297 loop "" [] entries
298
299 let find (node : node) name =
300 let entries = decompress_keys node.e in
301 match List.find_opt (fun (k, _) -> k = name) entries with
302 | None -> None
303 | Some (_, e) ->
304 (* In MST, all values are content CIDs, subtrees are in 't' field *)
305 Some (`Contents (sha256_of_cid e.v))
306
307 (* Compress keys for serialization *)
308 let compress_keys entries =
309 let sorted =
310 List.sort (fun (k1, _) (k2, _) -> String.compare k1 k2) entries
311 in
312 let rec loop prev_key acc = function
313 | [] -> List.rev acc
314 | (key, (v, t)) :: rest ->
315 let p =
316 let rec shared i =
317 if i >= String.length prev_key || i >= String.length key then i
318 else if prev_key.[i] = key.[i] then shared (i + 1)
319 else i
320 in
321 shared 0
322 in
323 let k = String.sub key p (String.length key - p) in
324 let entry : Atp.Mst.Raw.entry = { p; k; v; t } in
325 loop key (entry :: acc) rest
326 in
327 loop "" [] sorted
328
329 let add (node : node) name kind =
330 let entries = decompress_keys node.e in
331 let v =
332 match kind with
333 | `Contents h -> cid_of_sha256 h
334 | `Node h -> cid_of_sha256 h
335 | `Contents_inlined s ->
336 (* Fallback: hash the content and store as CID *)
337 cid_of_sha256 (Hash.sha256 s)
338 in
339 let entries = List.filter (fun (k, _) -> k <> name) entries in
340 let entries =
341 (name, (v, None))
342 :: List.map (fun (k, (e : Atp.Mst.Raw.entry)) -> (k, (e.v, e.t))) entries
343 in
344 let compressed = compress_keys entries in
345 { node with e = compressed }
346
347 let remove (node : node) name =
348 let entries = decompress_keys node.e in
349 let entries = List.filter (fun (k, _) -> k <> name) entries in
350 let entries =
351 List.map (fun (k, (e : Atp.Mst.Raw.entry)) -> (k, (e.v, e.t))) entries
352 in
353 let compressed = compress_keys entries in
354 { node with e = compressed }
355
356 let list (node : node) =
357 let entries = decompress_keys node.e in
358 List.map
359 (fun (key, (e : Atp.Mst.Raw.entry)) ->
360 (key, `Contents (sha256_of_cid e.v)))
361 entries
362
363 let bytes_of_node node = Atp.Mst.Raw.encode_bytes node
364
365 let node_of_bytes data : (node, [> `Msg of string ]) result =
366 try Ok (Atp.Mst.Raw.decode_bytes data)
367 with _ -> Error (`Msg "failed to decode MST node")
368
369 let hash_node node =
370 let data = bytes_of_node node in
371 Hash.sha256 data
372
373 let hash_contents data = Hash.sha256 data
374 let hash_to_bytes = Hash.to_bytes
375 let hash_to_hex = Hash.to_hex
376 let hash_of_hex s : (hash, [> `Msg of string ]) result = Hash.sha256_of_hex s
377 let hash_equal = Hash.equal
378 let hash_compare = Hash.compare
379
380 (* Commit operations for MST format using DAG-CBOR *)
381 type commit = {
382 tree : hash;
383 parents : hash list;
384 author : string;
385 committer : string;
386 message : string;
387 timestamp : int64;
388 }
389
390 let commit_make ~tree ~parents ~author ~committer ~message ~timestamp =
391 { tree; parents; author; committer; message; timestamp }
392
393 let commit_tree c = c.tree
394 let commit_parents c = c.parents
395 let commit_author c = c.author
396 let commit_committer c = c.committer
397 let commit_message c = c.message
398 let commit_timestamp c = c.timestamp
399
400 let commit_of_bytes s : (commit, [> `Msg of string ]) result =
401 try
402 let v = Atp.Dagcbor.decode_string ~cid_format:`Atproto s in
403 match v with
404 | `Map fields ->
405 let get_string key =
406 match List.assoc_opt key fields with
407 | Some (`String s) -> s
408 | _ -> ""
409 in
410 let get_int64 key =
411 match List.assoc_opt key fields with Some (`Int i) -> i | _ -> 0L
412 in
413 let get_link key =
414 match List.assoc_opt key fields with
415 | Some (`Link cid) -> sha256_of_cid cid
416 | _ -> Hash.sha256 ""
417 in
418 let get_links key =
419 match List.assoc_opt key fields with
420 | Some (`List links) ->
421 List.filter_map
422 (function `Link cid -> Some (sha256_of_cid cid) | _ -> None)
423 links
424 | _ -> []
425 in
426 Ok
427 {
428 tree = get_link "tree";
429 parents = get_links "parents";
430 author = get_string "author";
431 committer = get_string "committer";
432 message = get_string "message";
433 timestamp = get_int64 "timestamp";
434 }
435 | _ -> Error (`Msg "expected map for commit")
436 with Eio.Io _ as e -> Error (`Msg (Printexc.to_string e))
437
438 let commit_to_bytes c =
439 let v : Atp.Dagcbor.value =
440 `Map
441 [
442 ("author", `String c.author);
443 ("committer", `String c.committer);
444 ("message", `String c.message);
445 ( "parents",
446 `List (List.map (fun h -> `Link (cid_of_sha256 h)) c.parents) );
447 ("timestamp", `Int c.timestamp);
448 ("tree", `Link (cid_of_sha256 c.tree));
449 ]
450 in
451 Atp.Dagcbor.encode_string ~cid_format:`Atproto v
452
453 let commit_hash c =
454 let data = commit_to_bytes c in
455 Hash.sha256 data
456end