forked from
gazagnaire.org/irmin
Persistent store with Git semantics: lazy reads, delayed writes, content-addressing
1(** Inode: structural sharing for large tree nodes.
2
3 Entries are distributed across a 32-way trie based on the hash of their
4 name. Each leaf holds at most [max_entries] entries as a regular flat node.
5 Internal (routing) nodes are serialized with a [\x02] marker followed by
6 (bucket_index, child_hash) pairs.
7
8 Serialization format for inode tree nodes:
9 {v \x02 <count:1 byte> (<index:1 byte> <hash:N bytes>)* v}
10 where N is the hash size (20 for SHA-1, 32 for SHA-256). *)
11
12module Make (F : Codec.S) = struct
13 type hash = F.hash
14
15 let max_entries = 32
16 let branching = 32
17 let log_branching = 5
18 (* Hashtbl.hash returns 30 usable bits; beyond this depth all names
19 map to bucket 0, so we must stop splitting. *)
20 let max_depth = 30 / log_branching
21
22 let inode_tag = 0x02
23
24 let is_inode data =
25 String.length data > 0 && Char.code data.[0] = inode_tag
26
27 (* Hash size in bytes, computed once. *)
28 let hash_size = String.length (F.hash_to_bytes (F.hash_contents ""))
29
30 let bucket ~depth name =
31 let h = Hashtbl.hash name in
32 (h lsr (depth * log_branching)) land (branching - 1)
33
34 (* --- Serialization of inode tree (routing) nodes --- *)
35
36 let serialize_inode children =
37 let buf = Buffer.create (2 + (branching * (1 + hash_size))) in
38 Buffer.add_char buf (Char.chr inode_tag);
39 (* Count non-empty children *)
40 let count = Array.fold_left (fun n c ->
41 match c with None -> n | Some _ -> n + 1) 0 children in
42 Buffer.add_uint8 buf count;
43 Array.iteri (fun i c ->
44 match c with
45 | None -> ()
46 | Some h ->
47 Buffer.add_uint8 buf i;
48 Buffer.add_string buf (F.hash_to_bytes h))
49 children;
50 Buffer.contents buf
51
52 let parse_inode data =
53 let count = Char.code data.[1] in
54 let children = Array.make branching None in
55 let pos = ref 2 in
56 for _ = 1 to count do
57 let idx = Char.code data.[!pos] in
58 let raw = String.sub data (!pos + 1) hash_size in
59 children.(idx) <- Some (F.hash_of_raw_bytes raw);
60 pos := !pos + 1 + hash_size
61 done;
62 children
63
64 (* --- Flat node helpers --- *)
65
66 let entries_of_node node =
67 F.list node
68
69 let node_of_entries entries =
70 List.fold_left (fun n (name, entry) -> F.add n name entry) F.empty_node entries
71
72 let write_flat entries ~(backend : hash Backend.t) =
73 let node = node_of_entries entries in
74 let data = F.bytes_of_node node in
75 let h = F.hash_node node in
76 backend.write h data;
77 h
78
79 let hash_inode_data data =
80 F.hash_contents data
81
82 (* --- Write an inode trie from a flat list of entries --- *)
83
84 let rec write_entries ~depth entries ~(backend : hash Backend.t) =
85 if List.length entries <= max_entries || depth >= max_depth then
86 write_flat entries ~backend
87 else begin
88 let buckets = Array.make branching [] in
89 List.iter (fun ((name, _) as entry) ->
90 let idx = bucket ~depth name in
91 buckets.(idx) <- entry :: buckets.(idx))
92 entries;
93 let children = Array.map (fun bucket ->
94 match bucket with
95 | [] -> None
96 | entries -> Some (write_entries ~depth:(depth + 1) entries ~backend))
97 buckets
98 in
99 let data = serialize_inode children in
100 let h = hash_inode_data data in
101 backend.write h data;
102 h
103 end
104
105 let write entries ~backend = write_entries ~depth:0 entries ~backend
106
107 (* --- Find a single entry by name --- *)
108
109 let rec find_at ~depth ~(backend : hash Backend.t) hash name =
110 match backend.read hash with
111 | None -> None
112 | Some data ->
113 if is_inode data then begin
114 let children = parse_inode data in
115 let idx = bucket ~depth name in
116 match children.(idx) with
117 | None -> None
118 | Some child -> find_at ~depth:(depth + 1) ~backend child name
119 end else
120 match F.node_of_bytes data with
121 | Ok node -> F.find node name
122 | Error _ -> None
123
124 let find ~backend hash name = find_at ~depth:0 ~backend hash name
125
126 (* --- List all entries --- *)
127
128 let rec list_at ~(backend : hash Backend.t) hash =
129 match backend.read hash with
130 | None -> []
131 | Some data ->
132 if is_inode data then begin
133 let children = parse_inode data in
134 Array.to_list children
135 |> List.filter_map Fun.id
136 |> List.concat_map (list_at ~backend)
137 end else
138 match F.node_of_bytes data with
139 | Ok node -> entries_of_node node
140 | Error _ -> []
141
142 let list_all ~backend hash = list_at ~backend hash
143
144 (* --- Incremental update --- *)
145
146 let rec update_at ~depth ~(backend : hash Backend.t) hash ~additions ~removals =
147 match backend.read hash with
148 | None ->
149 (* Missing node: just write additions *)
150 if additions = [] then hash
151 else write_entries ~depth additions ~backend
152 | Some data ->
153 if is_inode data then
154 update_inode ~depth ~backend data ~additions ~removals
155 else
156 update_flat ~depth ~backend data ~additions ~removals
157
158 and update_inode ~depth ~(backend : hash Backend.t) data ~additions ~removals =
159 let children = parse_inode data in
160 (* Group additions and removals by bucket *)
161 let adds = Array.make branching [] in
162 let rems = Array.make branching [] in
163 List.iter (fun ((name, _) as e) ->
164 let idx = bucket ~depth name in
165 adds.(idx) <- e :: adds.(idx))
166 additions;
167 List.iter (fun name ->
168 let idx = bucket ~depth name in
169 rems.(idx) <- name :: rems.(idx))
170 removals;
171 (* Update only affected buckets *)
172 let changed = ref false in
173 let new_children = Array.copy children in
174 for i = 0 to branching - 1 do
175 if adds.(i) <> [] || rems.(i) <> [] then begin
176 changed := true;
177 match children.(i) with
178 | None ->
179 if adds.(i) <> [] then
180 new_children.(i) <-
181 Some (write_entries ~depth:(depth + 1) adds.(i) ~backend)
182 | Some child_hash ->
183 let new_hash =
184 update_at ~depth:(depth + 1) ~backend child_hash
185 ~additions:adds.(i) ~removals:rems.(i)
186 in
187 new_children.(i) <- Some new_hash
188 end
189 done;
190 if not !changed then
191 hash_inode_data data
192 else begin
193 (* Check if we should demote back to flat *)
194 let total =
195 Array.fold_left (fun n c ->
196 match c with None -> n | Some _ -> n + 1) 0 new_children
197 in
198 if total = 0 then
199 write_flat [] ~backend
200 else begin
201 let new_data = serialize_inode new_children in
202 let h = hash_inode_data new_data in
203 backend.write h new_data;
204 h
205 end
206 end
207
208 and update_flat ~depth ~(backend : hash Backend.t) data ~additions ~removals =
209 match F.node_of_bytes data with
210 | Error _ -> write_entries ~depth additions ~backend
211 | Ok node ->
212 let node = List.fold_left (fun n name -> F.remove n name) node removals in
213 let node =
214 List.fold_left (fun n (name, entry) -> F.add n name entry) node additions
215 in
216 let entries = entries_of_node node in
217 let count = List.length entries in
218 if count > max_entries && depth < max_depth then
219 write_entries ~depth entries ~backend
220 else begin
221 let new_data = F.bytes_of_node node in
222 let h = F.hash_node node in
223 backend.write h new_data;
224 h
225 end
226
227 let update ~backend hash ~additions ~removals =
228 update_at ~depth:0 ~backend hash ~additions ~removals
229end