Git object storage and pack files for Eio
1(* Copyright (c) 2013-2017 Thomas Gazagnaire <thomas@gazagnaire.org>
2 Copyright (c) 2017-2024 Romain Calascibetta <romain.calascibetta@gmail.com>
3 Copyright (c) 2024-2026 Thomas Gazagnaire <thomas@gazagnaire.org>
4
5 Permission to use, copy, modify, and distribute this software for any
6 purpose with or without fee is hereby granted, provided that the above
7 copyright notice and this permission notice appear in all copies.
8
9 THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. *)
16
17(** Git pack file support.
18
19 Pack files store multiple git objects in a compressed format with delta
20 encoding for efficient storage. *)
21
22module Reader = Bytesrw.Bytes.Reader
23module Slice = Bytesrw.Bytes.Slice
24
25(** {1 Error helpers} *)
26
27let err_invalid_obj_type n =
28 Error (`Msg (Fmt.str "Invalid pack object type: %d" n))
29
30let err_unsupported_version v =
31 Error (`Msg (Fmt.str "Unsupported pack version: %d" v))
32
33(** {1 Pack file format}
34
35 Pack file structure:
36 - Header: "PACK" (4 bytes) + version (4 bytes, big-endian) + count (4 bytes)
37 - Objects: variable-length entries
38 - Footer: SHA-1 checksum (20 bytes) *)
39
40(** Object types in pack files. *)
41type obj_type =
42 | Commit
43 | Tree
44 | Blob
45 | Tag
46 | Ofs_delta (** Delta relative to offset in same pack *)
47 | Ref_delta (** Delta relative to object by hash *)
48
49let obj_type_to_int = function
50 | Commit -> 1
51 | Tree -> 2
52 | Blob -> 3
53 | Tag -> 4
54 | Ofs_delta -> 6
55 | Ref_delta -> 7
56
57let obj_type_of_int = function
58 | 1 -> Ok Commit
59 | 2 -> Ok Tree
60 | 3 -> Ok Blob
61 | 4 -> Ok Tag
62 | 6 -> Ok Ofs_delta
63 | 7 -> Ok Ref_delta
64 | n -> err_invalid_obj_type n
65
66let pp_obj_type ppf = function
67 | Commit -> Fmt.string ppf "commit"
68 | Tree -> Fmt.string ppf "tree"
69 | Blob -> Fmt.string ppf "blob"
70 | Tag -> Fmt.string ppf "tag"
71 | Ofs_delta -> Fmt.string ppf "ofs-delta"
72 | Ref_delta -> Fmt.string ppf "ref-delta"
73
74type header = { version : int; count : int }
75(** Pack file header. *)
76
77let pp_header ppf h =
78 Fmt.pf ppf "{ version = %d; count = %d }" h.version h.count
79
80type entry_header = {
81 obj_type : obj_type;
82 size : int; (** Uncompressed size *)
83}
84(** Entry header in pack file. *)
85
86(** {1 Reading pack files} *)
87
88(** Read big-endian 32-bit integer from string at offset. *)
89let int32_be s off =
90 let b0 = Char.code s.[off] in
91 let b1 = Char.code s.[off + 1] in
92 let b2 = Char.code s.[off + 2] in
93 let b3 = Char.code s.[off + 3] in
94 (b0 lsl 24) lor (b1 lsl 16) lor (b2 lsl 8) lor b3
95
96(** {1 Zlib decompression using bytesrw.zlib}
97
98 We use bytesrw.zlib with [~leftover:true] to handle concatenated zlib
99 streams in pack files. This positions the reader after each stream ends. *)
100
101(** Create a reader that serves bytes from [data] starting at [first]. Uses
102 {!Bytes.unsafe_of_string} to avoid copying the string. *)
103let reader_of_string_at ~first data =
104 let bytes = Bytes.unsafe_of_string data in
105 let length = String.length data - first in
106 let pos = ref 0 in
107 let slice_length = min length (64 * 1024) in
108 let read () =
109 if !pos >= length then Slice.eod
110 else
111 let len = min slice_length (length - !pos) in
112 let s = Slice.make bytes ~first:(first + !pos) ~length:len in
113 pos := !pos + len;
114 s
115 in
116 Reader.make ~slice_length read
117
118(** Shared buffer for decompression output. Avoids allocating a new {!Buffer.t}
119 per inflate call -- [Buffer.clear] resets the length without freeing the
120 backing storage, so after the first large decompression the buffer never
121 reallocates. *)
122let inflate_buf = Buffer.create (64 * 1024)
123
124(** Decompress zlib-compressed data starting at [first] in [data]. Returns
125 (decompressed_data, bytes_consumed). Zero-copy on input: reads directly from
126 the string via {!reader_of_string_at}. Reuses a shared output buffer to
127 avoid per-call allocation. *)
128let inflate_with_consumed ?(first = 0) data =
129 try
130 let base_reader = reader_of_string_at ~first data in
131 let decompressed =
132 Bytesrw_zlib.Zlib.decompress_reads ~leftover:true () base_reader
133 in
134 Buffer.clear inflate_buf;
135 Reader.add_to_buffer inflate_buf decompressed;
136 let output = Buffer.contents inflate_buf in
137 Ok (output, Reader.pos base_reader)
138 with exn ->
139 Error
140 (`Msg (Fmt.str "zlib decompression error: %s" (Printexc.to_string exn)))
141
142(** Decompress zlib-compressed data starting at [first] in [data]. *)
143let inflate ?first data =
144 match inflate_with_consumed ?first data with
145 | Ok (output, _consumed) -> Ok output
146 | Error e -> Error e
147
148(** {1 Delta decoding}
149
150 Delta format:
151 - Source size (varint)
152 - Target size (varint)
153 - Commands: COPY or INSERT *)
154
155(** Read a variable-length integer from a string at given offset. Returns
156 (value, new_offset). *)
157let read_varint_from_string s off =
158 let rec loop value shift off =
159 if off >= String.length s then (value, off)
160 else
161 let b = Char.code s.[off] in
162 let value = value lor ((b land 0x7F) lsl shift) in
163 if b land 0x80 = 0 then (value, off + 1)
164 else loop value (shift + 7) (off + 1)
165 in
166 loop 0 0 off
167
168(** Parse copy offset from delta command byte and data. Returns (offset,
169 next_pos). *)
170let parse_copy_offset delta cmd off =
171 let read_if mask shift off acc =
172 if cmd land mask <> 0 then
173 (acc lor (Char.code delta.[off] lsl shift), off + 1)
174 else (acc, off)
175 in
176 let offset, off = read_if 0x01 0 off 0 in
177 let offset, off = read_if 0x02 8 off offset in
178 let offset, off = read_if 0x04 16 off offset in
179 let offset, off = read_if 0x08 24 off offset in
180 (offset, off)
181
182(** Parse copy size from delta command byte and data. Returns (size, next_pos).
183*)
184let parse_copy_size delta cmd off =
185 let read_if mask shift off acc =
186 if cmd land mask <> 0 then
187 (acc lor (Char.code delta.[off] lsl shift), off + 1)
188 else (acc, off)
189 in
190 let size, off = read_if 0x10 0 off 0 in
191 let size, off = read_if 0x20 8 off size in
192 let size, off = read_if 0x40 16 off size in
193 (* Size of 0 means 0x10000 *)
194 let size = if size = 0 then 0x10000 else size in
195 (size, off)
196
197(** Apply delta to source to produce target. *)
198let apply_delta ~source ~delta =
199 let delta_len = String.length delta in
200 if delta_len < 2 then Error (`Msg "Delta too short")
201 else
202 let source_size, off = read_varint_from_string delta 0 in
203 if source_size <> String.length source then
204 Error
205 (`Msg
206 (Fmt.str "Delta source size mismatch: expected %d, got %d"
207 source_size (String.length source)))
208 else
209 let target_size, off = read_varint_from_string delta off in
210 let target = Buffer.create target_size in
211 let rec apply off =
212 if off >= delta_len then
213 if Buffer.length target = target_size then Ok (Buffer.contents target)
214 else
215 Error
216 (`Msg
217 (Fmt.str "Delta target size mismatch: expected %d, got %d"
218 target_size (Buffer.length target)))
219 else
220 let cmd = Char.code delta.[off] in
221 if cmd = 0 then Error (`Msg "Invalid delta command: 0")
222 else if cmd land 0x80 <> 0 then begin
223 (* COPY command *)
224 let copy_offset, off = parse_copy_offset delta cmd (off + 1) in
225 let copy_size, off = parse_copy_size delta cmd off in
226 Buffer.add_substring target source copy_offset copy_size;
227 apply off
228 end
229 else begin
230 (* INSERT command - cmd is the number of literal bytes *)
231 Buffer.add_substring target delta (off + 1) cmd;
232 apply (off + 1 + cmd)
233 end
234 in
235 apply off
236
237(** Object kind (base types only). *)
238let kind_of_obj_type = function
239 | Commit -> `Commit
240 | Tree -> `Tree
241 | Blob -> `Blob
242 | Tag -> `Tag
243 | Ofs_delta | Ref_delta -> assert false (* Not base types *)
244
245(** {1 Pack reading with random access} *)
246
247type t = {
248 header : header;
249 data : string; (** Full pack file data for random access *)
250}
251(** A pack file opened for reading. *)
252
253(** Open a pack file from a string. *)
254let of_string data =
255 if String.length data < 12 then Error (`Msg "Pack file too short")
256 else
257 let magic = String.sub data 0 4 in
258 if magic <> "PACK" then Error (`Msg "Invalid pack file: bad magic")
259 else
260 let version = int32_be data 4 in
261 if version <> 2 && version <> 3 then err_unsupported_version version
262 else
263 let count = int32_be data 8 in
264 Ok { header = { version; count }; data }
265
266(** Read entry header at given offset. Returns (header, data_offset). *)
267let read_entry_header_at t offset =
268 let open Result.Syntax in
269 if offset >= String.length t.data then Error (`Msg "Offset beyond pack file")
270 else
271 let first = Char.code t.data.[offset] in
272 let type_bits = (first lsr 4) land 0x07 in
273 let* obj_type = obj_type_of_int type_bits in
274 let size = first land 0x0F in
275 let continue = first land 0x80 <> 0 in
276 if not continue then Ok ({ obj_type; size }, offset + 1)
277 else
278 let rec loop size shift off =
279 let b = Char.code t.data.[off] in
280 let size = size lor ((b land 0x7F) lsl shift) in
281 if b land 0x80 = 0 then Ok (size, off + 1)
282 else loop size (shift + 7) (off + 1)
283 in
284 let* size, data_off = loop size 4 (offset + 1) in
285 Ok ({ obj_type; size }, data_off)
286
287(** Read OFS_DELTA offset at given position. Returns (offset, next_pos). *)
288let read_ofs_offset_at t pos =
289 let first = Char.code t.data.[pos] in
290 let offset = first land 0x7F in
291 if first land 0x80 = 0 then (offset, pos + 1)
292 else
293 let rec loop offset pos =
294 let b = Char.code t.data.[pos] in
295 let offset = ((offset + 1) lsl 7) lor (b land 0x7F) in
296 if b land 0x80 = 0 then (offset, pos + 1) else loop offset (pos + 1)
297 in
298 loop offset (pos + 1)
299
300(** Find the base type by following delta chain. *)
301let rec base_type t off =
302 let open Result.Syntax in
303 let* h, doff = read_entry_header_at t off in
304 match h.obj_type with
305 | Commit | Tree | Blob | Tag -> Ok (kind_of_obj_type h.obj_type)
306 | Ofs_delta ->
307 let rel, _ = read_ofs_offset_at t doff in
308 base_type t (off - rel)
309 | Ref_delta -> Error (`Msg "REF_DELTA in delta chain not supported yet")
310
311(** Read an object at the given offset, resolving deltas. Returns (kind, data).
312*)
313let rec read_object_at t offset =
314 let open Result.Syntax in
315 let* header, data_off = read_entry_header_at t offset in
316 match header.obj_type with
317 | Commit | Tree | Blob | Tag ->
318 let* data = inflate ~first:data_off t.data in
319 Ok (kind_of_obj_type header.obj_type, data)
320 | Ofs_delta ->
321 let rel_offset, delta_off = read_ofs_offset_at t data_off in
322 let source_offset = offset - rel_offset in
323 let* _kind, source = read_object_at t source_offset in
324 let* delta = inflate ~first:delta_off t.data in
325 let* target = apply_delta ~source ~delta in
326 let* kind = base_type t source_offset in
327 Ok (kind, target)
328 | Ref_delta ->
329 (* REF_DELTA requires an index to resolve *)
330 Error (`Msg "REF_DELTA requires pack index for resolution")
331
332(** Read an object at the given offset, also returning the offset of the next
333 entry. This avoids re-decompressing just to find the next position. *)
334let read_object_at_with_next t offset =
335 let open Result.Syntax in
336 let* header, data_off = read_entry_header_at t offset in
337 match header.obj_type with
338 | Commit | Tree | Blob | Tag ->
339 let* data, consumed = inflate_with_consumed ~first:data_off t.data in
340 Ok (kind_of_obj_type header.obj_type, data, data_off + consumed)
341 | Ofs_delta ->
342 let rel_offset, delta_off = read_ofs_offset_at t data_off in
343 let source_offset = offset - rel_offset in
344 let* _kind, source = read_object_at t source_offset in
345 let* delta, consumed = inflate_with_consumed ~first:delta_off t.data in
346 let* target = apply_delta ~source ~delta in
347 let* kind = base_type t source_offset in
348 Ok (kind, target, delta_off + consumed)
349 | Ref_delta ->
350 (* REF_DELTA requires an index to resolve *)
351 Error (`Msg "REF_DELTA requires pack index for resolution")
352
353(** Get the number of objects in the pack. *)
354let count t = t.header.count
355
356(** Get the pack version. *)
357let version t = t.header.version
358
359(** {1 Iterating over pack entries} *)
360
361type 'a entry_handler =
362 offset:int ->
363 kind:[ `Commit | `Tree | `Blob | `Tag ] ->
364 data:string ->
365 'a ->
366 'a
367(** Callback type for pack iteration. *)
368
369(** Iterate over all objects in a pack file. Each object is decompressed exactly
370 once; the next entry offset is tracked during decompression. *)
371let fold f acc t =
372 let rec loop acc offset n =
373 if n >= t.header.count then Ok acc
374 else
375 match read_object_at_with_next t offset with
376 | Error e -> Error e
377 | Ok (kind, data, next_offset) ->
378 let acc = f ~offset ~kind ~data acc in
379 loop acc next_offset (n + 1)
380 in
381 loop acc 12 0 (* Start after 12-byte header *)