Git object storage and pack files for Eio
at main 381 lines 13 kB view raw
1(* Copyright (c) 2013-2017 Thomas Gazagnaire <thomas@gazagnaire.org> 2 Copyright (c) 2017-2024 Romain Calascibetta <romain.calascibetta@gmail.com> 3 Copyright (c) 2024-2026 Thomas Gazagnaire <thomas@gazagnaire.org> 4 5 Permission to use, copy, modify, and distribute this software for any 6 purpose with or without fee is hereby granted, provided that the above 7 copyright notice and this permission notice appear in all copies. 8 9 THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. *) 16 17(** Git pack file support. 18 19 Pack files store multiple git objects in a compressed format with delta 20 encoding for efficient storage. *) 21 22module Reader = Bytesrw.Bytes.Reader 23module Slice = Bytesrw.Bytes.Slice 24 25(** {1 Error helpers} *) 26 27let err_invalid_obj_type n = 28 Error (`Msg (Fmt.str "Invalid pack object type: %d" n)) 29 30let err_unsupported_version v = 31 Error (`Msg (Fmt.str "Unsupported pack version: %d" v)) 32 33(** {1 Pack file format} 34 35 Pack file structure: 36 - Header: "PACK" (4 bytes) + version (4 bytes, big-endian) + count (4 bytes) 37 - Objects: variable-length entries 38 - Footer: SHA-1 checksum (20 bytes) *) 39 40(** Object types in pack files. *) 41type obj_type = 42 | Commit 43 | Tree 44 | Blob 45 | Tag 46 | Ofs_delta (** Delta relative to offset in same pack *) 47 | Ref_delta (** Delta relative to object by hash *) 48 49let obj_type_to_int = function 50 | Commit -> 1 51 | Tree -> 2 52 | Blob -> 3 53 | Tag -> 4 54 | Ofs_delta -> 6 55 | Ref_delta -> 7 56 57let obj_type_of_int = function 58 | 1 -> Ok Commit 59 | 2 -> Ok Tree 60 | 3 -> Ok Blob 61 | 4 -> Ok Tag 62 | 6 -> Ok Ofs_delta 63 | 7 -> Ok Ref_delta 64 | n -> err_invalid_obj_type n 65 66let pp_obj_type ppf = function 67 | Commit -> Fmt.string ppf "commit" 68 | Tree -> Fmt.string ppf "tree" 69 | Blob -> Fmt.string ppf "blob" 70 | Tag -> Fmt.string ppf "tag" 71 | Ofs_delta -> Fmt.string ppf "ofs-delta" 72 | Ref_delta -> Fmt.string ppf "ref-delta" 73 74type header = { version : int; count : int } 75(** Pack file header. *) 76 77let pp_header ppf h = 78 Fmt.pf ppf "{ version = %d; count = %d }" h.version h.count 79 80type entry_header = { 81 obj_type : obj_type; 82 size : int; (** Uncompressed size *) 83} 84(** Entry header in pack file. *) 85 86(** {1 Reading pack files} *) 87 88(** Read big-endian 32-bit integer from string at offset. *) 89let int32_be s off = 90 let b0 = Char.code s.[off] in 91 let b1 = Char.code s.[off + 1] in 92 let b2 = Char.code s.[off + 2] in 93 let b3 = Char.code s.[off + 3] in 94 (b0 lsl 24) lor (b1 lsl 16) lor (b2 lsl 8) lor b3 95 96(** {1 Zlib decompression using bytesrw.zlib} 97 98 We use bytesrw.zlib with [~leftover:true] to handle concatenated zlib 99 streams in pack files. This positions the reader after each stream ends. *) 100 101(** Create a reader that serves bytes from [data] starting at [first]. Uses 102 {!Bytes.unsafe_of_string} to avoid copying the string. *) 103let reader_of_string_at ~first data = 104 let bytes = Bytes.unsafe_of_string data in 105 let length = String.length data - first in 106 let pos = ref 0 in 107 let slice_length = min length (64 * 1024) in 108 let read () = 109 if !pos >= length then Slice.eod 110 else 111 let len = min slice_length (length - !pos) in 112 let s = Slice.make bytes ~first:(first + !pos) ~length:len in 113 pos := !pos + len; 114 s 115 in 116 Reader.make ~slice_length read 117 118(** Shared buffer for decompression output. Avoids allocating a new {!Buffer.t} 119 per inflate call -- [Buffer.clear] resets the length without freeing the 120 backing storage, so after the first large decompression the buffer never 121 reallocates. *) 122let inflate_buf = Buffer.create (64 * 1024) 123 124(** Decompress zlib-compressed data starting at [first] in [data]. Returns 125 (decompressed_data, bytes_consumed). Zero-copy on input: reads directly from 126 the string via {!reader_of_string_at}. Reuses a shared output buffer to 127 avoid per-call allocation. *) 128let inflate_with_consumed ?(first = 0) data = 129 try 130 let base_reader = reader_of_string_at ~first data in 131 let decompressed = 132 Bytesrw_zlib.Zlib.decompress_reads ~leftover:true () base_reader 133 in 134 Buffer.clear inflate_buf; 135 Reader.add_to_buffer inflate_buf decompressed; 136 let output = Buffer.contents inflate_buf in 137 Ok (output, Reader.pos base_reader) 138 with exn -> 139 Error 140 (`Msg (Fmt.str "zlib decompression error: %s" (Printexc.to_string exn))) 141 142(** Decompress zlib-compressed data starting at [first] in [data]. *) 143let inflate ?first data = 144 match inflate_with_consumed ?first data with 145 | Ok (output, _consumed) -> Ok output 146 | Error e -> Error e 147 148(** {1 Delta decoding} 149 150 Delta format: 151 - Source size (varint) 152 - Target size (varint) 153 - Commands: COPY or INSERT *) 154 155(** Read a variable-length integer from a string at given offset. Returns 156 (value, new_offset). *) 157let read_varint_from_string s off = 158 let rec loop value shift off = 159 if off >= String.length s then (value, off) 160 else 161 let b = Char.code s.[off] in 162 let value = value lor ((b land 0x7F) lsl shift) in 163 if b land 0x80 = 0 then (value, off + 1) 164 else loop value (shift + 7) (off + 1) 165 in 166 loop 0 0 off 167 168(** Parse copy offset from delta command byte and data. Returns (offset, 169 next_pos). *) 170let parse_copy_offset delta cmd off = 171 let read_if mask shift off acc = 172 if cmd land mask <> 0 then 173 (acc lor (Char.code delta.[off] lsl shift), off + 1) 174 else (acc, off) 175 in 176 let offset, off = read_if 0x01 0 off 0 in 177 let offset, off = read_if 0x02 8 off offset in 178 let offset, off = read_if 0x04 16 off offset in 179 let offset, off = read_if 0x08 24 off offset in 180 (offset, off) 181 182(** Parse copy size from delta command byte and data. Returns (size, next_pos). 183*) 184let parse_copy_size delta cmd off = 185 let read_if mask shift off acc = 186 if cmd land mask <> 0 then 187 (acc lor (Char.code delta.[off] lsl shift), off + 1) 188 else (acc, off) 189 in 190 let size, off = read_if 0x10 0 off 0 in 191 let size, off = read_if 0x20 8 off size in 192 let size, off = read_if 0x40 16 off size in 193 (* Size of 0 means 0x10000 *) 194 let size = if size = 0 then 0x10000 else size in 195 (size, off) 196 197(** Apply delta to source to produce target. *) 198let apply_delta ~source ~delta = 199 let delta_len = String.length delta in 200 if delta_len < 2 then Error (`Msg "Delta too short") 201 else 202 let source_size, off = read_varint_from_string delta 0 in 203 if source_size <> String.length source then 204 Error 205 (`Msg 206 (Fmt.str "Delta source size mismatch: expected %d, got %d" 207 source_size (String.length source))) 208 else 209 let target_size, off = read_varint_from_string delta off in 210 let target = Buffer.create target_size in 211 let rec apply off = 212 if off >= delta_len then 213 if Buffer.length target = target_size then Ok (Buffer.contents target) 214 else 215 Error 216 (`Msg 217 (Fmt.str "Delta target size mismatch: expected %d, got %d" 218 target_size (Buffer.length target))) 219 else 220 let cmd = Char.code delta.[off] in 221 if cmd = 0 then Error (`Msg "Invalid delta command: 0") 222 else if cmd land 0x80 <> 0 then begin 223 (* COPY command *) 224 let copy_offset, off = parse_copy_offset delta cmd (off + 1) in 225 let copy_size, off = parse_copy_size delta cmd off in 226 Buffer.add_substring target source copy_offset copy_size; 227 apply off 228 end 229 else begin 230 (* INSERT command - cmd is the number of literal bytes *) 231 Buffer.add_substring target delta (off + 1) cmd; 232 apply (off + 1 + cmd) 233 end 234 in 235 apply off 236 237(** Object kind (base types only). *) 238let kind_of_obj_type = function 239 | Commit -> `Commit 240 | Tree -> `Tree 241 | Blob -> `Blob 242 | Tag -> `Tag 243 | Ofs_delta | Ref_delta -> assert false (* Not base types *) 244 245(** {1 Pack reading with random access} *) 246 247type t = { 248 header : header; 249 data : string; (** Full pack file data for random access *) 250} 251(** A pack file opened for reading. *) 252 253(** Open a pack file from a string. *) 254let of_string data = 255 if String.length data < 12 then Error (`Msg "Pack file too short") 256 else 257 let magic = String.sub data 0 4 in 258 if magic <> "PACK" then Error (`Msg "Invalid pack file: bad magic") 259 else 260 let version = int32_be data 4 in 261 if version <> 2 && version <> 3 then err_unsupported_version version 262 else 263 let count = int32_be data 8 in 264 Ok { header = { version; count }; data } 265 266(** Read entry header at given offset. Returns (header, data_offset). *) 267let read_entry_header_at t offset = 268 let open Result.Syntax in 269 if offset >= String.length t.data then Error (`Msg "Offset beyond pack file") 270 else 271 let first = Char.code t.data.[offset] in 272 let type_bits = (first lsr 4) land 0x07 in 273 let* obj_type = obj_type_of_int type_bits in 274 let size = first land 0x0F in 275 let continue = first land 0x80 <> 0 in 276 if not continue then Ok ({ obj_type; size }, offset + 1) 277 else 278 let rec loop size shift off = 279 let b = Char.code t.data.[off] in 280 let size = size lor ((b land 0x7F) lsl shift) in 281 if b land 0x80 = 0 then Ok (size, off + 1) 282 else loop size (shift + 7) (off + 1) 283 in 284 let* size, data_off = loop size 4 (offset + 1) in 285 Ok ({ obj_type; size }, data_off) 286 287(** Read OFS_DELTA offset at given position. Returns (offset, next_pos). *) 288let read_ofs_offset_at t pos = 289 let first = Char.code t.data.[pos] in 290 let offset = first land 0x7F in 291 if first land 0x80 = 0 then (offset, pos + 1) 292 else 293 let rec loop offset pos = 294 let b = Char.code t.data.[pos] in 295 let offset = ((offset + 1) lsl 7) lor (b land 0x7F) in 296 if b land 0x80 = 0 then (offset, pos + 1) else loop offset (pos + 1) 297 in 298 loop offset (pos + 1) 299 300(** Find the base type by following delta chain. *) 301let rec base_type t off = 302 let open Result.Syntax in 303 let* h, doff = read_entry_header_at t off in 304 match h.obj_type with 305 | Commit | Tree | Blob | Tag -> Ok (kind_of_obj_type h.obj_type) 306 | Ofs_delta -> 307 let rel, _ = read_ofs_offset_at t doff in 308 base_type t (off - rel) 309 | Ref_delta -> Error (`Msg "REF_DELTA in delta chain not supported yet") 310 311(** Read an object at the given offset, resolving deltas. Returns (kind, data). 312*) 313let rec read_object_at t offset = 314 let open Result.Syntax in 315 let* header, data_off = read_entry_header_at t offset in 316 match header.obj_type with 317 | Commit | Tree | Blob | Tag -> 318 let* data = inflate ~first:data_off t.data in 319 Ok (kind_of_obj_type header.obj_type, data) 320 | Ofs_delta -> 321 let rel_offset, delta_off = read_ofs_offset_at t data_off in 322 let source_offset = offset - rel_offset in 323 let* _kind, source = read_object_at t source_offset in 324 let* delta = inflate ~first:delta_off t.data in 325 let* target = apply_delta ~source ~delta in 326 let* kind = base_type t source_offset in 327 Ok (kind, target) 328 | Ref_delta -> 329 (* REF_DELTA requires an index to resolve *) 330 Error (`Msg "REF_DELTA requires pack index for resolution") 331 332(** Read an object at the given offset, also returning the offset of the next 333 entry. This avoids re-decompressing just to find the next position. *) 334let read_object_at_with_next t offset = 335 let open Result.Syntax in 336 let* header, data_off = read_entry_header_at t offset in 337 match header.obj_type with 338 | Commit | Tree | Blob | Tag -> 339 let* data, consumed = inflate_with_consumed ~first:data_off t.data in 340 Ok (kind_of_obj_type header.obj_type, data, data_off + consumed) 341 | Ofs_delta -> 342 let rel_offset, delta_off = read_ofs_offset_at t data_off in 343 let source_offset = offset - rel_offset in 344 let* _kind, source = read_object_at t source_offset in 345 let* delta, consumed = inflate_with_consumed ~first:delta_off t.data in 346 let* target = apply_delta ~source ~delta in 347 let* kind = base_type t source_offset in 348 Ok (kind, target, delta_off + consumed) 349 | Ref_delta -> 350 (* REF_DELTA requires an index to resolve *) 351 Error (`Msg "REF_DELTA requires pack index for resolution") 352 353(** Get the number of objects in the pack. *) 354let count t = t.header.count 355 356(** Get the pack version. *) 357let version t = t.header.version 358 359(** {1 Iterating over pack entries} *) 360 361type 'a entry_handler = 362 offset:int -> 363 kind:[ `Commit | `Tree | `Blob | `Tag ] -> 364 data:string -> 365 'a -> 366 'a 367(** Callback type for pack iteration. *) 368 369(** Iterate over all objects in a pack file. Each object is decompressed exactly 370 once; the next entry offset is tracked during decompression. *) 371let fold f acc t = 372 let rec loop acc offset n = 373 if n >= t.header.count then Ok acc 374 else 375 match read_object_at_with_next t offset with 376 | Error e -> Error e 377 | Ok (kind, data, next_offset) -> 378 let acc = f ~offset ~kind ~data acc in 379 loop acc next_offset (n + 1) 380 in 381 loop acc 12 0 (* Start after 12-byte header *)