(* Copyright (c) 2013-2017 Thomas Gazagnaire Copyright (c) 2017-2024 Romain Calascibetta Copyright (c) 2024-2026 Thomas Gazagnaire Permission to use, copy, modify, and distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. *) (** Git pack file support. Pack files store multiple git objects in a compressed format with delta encoding for efficient storage. *) module Reader = Bytesrw.Bytes.Reader module Slice = Bytesrw.Bytes.Slice (** {1 Error helpers} *) let err_invalid_obj_type n = Error (`Msg (Fmt.str "Invalid pack object type: %d" n)) let err_unsupported_version v = Error (`Msg (Fmt.str "Unsupported pack version: %d" v)) (** {1 Pack file format} Pack file structure: - Header: "PACK" (4 bytes) + version (4 bytes, big-endian) + count (4 bytes) - Objects: variable-length entries - Footer: SHA-1 checksum (20 bytes) *) (** Object types in pack files. *) type obj_type = | Commit | Tree | Blob | Tag | Ofs_delta (** Delta relative to offset in same pack *) | Ref_delta (** Delta relative to object by hash *) let obj_type_to_int = function | Commit -> 1 | Tree -> 2 | Blob -> 3 | Tag -> 4 | Ofs_delta -> 6 | Ref_delta -> 7 let obj_type_of_int = function | 1 -> Ok Commit | 2 -> Ok Tree | 3 -> Ok Blob | 4 -> Ok Tag | 6 -> Ok Ofs_delta | 7 -> Ok Ref_delta | n -> err_invalid_obj_type n let pp_obj_type ppf = function | Commit -> Fmt.string ppf "commit" | Tree -> Fmt.string ppf "tree" | Blob -> Fmt.string ppf "blob" | Tag -> Fmt.string ppf "tag" | Ofs_delta -> Fmt.string ppf "ofs-delta" | Ref_delta -> Fmt.string ppf "ref-delta" type header = { version : int; count : int } (** Pack file header. *) let pp_header ppf h = Fmt.pf ppf "{ version = %d; count = %d }" h.version h.count type entry_header = { obj_type : obj_type; size : int; (** Uncompressed size *) } (** Entry header in pack file. *) (** {1 Reading pack files} *) (** Read big-endian 32-bit integer from string at offset. *) let int32_be s off = let b0 = Char.code s.[off] in let b1 = Char.code s.[off + 1] in let b2 = Char.code s.[off + 2] in let b3 = Char.code s.[off + 3] in (b0 lsl 24) lor (b1 lsl 16) lor (b2 lsl 8) lor b3 (** {1 Zlib decompression using bytesrw.zlib} We use bytesrw.zlib with [~leftover:true] to handle concatenated zlib streams in pack files. This positions the reader after each stream ends. *) (** Create a reader that serves bytes from [data] starting at [first]. Uses {!Bytes.unsafe_of_string} to avoid copying the string. *) let reader_of_string_at ~first data = let bytes = Bytes.unsafe_of_string data in let length = String.length data - first in let pos = ref 0 in let slice_length = min length (64 * 1024) in let read () = if !pos >= length then Slice.eod else let len = min slice_length (length - !pos) in let s = Slice.make bytes ~first:(first + !pos) ~length:len in pos := !pos + len; s in Reader.make ~slice_length read (** Shared buffer for decompression output. Avoids allocating a new {!Buffer.t} per inflate call -- [Buffer.clear] resets the length without freeing the backing storage, so after the first large decompression the buffer never reallocates. *) let inflate_buf = Buffer.create (64 * 1024) (** Decompress zlib-compressed data starting at [first] in [data]. Returns (decompressed_data, bytes_consumed). Zero-copy on input: reads directly from the string via {!reader_of_string_at}. Reuses a shared output buffer to avoid per-call allocation. *) let inflate_with_consumed ?(first = 0) data = try let base_reader = reader_of_string_at ~first data in let decompressed = Bytesrw_zlib.Zlib.decompress_reads ~leftover:true () base_reader in Buffer.clear inflate_buf; Reader.add_to_buffer inflate_buf decompressed; let output = Buffer.contents inflate_buf in Ok (output, Reader.pos base_reader) with exn -> Error (`Msg (Fmt.str "zlib decompression error: %s" (Printexc.to_string exn))) (** Decompress zlib-compressed data starting at [first] in [data]. *) let inflate ?first data = match inflate_with_consumed ?first data with | Ok (output, _consumed) -> Ok output | Error e -> Error e (** {1 Delta decoding} Delta format: - Source size (varint) - Target size (varint) - Commands: COPY or INSERT *) (** Read a variable-length integer from a string at given offset. Returns (value, new_offset). *) let read_varint_from_string s off = let rec loop value shift off = if off >= String.length s then (value, off) else let b = Char.code s.[off] in let value = value lor ((b land 0x7F) lsl shift) in if b land 0x80 = 0 then (value, off + 1) else loop value (shift + 7) (off + 1) in loop 0 0 off (** Parse copy offset from delta command byte and data. Returns (offset, next_pos). *) let parse_copy_offset delta cmd off = let read_if mask shift off acc = if cmd land mask <> 0 then (acc lor (Char.code delta.[off] lsl shift), off + 1) else (acc, off) in let offset, off = read_if 0x01 0 off 0 in let offset, off = read_if 0x02 8 off offset in let offset, off = read_if 0x04 16 off offset in let offset, off = read_if 0x08 24 off offset in (offset, off) (** Parse copy size from delta command byte and data. Returns (size, next_pos). *) let parse_copy_size delta cmd off = let read_if mask shift off acc = if cmd land mask <> 0 then (acc lor (Char.code delta.[off] lsl shift), off + 1) else (acc, off) in let size, off = read_if 0x10 0 off 0 in let size, off = read_if 0x20 8 off size in let size, off = read_if 0x40 16 off size in (* Size of 0 means 0x10000 *) let size = if size = 0 then 0x10000 else size in (size, off) (** Apply delta to source to produce target. *) let apply_delta ~source ~delta = let delta_len = String.length delta in if delta_len < 2 then Error (`Msg "Delta too short") else let source_size, off = read_varint_from_string delta 0 in if source_size <> String.length source then Error (`Msg (Fmt.str "Delta source size mismatch: expected %d, got %d" source_size (String.length source))) else let target_size, off = read_varint_from_string delta off in let target = Buffer.create target_size in let rec apply off = if off >= delta_len then if Buffer.length target = target_size then Ok (Buffer.contents target) else Error (`Msg (Fmt.str "Delta target size mismatch: expected %d, got %d" target_size (Buffer.length target))) else let cmd = Char.code delta.[off] in if cmd = 0 then Error (`Msg "Invalid delta command: 0") else if cmd land 0x80 <> 0 then begin (* COPY command *) let copy_offset, off = parse_copy_offset delta cmd (off + 1) in let copy_size, off = parse_copy_size delta cmd off in Buffer.add_substring target source copy_offset copy_size; apply off end else begin (* INSERT command - cmd is the number of literal bytes *) Buffer.add_substring target delta (off + 1) cmd; apply (off + 1 + cmd) end in apply off (** Object kind (base types only). *) let kind_of_obj_type = function | Commit -> `Commit | Tree -> `Tree | Blob -> `Blob | Tag -> `Tag | Ofs_delta | Ref_delta -> assert false (* Not base types *) (** {1 Pack reading with random access} *) type t = { header : header; data : string; (** Full pack file data for random access *) } (** A pack file opened for reading. *) (** Open a pack file from a string. *) let of_string data = if String.length data < 12 then Error (`Msg "Pack file too short") else let magic = String.sub data 0 4 in if magic <> "PACK" then Error (`Msg "Invalid pack file: bad magic") else let version = int32_be data 4 in if version <> 2 && version <> 3 then err_unsupported_version version else let count = int32_be data 8 in Ok { header = { version; count }; data } (** Read entry header at given offset. Returns (header, data_offset). *) let read_entry_header_at t offset = let open Result.Syntax in if offset >= String.length t.data then Error (`Msg "Offset beyond pack file") else let first = Char.code t.data.[offset] in let type_bits = (first lsr 4) land 0x07 in let* obj_type = obj_type_of_int type_bits in let size = first land 0x0F in let continue = first land 0x80 <> 0 in if not continue then Ok ({ obj_type; size }, offset + 1) else let rec loop size shift off = let b = Char.code t.data.[off] in let size = size lor ((b land 0x7F) lsl shift) in if b land 0x80 = 0 then Ok (size, off + 1) else loop size (shift + 7) (off + 1) in let* size, data_off = loop size 4 (offset + 1) in Ok ({ obj_type; size }, data_off) (** Read OFS_DELTA offset at given position. Returns (offset, next_pos). *) let read_ofs_offset_at t pos = let first = Char.code t.data.[pos] in let offset = first land 0x7F in if first land 0x80 = 0 then (offset, pos + 1) else let rec loop offset pos = let b = Char.code t.data.[pos] in let offset = ((offset + 1) lsl 7) lor (b land 0x7F) in if b land 0x80 = 0 then (offset, pos + 1) else loop offset (pos + 1) in loop offset (pos + 1) (** Find the base type by following delta chain. *) let rec base_type t off = let open Result.Syntax in let* h, doff = read_entry_header_at t off in match h.obj_type with | Commit | Tree | Blob | Tag -> Ok (kind_of_obj_type h.obj_type) | Ofs_delta -> let rel, _ = read_ofs_offset_at t doff in base_type t (off - rel) | Ref_delta -> Error (`Msg "REF_DELTA in delta chain not supported yet") (** Read an object at the given offset, resolving deltas. Returns (kind, data). *) let rec read_object_at t offset = let open Result.Syntax in let* header, data_off = read_entry_header_at t offset in match header.obj_type with | Commit | Tree | Blob | Tag -> let* data = inflate ~first:data_off t.data in Ok (kind_of_obj_type header.obj_type, data) | Ofs_delta -> let rel_offset, delta_off = read_ofs_offset_at t data_off in let source_offset = offset - rel_offset in let* _kind, source = read_object_at t source_offset in let* delta = inflate ~first:delta_off t.data in let* target = apply_delta ~source ~delta in let* kind = base_type t source_offset in Ok (kind, target) | Ref_delta -> (* REF_DELTA requires an index to resolve *) Error (`Msg "REF_DELTA requires pack index for resolution") (** Read an object at the given offset, also returning the offset of the next entry. This avoids re-decompressing just to find the next position. *) let read_object_at_with_next t offset = let open Result.Syntax in let* header, data_off = read_entry_header_at t offset in match header.obj_type with | Commit | Tree | Blob | Tag -> let* data, consumed = inflate_with_consumed ~first:data_off t.data in Ok (kind_of_obj_type header.obj_type, data, data_off + consumed) | Ofs_delta -> let rel_offset, delta_off = read_ofs_offset_at t data_off in let source_offset = offset - rel_offset in let* _kind, source = read_object_at t source_offset in let* delta, consumed = inflate_with_consumed ~first:delta_off t.data in let* target = apply_delta ~source ~delta in let* kind = base_type t source_offset in Ok (kind, target, delta_off + consumed) | Ref_delta -> (* REF_DELTA requires an index to resolve *) Error (`Msg "REF_DELTA requires pack index for resolution") (** Get the number of objects in the pack. *) let count t = t.header.count (** Get the pack version. *) let version t = t.header.version (** {1 Iterating over pack entries} *) type 'a entry_handler = offset:int -> kind:[ `Commit | `Tree | `Blob | `Tag ] -> data:string -> 'a -> 'a (** Callback type for pack iteration. *) (** Iterate over all objects in a pack file. Each object is decompressed exactly once; the next entry offset is tracked during decompression. *) let fold f acc t = let rec loop acc offset n = if n >= t.header.count then Ok acc else match read_object_at_with_next t offset with | Error e -> Error e | Ok (kind, data, next_offset) -> let acc = f ~offset ~kind ~data acc in loop acc next_offset (n + 1) in loop acc 12 0 (* Start after 12-byte header *)