(* Copyright (c) 2024-2026 Thomas Gazagnaire Permission to use, copy, modify, and distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. *) let src = Logs.Src.create "git.subtree" ~doc:"Git subtree operations" module L = (val Logs.src_log src : Logs.LOG) (** {1 Persistent cache} *) module Cache = struct type t = { tbl : (Hash.t, Hash.t) Hashtbl.t } let empty () = { tbl = Hashtbl.create 256 } let find t h = Hashtbl.find_opt t.tbl h let add t old_hash new_hash = Hashtbl.replace t.tbl old_hash new_hash let mem t h = Hashtbl.mem t.tbl h let remove t h = Hashtbl.remove t.tbl h let cache_dir_name prefix = (* Normalize prefix to a flat filename for cache storage. *) let s = String.map (fun c -> if c = '/' then '_' else c) prefix in (* Avoid "." as a filename (would conflict with the directory itself). *) if s = "." || s = ".." then "_root_" else s let load repo ~prefix = let t = empty () in let git_dir = Fpath.to_string (Repository.git_dir repo) in let fs = Repository.fs repo in let dir = cache_dir_name prefix in let path = Eio.Path.(fs / git_dir / "subtree-cache" / dir) in (try let data = Eio.Path.load path in let lines = String.split_on_char '\n' data in List.iter (fun line -> if String.length line >= 81 then begin (* Format: <40-hex-old> <40-hex-new> *) let old_hex = String.sub line 0 40 in let new_hex = String.sub line 41 40 in Hashtbl.replace t.tbl (Hash.of_hex old_hex) (Hash.of_hex new_hex) end) lines with Eio.Io _ | Invalid_argument _ -> ()); t let save repo ~prefix t = let git_dir = Fpath.to_string (Repository.git_dir repo) in let fs = Repository.fs repo in let dir = cache_dir_name prefix in let cache_dir = Eio.Path.(fs / git_dir / "subtree-cache") in (try Eio.Path.mkdir ~perm:0o755 cache_dir with Eio.Io _ -> ()); let path = Eio.Path.(cache_dir / dir) in let buf = Buffer.create (Hashtbl.length t.tbl * 82) in Hashtbl.iter (fun old_hash new_hash -> Buffer.add_string buf (Hash.to_hex old_hash); Buffer.add_char buf ' '; Buffer.add_string buf (Hash.to_hex new_hash); Buffer.add_char buf '\n') t.tbl; Eio.Path.save ~create:(`Or_truncate 0o644) path (Buffer.contents buf) let iter t f = Hashtbl.iter f t.tbl let length t = Hashtbl.length t.tbl let clear repo ~prefix = let git_dir = Fpath.to_string (Repository.git_dir repo) in let fs = Repository.fs repo in let dir = cache_dir_name prefix in let path = Eio.Path.(fs / git_dir / "subtree-cache" / dir) in try Eio.Path.unlink path with Eio.Io _ -> () end (** {1 Tree operations} *) let tree_at_prefix repo tree_hash prefix = (* Split prefix into path segments. O(d) where d = depth. *) let segments = String.split_on_char '/' prefix |> List.filter (fun s -> s <> "") in (* Walk the tree path, reading one tree object per segment. Each Tree.find is O(e) where e = number of entries (linear scan). *) let rec walk hash = function | [] -> Some hash | seg :: rest -> ( match Repository.read repo hash with | Ok (Value.Tree tree) -> ( match Tree.find ~name:seg tree with | Some entry when entry.perm = `Dir -> walk entry.hash rest | _ -> None) | _ -> None) in walk tree_hash segments (** {1 Split} *) (** Extract a metadata value from commit message. Looks for ": " pattern in the message lines. *) let extract_metadata key message = match message with | None -> None | Some msg -> let prefix = key ^ ": " in let prefix_len = String.length prefix in let rec find_in_lines = function | [] -> None | line :: rest -> let line = String.trim line in if String.length line > prefix_len && String.sub line 0 prefix_len = prefix then Some (String.sub line prefix_len (String.length line - prefix_len)) else find_in_lines rest in find_in_lines (String.split_on_char '\n' msg) (** Extract subtree directory from commit message if present. *) let extract_subtree_dir message = extract_metadata "git-subtree-dir" message (** Check if subtree is unchanged from first parent (copy_or_skip optimization). For merge commits, skip if the subtree didn't change from the first parent - this filters out unrelated cross-package merges from the history. *) let should_skip_commit repo new_parents sub_hash = match new_parents with | first_parent :: _ -> ( match Repository.read repo first_parent with | Ok (Value.Commit pc) -> let dominated = Hash.equal sub_hash (Commit.tree pc) in if not dominated then L.debug (fun m -> m "not skipping: sub=%s parent_tree=%s" (Hash.to_hex sub_hash) (Hash.to_hex (Commit.tree pc))); dominated | Ok _ -> L.debug (fun m -> m "not skipping: not a commit"); false | Error _ -> L.debug (fun m -> m "not skipping: read error"); false) | [] -> false (** Walk backward through ancestor chain to find the nearest commit that maps to a non-null split hash. This bridges over "gap" commits that don't contain the subtree (e.g. empty-tree commits or subtree-only commits that ended up in the main branch). *) let find_ancestor_split repo cache p = let rec walk visited h = if Hash.Set.mem h visited then None else match Cache.find cache h with | Some s when not (Hash.equal s Hash.null) -> Some s | _ -> ( let visited = Hash.Set.add h visited in match Repository.read repo h with | Ok (Value.Commit c) -> let rec try_parents = function | [] -> None | gp :: rest -> ( match walk visited gp with | Some _ as found -> found | None -> try_parents rest) in try_parents (Commit.parents c) | _ -> None) in walk Hash.Set.empty p (** Process a single commit for split operation. *) let process_split_commit repo cache prefix { Rev_list.hash; parents } = match Repository.read repo hash with | Error _ -> Cache.add cache hash Hash.null | Ok (Value.Commit commit) -> ( let tree_hash = Commit.tree commit in match tree_at_prefix repo tree_hash prefix with | None -> Cache.add cache hash Hash.null | Some sub_hash -> (* Map all parents through cache *) let new_parents = List.filter_map (fun p -> match Cache.find cache p with | Some h when not (Hash.equal h Hash.null) -> Some h | _ -> None) parents in (* If all parents mapped to null, try walking backward through ancestors to bridge over gap commits (e.g. empty-tree or subtree-only commits that don't contain this prefix). *) let new_parents = if new_parents <> [] then new_parents else List.filter_map (find_ancestor_split repo cache) parents in if should_skip_commit repo new_parents sub_hash then Cache.add cache hash (List.hd new_parents) else let new_commit = Commit.v ~tree:sub_hash ~author:(Commit.author commit) ~committer:(Commit.committer commit) ~parents:new_parents ~extra:(Commit.extra commit) (Commit.message commit) in Cache.add cache hash (Repository.write_commit repo new_commit)) | _ -> Cache.add cache hash Hash.null type verify_error = { original : Hash.t; split : Hash.t; reason : string } let verify_cache repo ~prefix cache = let errors = ref [] in let checked = ref 0 in Cache.iter cache (fun orig split -> incr checked; if not (Hash.equal split Hash.null) then match (Repository.read repo orig, Repository.read repo split) with | Ok (Value.Commit orig_c), Ok (Value.Commit split_c) -> ( (* Check tree matches subtree at prefix *) match tree_at_prefix repo (Commit.tree orig_c) prefix with | None -> errors := { original = orig; split; reason = "original has no subtree at prefix"; } :: !errors | Some expected_tree -> if not (Hash.equal expected_tree (Commit.tree split_c)) then begin let short h = String.sub (Hash.to_hex h) 0 7 in errors := { original = orig; split; reason = Fmt.str "tree mismatch: expected %s, got %s" (short expected_tree) (short (Commit.tree split_c)); } :: !errors end else if Commit.parents split_c = [] then begin (* Check parent consistency: if the split has no parents but the original has ancestors with non-null splits, the cache entry was created from a broken parent chain (e.g. gap commits with empty trees). *) let has_ancestor_split = List.exists (fun p -> match find_ancestor_split repo cache p with | Some _ -> true | None -> false) (Commit.parents orig_c) in if has_ancestor_split then errors := { original = orig; split; reason = "orphaned split: parents have reachable splits"; } :: !errors end) | Error _, _ -> errors := { original = orig; split; reason = "cannot read original commit" } :: !errors | _, Error _ -> errors := { original = orig; split; reason = "cannot read split commit" } :: !errors | _ -> ()); (!checked, List.rev !errors) let verify repo ~prefix () = let cache = Cache.load repo ~prefix in verify_cache repo ~prefix cache (** {1 Split (cont.)} *) let split_with_cache repo ~prefix ~head cache = match Rev_list.topo_sort_reverse repo head ~stop:(Cache.mem cache) with | Error e -> Error e | Ok commits -> List.iter (process_split_commit repo cache prefix) commits; Cache.save repo ~prefix cache; Ok (match Cache.find cache head with | Some h when Hash.equal h Hash.null -> None | other -> other) let split repo ~prefix ~head () = let cache = Cache.load repo ~prefix in match Cache.find cache head with | Some h -> (* Cache hit — verify the result is still valid before returning it. *) let _checked, errors = verify_cache repo ~prefix cache in if errors <> [] then begin L.info (fun m -> m "Repairing cache for %s (%d bad entries)" prefix (List.length errors)); List.iter (fun e -> Cache.remove cache e.original) errors; Cache.save repo ~prefix cache; split_with_cache repo ~prefix ~head cache end else Ok (if Hash.equal h Hash.null then None else Some h) | None -> (* Cache miss — process new commits; find_ancestor_split handles gaps. *) split_with_cache repo ~prefix ~head cache (** {1 Add} *) let insert_tree_at_prefix repo base_tree_hash prefix subtree_hash = (* Split prefix into path segments. *) let segments = String.split_on_char '/' prefix |> List.filter (fun s -> s <> "") in (* Recursively build trees from the deepest level up. For each level, we need to either modify an existing tree or create a new one. *) let rec build_trees current_tree_hash = function | [] -> (* No more segments - replace with subtree *) Ok subtree_hash | [ name ] -> ( (* Last segment - insert subtree here *) match Repository.read repo current_tree_hash with | Error e -> Error e | Ok (Value.Tree tree) -> let new_tree = tree |> Tree.remove ~name |> Tree.add (Tree.entry ~perm:`Dir ~name subtree_hash) in Ok (Repository.write_tree repo new_tree) | _ -> Error (`Msg "Expected tree object")) | name :: rest -> ( (* Intermediate segment - descend or create *) match Repository.read repo current_tree_hash with | Error e -> Error e | Ok (Value.Tree tree) -> ( let existing_entry = Tree.find ~name tree in let child_hash = match existing_entry with | Some entry when entry.perm = `Dir -> entry.hash | _ -> (* No existing dir or not a dir - use empty tree *) Repository.write_tree repo Tree.empty in match build_trees child_hash rest with | Error e -> Error e | Ok new_child_hash -> let new_tree = tree |> Tree.remove ~name |> Tree.add (Tree.entry ~perm:`Dir ~name new_child_hash) in Ok (Repository.write_tree repo new_tree)) | _ -> Error (`Msg "Expected tree object")) in match segments with | [] -> (* Empty prefix means replace root tree entirely *) Ok subtree_hash | _ -> build_trees base_tree_hash segments (** Build a nested tree structure from a list of path segments. *) let build_nested_tree repo remote_tree segments = let rec build = function | [] -> remote_tree | [ name ] -> Repository.write_tree repo (Tree.v [ Tree.entry ~perm:`Dir ~name remote_tree ]) | name :: rest -> let child = build rest in Repository.write_tree repo (Tree.v [ Tree.entry ~perm:`Dir ~name child ]) in build segments (** Default message for add/merge operations. *) let default_add_message op prefix commit = Fmt.str "%s '%s' from commit %s\n" op prefix (Hash.to_hex commit) let add repo ~prefix ~commit ~author ~committer ?message () = match Repository.read repo commit with | Error e -> Error e | Ok (Value.Commit remote_commit) -> ( let remote_tree = Commit.tree remote_commit in let msg = Option.value message ~default:(default_add_message "Add" prefix commit) in match Repository.head repo with | None -> let segments = String.split_on_char '/' prefix |> List.filter (( <> ) "") in let root_tree = build_nested_tree repo remote_tree segments in let new_commit = Commit.v ~tree:root_tree ~author ~committer ~parents:[ commit ] (Some msg) in let new_hash = Repository.write_commit repo new_commit in Repository.advance_head repo new_hash; Ok new_hash | Some head_hash -> ( match Repository.read repo head_hash with | Error e -> Error e | Ok (Value.Commit head_commit) -> ( match insert_tree_at_prefix repo (Commit.tree head_commit) prefix remote_tree with | Error e -> Error e | Ok new_tree -> let new_commit = Commit.v ~tree:new_tree ~author ~committer ~parents:[ head_hash; commit ] (Some msg) in let new_hash = Repository.write_commit repo new_commit in Repository.advance_head repo new_hash; Ok new_hash) | _ -> Error (`Msg "HEAD does not point to a commit"))) | _ -> Error (`Msg "Not a commit object") let merge repo ~prefix ~commit ~author ~committer ?message () = (* Get the tree from the commit we're merging *) match Repository.read repo commit with | Error e -> Error e | Ok (Value.Commit remote_commit) -> ( let remote_tree = Commit.tree remote_commit in (* Get current HEAD *) match Repository.head repo with | None -> Error (`Msg "No HEAD - use add for initial subtree") | Some head_hash -> ( match Repository.read repo head_hash with | Error e -> Error e | Ok (Value.Commit head_commit) -> ( let base_tree = Commit.tree head_commit in (* Check that subtree exists at prefix *) match tree_at_prefix repo base_tree prefix with | None -> Error (`Msg ("Subtree not found at prefix: " ^ prefix)) | Some _ -> ( (* Replace the subtree at prefix with the remote tree *) match insert_tree_at_prefix repo base_tree prefix remote_tree with | Error e -> Error e | Ok new_tree -> let message = match message with | Some m -> m | None -> Fmt.str "Merge '%s' from commit %s\n" prefix (Hash.to_hex commit) in (* Create merge commit with two parents *) let new_commit = Commit.v ~tree:new_tree ~author ~committer ~parents:[ head_hash; commit ] (Some message) in let new_hash = Repository.write_commit repo new_commit in Repository.advance_head repo new_hash; Ok new_hash)) | _ -> Error (`Msg "HEAD does not point to a commit"))) | _ -> Error (`Msg "Not a commit object") (** {1 Check and Fix} *) type issue = { commit : Hash.t; message : string; subtree_dir : string option } (** Check if a commit message indicates a subtree merge for a different package. *) let is_unrelated_merge ~prefix message = match extract_subtree_dir message with | None -> None (* Not a subtree merge *) | Some dir -> if String.equal dir prefix || String.starts_with ~prefix:(prefix ^ "/") dir || String.starts_with ~prefix:(dir ^ "/") prefix then None (* Related to our prefix *) else Some dir let check repo ~prefix ~head () = let issues = ref [] in let checked = ref 0 in (* Walk the commit history *) let rec walk visited hash = if Hash.equal hash Hash.null || Hashtbl.mem visited hash then () else begin Hashtbl.add visited hash (); match Repository.read repo hash with | Ok (Value.Commit commit) -> incr checked; let message = Commit.message commit in (* Check if this is an unrelated subtree merge *) (match message with | Some msg -> ( match is_unrelated_merge ~prefix message with | Some dir -> issues := { commit = hash; message = msg; subtree_dir = Some dir } :: !issues | None -> ()) | None -> ()); (* Continue walking parents *) List.iter (walk visited) (Commit.parents commit) | _ -> () end in let visited = Hashtbl.create 1024 in walk visited head; (!checked, List.rev !issues) (** Process a single commit for fix rewriting. Determines if the commit is a self-merge, unrelated merge, or regular commit, and either skips it or rewrites it with remapped parents. *) let rewrite_commit repo ~prefix ~cache commit hash parents = let message = Commit.message commit in let tree = Commit.tree commit in (* Get remapped parents, filtering out null hashes *) let new_parents = List.filter_map (fun p -> match Hashtbl.find_opt cache p with | Some h when not (Hash.equal h Hash.null) -> Some h | _ -> None) parents in (* Check subtree merge type *) let subtree_dir = extract_subtree_dir message in let is_unrelated = match subtree_dir with | None -> false | Some dir -> not (String.equal dir prefix || String.starts_with ~prefix:(prefix ^ "/") dir || String.starts_with ~prefix:(dir ^ "/") prefix) in let is_self_merge = match subtree_dir with | None -> false | Some dir -> String.equal dir prefix || String.starts_with ~prefix:(prefix ^ "/") dir || String.starts_with ~prefix:(dir ^ "/") prefix in (* Determine action: - Self-merges: skip if tree unchanged from first parent (linearize) - Unrelated merges: skip if tree unchanged from first parent - Otherwise: keep the commit with remapped parents *) let action = if is_self_merge || is_unrelated then (* Skip if tree unchanged from first parent *) match new_parents with | first :: _ -> ( match Repository.read repo first with | Ok (Value.Commit pc) when Hash.equal tree (Commit.tree pc) -> `Skip_to first | _ -> `Keep) | [] -> `Keep else `Keep in match action with | `Skip_to parent -> Hashtbl.add cache hash parent | `Keep -> let new_commit = Commit.v ~tree ~author:(Commit.author commit) ~committer:(Commit.committer commit) ~parents:new_parents ~extra:(Commit.extra commit) message in Hashtbl.add cache hash (Repository.write_commit repo new_commit) let fix repo ~prefix ~head () = (* Rewrite history, removing subtree merge commits: 1. Unrelated merges (git-subtree-dir for a different prefix) - skip if tree unchanged from first parent 2. Self-merges (git-subtree-dir matches our prefix) - follow mainline parent to linearize history *) let cache = Hashtbl.create 1024 in (* Process commits in reverse topological order *) match Rev_list.topo_sort_reverse repo head ~stop:(fun _ -> false) with | Error e -> Error e | Ok commits -> List.iter (fun { Rev_list.hash; parents } -> match Repository.read repo hash with | Ok (Value.Commit commit) -> rewrite_commit repo ~prefix ~cache commit hash parents | Ok (Value.Blob _ | Value.Tree _ | Value.Tag _) -> Hashtbl.add cache hash Hash.null | Error _ -> Hashtbl.add cache hash Hash.null) commits; Ok (Hashtbl.find_opt cache head) type mono_issue = { mono_commit : Hash.t; mono_message : string; is_empty : bool; } let check_mono repo ~head () = match Rev_list.topo_sort_reverse repo head ~stop:(fun _ -> false) with | Error _ -> (0, []) | Ok commits -> let issues = ref [] in let count = ref 0 in List.iter (fun { Rev_list.hash; parents } -> incr count; match Repository.read repo hash with | Ok (Value.Commit commit) -> let message = Option.value ~default:"" (Commit.message commit) in let tree = Commit.tree commit in let is_empty = match parents with | first :: _ -> ( match Repository.read repo first with | Ok (Value.Commit pc) -> Hash.equal tree (Commit.tree pc) | _ -> false) | [] -> false in if is_empty then issues := { mono_commit = hash; mono_message = message; is_empty } :: !issues | _ -> ()) commits; (!count, List.rev !issues) let fix_mono repo ~head () = (* Rewrite history, removing all empty commits. *) let cache = Hashtbl.create 1024 in match Rev_list.topo_sort_reverse repo head ~stop:(fun _ -> false) with | Error e -> Error e | Ok commits -> List.iter (fun { Rev_list.hash; parents } -> match Repository.read repo hash with | Ok (Value.Commit commit) -> let message = Commit.message commit in let tree = Commit.tree commit in (* Get remapped parents, filtering out null hashes *) let new_parents = List.filter_map (fun p -> match Hashtbl.find_opt cache p with | Some h when not (Hash.equal h Hash.null) -> Some h | None -> Some p (* Parent not in cache, keep original *) | _ -> None) parents in (* Check if this is an empty commit (tree unchanged from first parent) *) let is_empty = match new_parents with | first :: _ -> ( match Repository.read repo first with | Ok (Value.Commit pc) -> Hash.equal tree (Commit.tree pc) | _ -> false) | [] -> false in if is_empty then (* Skip to first parent *) match new_parents with | first :: _ -> Hashtbl.add cache hash first | [] -> Hashtbl.add cache hash Hash.null else (* Keep the commit with remapped parents *) let new_commit = Commit.v ~tree ~author:(Commit.author commit) ~committer:(Commit.committer commit) ~parents:new_parents ~extra:(Commit.extra commit) message in Hashtbl.add cache hash (Repository.write_commit repo new_commit) | Ok (Value.Blob _ | Value.Tree _ | Value.Tag _) -> Hashtbl.add cache hash Hash.null | Error _ -> Hashtbl.add cache hash Hash.null) commits; Ok (Hashtbl.find_opt cache head)