fork of whitequark.org/git-pages with mods for tangled
at main 7.5 kB view raw
1package git_pages 2 3import ( 4 "context" 5 "errors" 6 "fmt" 7 "io" 8 "maps" 9 "net/url" 10 "os" 11 "slices" 12 13 "github.com/c2h5oh/datasize" 14 "github.com/go-git/go-billy/v6/osfs" 15 "github.com/go-git/go-git/v6" 16 "github.com/go-git/go-git/v6/plumbing" 17 "github.com/go-git/go-git/v6/plumbing/cache" 18 "github.com/go-git/go-git/v6/plumbing/filemode" 19 "github.com/go-git/go-git/v6/plumbing/object" 20 "github.com/go-git/go-git/v6/plumbing/protocol/packp" 21 "github.com/go-git/go-git/v6/plumbing/transport" 22 "github.com/go-git/go-git/v6/storage/filesystem" 23 "google.golang.org/protobuf/proto" 24) 25 26func FetchRepository( 27 ctx context.Context, repoURL string, branch string, oldManifest *Manifest, 28) ( 29 *Manifest, error, 30) { 31 span, ctx := ObserveFunction(ctx, "FetchRepository", 32 "git.repository", repoURL, "git.branch", branch) 33 defer span.Finish() 34 35 parsedRepoURL, err := url.Parse(repoURL) 36 if err != nil { 37 return nil, fmt.Errorf("URL parse: %w", err) 38 } 39 40 var repo *git.Repository 41 var storer *filesystem.Storage 42 for _, filter := range []packp.Filter{packp.FilterBlobNone(), packp.Filter("")} { 43 var tempDir string 44 tempDir, err = os.MkdirTemp("", "fetchRepo") 45 if err != nil { 46 return nil, fmt.Errorf("mkdtemp: %w", err) 47 } 48 defer os.RemoveAll(tempDir) 49 50 storer = filesystem.NewStorageWithOptions( 51 osfs.New(tempDir, osfs.WithBoundOS()), 52 cache.NewObjectLRUDefault(), 53 filesystem.Options{ 54 ExclusiveAccess: true, 55 LargeObjectThreshold: int64(config.Limits.GitLargeObjectThreshold.Bytes()), 56 }, 57 ) 58 repo, err = git.CloneContext(ctx, storer, nil, &git.CloneOptions{ 59 Bare: true, 60 URL: repoURL, 61 ReferenceName: plumbing.ReferenceName(branch), 62 SingleBranch: true, 63 Depth: 1, 64 Tags: git.NoTags, 65 Filter: filter, 66 }) 67 if err != nil { 68 logc.Printf(ctx, "clone err: %s %s filter=%q\n", repoURL, branch, filter) 69 continue 70 } else { 71 logc.Printf(ctx, "clone ok: %s %s filter=%q\n", repoURL, branch, filter) 72 break 73 } 74 } 75 if err != nil { 76 return nil, fmt.Errorf("git clone: %w", err) 77 } 78 79 ref, err := repo.Head() 80 if err != nil { 81 return nil, fmt.Errorf("git head: %w", err) 82 } 83 84 commit, err := repo.CommitObject(ref.Hash()) 85 if err != nil { 86 return nil, fmt.Errorf("git commit: %w", err) 87 } 88 89 tree, err := repo.TreeObject(commit.TreeHash) 90 if err != nil { 91 return nil, fmt.Errorf("git tree: %w", err) 92 } 93 94 walker := object.NewTreeWalker(tree, true, make(map[plumbing.Hash]bool)) 95 defer walker.Close() 96 97 // Create a manifest for the tree object corresponding to `branch`, but do not populate it 98 // with data yet; instead, record all the blobs we'll need. 99 manifest := &Manifest{ 100 RepoUrl: proto.String(repoURL), 101 Branch: proto.String(branch), 102 Commit: proto.String(ref.Hash().String()), 103 Contents: map[string]*Entry{ 104 "": {Type: Type_Directory.Enum()}, 105 }, 106 } 107 blobsNeeded := map[plumbing.Hash]*Entry{} 108 for { 109 name, entry, err := walker.Next() 110 if err == io.EOF { 111 break 112 } else if err != nil { 113 return nil, fmt.Errorf("git walker: %w", err) 114 } else { 115 manifestEntry := &Entry{} 116 if existingManifestEntry, found := blobsNeeded[entry.Hash]; found { 117 // If the same blob is present twice, we only need to fetch it once (and both 118 // instances will alias the same `Entry` structure in the manifest). 119 manifestEntry = existingManifestEntry 120 } else if entry.Mode.IsFile() { 121 blobsNeeded[entry.Hash] = manifestEntry 122 if entry.Mode == filemode.Symlink { 123 manifestEntry.Type = Type_Symlink.Enum() 124 } else { 125 manifestEntry.Type = Type_InlineFile.Enum() 126 } 127 manifestEntry.GitHash = proto.String(entry.Hash.String()) 128 } else if entry.Mode == filemode.Dir { 129 manifestEntry.Type = Type_Directory.Enum() 130 } else { 131 AddProblem(manifest, name, "unsupported mode %#o", entry.Mode) 132 continue 133 } 134 manifest.Contents[name] = manifestEntry 135 } 136 } 137 138 // Collect checkout statistics. 139 var dataBytesFromOldManifest int64 140 var dataBytesFromGitCheckout int64 141 var dataBytesFromGitTransport int64 142 143 // First, see if we can extract the blobs from the old manifest. This is the preferred option 144 // because it avoids both network transfers and recompression. Note that we do not request 145 // blobs from the backend under any circumstances to avoid creating a blob existence oracle. 146 for _, oldManifestEntry := range oldManifest.GetContents() { 147 if hash, ok := plumbing.FromHex(oldManifestEntry.GetGitHash()); ok { 148 if manifestEntry, found := blobsNeeded[hash]; found { 149 manifestEntry.Reset() 150 proto.Merge(manifestEntry, oldManifestEntry) 151 dataBytesFromOldManifest += oldManifestEntry.GetOriginalSize() 152 delete(blobsNeeded, hash) 153 } 154 } 155 } 156 157 // Second, fill the manifest entries with data from the git checkout we just made. 158 // This will only succeed if a `blob:none` filter isn't supported and we got a full 159 // clone despite asking for a partial clone. 160 for hash, manifestEntry := range blobsNeeded { 161 if err := readGitBlob(repo, hash, manifestEntry); err == nil { 162 dataBytesFromGitCheckout += manifestEntry.GetOriginalSize() 163 delete(blobsNeeded, hash) 164 } 165 } 166 167 // Third, if we still don't have data for some manifest entries, re-establish a git transport 168 // and request the missing blobs (only) from the server. 169 if len(blobsNeeded) > 0 { 170 client, err := transport.Get(parsedRepoURL.Scheme) 171 if err != nil { 172 return nil, fmt.Errorf("git transport: %w", err) 173 } 174 175 endpoint, err := transport.NewEndpoint(repoURL) 176 if err != nil { 177 return nil, fmt.Errorf("git endpoint: %w", err) 178 } 179 180 session, err := client.NewSession(storer, endpoint, nil) 181 if err != nil { 182 return nil, fmt.Errorf("git session: %w", err) 183 } 184 185 connection, err := session.Handshake(ctx, transport.UploadPackService) 186 if err != nil { 187 return nil, fmt.Errorf("git connection: %w", err) 188 } 189 defer connection.Close() 190 191 if err := connection.Fetch(ctx, &transport.FetchRequest{ 192 Wants: slices.Collect(maps.Keys(blobsNeeded)), 193 Depth: 1, 194 // Git CLI behaves like this, even if the wants above are references to blobs. 195 Filter: "blob:none", 196 }); err != nil && !errors.Is(err, transport.ErrNoChange) { 197 return nil, fmt.Errorf("git blob fetch request: %w", err) 198 } 199 200 // All remaining blobs should now be available. 201 for hash, manifestEntry := range blobsNeeded { 202 if err := readGitBlob(repo, hash, manifestEntry); err != nil { 203 return nil, err 204 } 205 dataBytesFromGitTransport += manifestEntry.GetOriginalSize() 206 delete(blobsNeeded, hash) 207 } 208 } 209 210 logc.Printf(ctx, 211 "fetch: %s from old manifest, %s from git checkout, %s from git transport\n", 212 datasize.ByteSize(dataBytesFromOldManifest).HR(), 213 datasize.ByteSize(dataBytesFromGitCheckout).HR(), 214 datasize.ByteSize(dataBytesFromGitTransport).HR(), 215 ) 216 217 return manifest, nil 218} 219 220func readGitBlob(repo *git.Repository, hash plumbing.Hash, entry *Entry) error { 221 blob, err := repo.BlobObject(hash) 222 if err != nil { 223 return fmt.Errorf("git blob %s: %w", hash, err) 224 } 225 226 reader, err := blob.Reader() 227 if err != nil { 228 return fmt.Errorf("git blob open: %w", err) 229 } 230 defer reader.Close() 231 232 data, err := io.ReadAll(reader) 233 if err != nil { 234 return fmt.Errorf("git blob read: %w", err) 235 } 236 237 switch entry.GetType() { 238 case Type_InlineFile, Type_Symlink: 239 // okay 240 default: 241 panic(fmt.Errorf("readGitBlob encountered invalid entry: %v, %v", 242 entry.GetType(), entry.GetTransform())) 243 } 244 245 entry.Data = data 246 entry.Transform = Transform_Identity.Enum() 247 entry.OriginalSize = proto.Int64(blob.Size) 248 entry.CompressedSize = proto.Int64(blob.Size) 249 return nil 250}