[mirror] Scalable static site server for Git forges (like GitHub Pages)
at v0.3.1
package git_pages

import (
	"context"
	"errors"
	"fmt"
	"io"
	"maps"
	"net/url"
	"os"
	"slices"

	"github.com/c2h5oh/datasize"
	"github.com/go-git/go-billy/v6/osfs"
	"github.com/go-git/go-git/v6"
	"github.com/go-git/go-git/v6/plumbing"
	"github.com/go-git/go-git/v6/plumbing/cache"
	"github.com/go-git/go-git/v6/plumbing/filemode"
	"github.com/go-git/go-git/v6/plumbing/object"
	"github.com/go-git/go-git/v6/plumbing/protocol/packp"
	"github.com/go-git/go-git/v6/plumbing/transport"
	"github.com/go-git/go-git/v6/storage/filesystem"
	"google.golang.org/protobuf/proto"
)

func FetchRepository(
	ctx context.Context, repoURL string, branch string, oldManifest *Manifest,
) (
	*Manifest, error,
) {
	span, ctx := ObserveFunction(ctx, "FetchRepository",
		"git.repository", repoURL, "git.branch", branch)
	defer span.Finish()

	parsedRepoURL, err := url.Parse(repoURL)
	if err != nil {
		return nil, fmt.Errorf("URL parse: %w", err)
	}

	var repo *git.Repository
	var storer *filesystem.Storage
	for _, filter := range []packp.Filter{packp.FilterBlobNone(), packp.Filter("")} {
		var tempDir string
		if tempDir, err = os.MkdirTemp("", "fetchRepo"); err != nil {
			return nil, fmt.Errorf("mkdtemp: %w", err)
		}
		defer os.RemoveAll(tempDir)

		storer = filesystem.NewStorageWithOptions(
			osfs.New(tempDir, osfs.WithBoundOS()),
			cache.NewObjectLRUDefault(),
			filesystem.Options{
				ExclusiveAccess:      true,
				LargeObjectThreshold: int64(config.Limits.GitLargeObjectThreshold.Bytes()),
			},
		)
		repo, err = git.CloneContext(ctx, storer, nil, &git.CloneOptions{
			Bare:          true,
			URL:           repoURL,
			ReferenceName: plumbing.ReferenceName(branch),
			SingleBranch:  true,
			Depth:         1,
			Tags:          git.NoTags,
			Filter:        filter,
		})
		if err != nil {
			logc.Printf(ctx, "clone err: %s %s filter=%q\n", repoURL, branch, filter)
			continue
		} else {
			logc.Printf(ctx, "clone ok: %s %s filter=%q\n", repoURL, branch, filter)
			break
		}
	}
	if err != nil {
		return nil, fmt.Errorf("git clone: %w", err)
	}

	ref, err := repo.Head()
	if err != nil {
		return nil, fmt.Errorf("git head: %w", err)
	}

	commit, err := repo.CommitObject(ref.Hash())
	if err != nil {
		return nil, fmt.Errorf("git commit: %w", err)
	}

	tree, err := repo.TreeObject(commit.TreeHash)
	if err != nil {
		return nil, fmt.Errorf("git tree: %w", err)
	}

	walker := object.NewTreeWalker(tree, true, make(map[plumbing.Hash]bool))
	defer walker.Close()

	// Create a manifest for the tree object corresponding to `branch`, but do not populate it
	// with data yet; instead, record all the blobs we'll need.
	manifest := NewManifest()
	manifest.RepoUrl = proto.String(repoURL)
	manifest.Branch = proto.String(branch)
	manifest.Commit = proto.String(ref.Hash().String())
	blobsNeeded := map[plumbing.Hash]*Entry{}
	for {
		name, entry, err := walker.Next()
		if err == io.EOF {
			break
		} else if err != nil {
			return nil, fmt.Errorf("git walker: %w", err)
		} else {
			manifestEntry := &Entry{}
			if existingManifestEntry, found := blobsNeeded[entry.Hash]; found {
				// If the same blob is present twice, we only need to fetch it once (and both
				// instances will alias the same `Entry` structure in the manifest).
				manifestEntry = existingManifestEntry
			} else if entry.Mode.IsFile() {
				blobsNeeded[entry.Hash] = manifestEntry
				if entry.Mode == filemode.Symlink {
					manifestEntry.Type = Type_Symlink.Enum()
				} else {
					manifestEntry.Type = Type_InlineFile.Enum()
				}
				manifestEntry.GitHash = proto.String(entry.Hash.String())
			} else if entry.Mode == filemode.Dir {
				manifestEntry.Type = Type_Directory.Enum()
			} else {
				AddProblem(manifest, name, "unsupported mode %#o", entry.Mode)
				continue
			}
			manifest.Contents[name] = manifestEntry
		}
	}

	// Collect checkout statistics.
	var dataBytesRecycled int64
	var dataBytesTransferred int64

	// First, see if we can extract the blobs from the old manifest. This is the preferred option
	// because it avoids both network transfers and recompression. Note that we do not request
	// blobs from the backend under any circumstances to avoid creating a blob existence oracle.
	for _, oldManifestEntry := range oldManifest.GetContents() {
		if hash, ok := plumbing.FromHex(oldManifestEntry.GetGitHash()); ok {
			if manifestEntry, found := blobsNeeded[hash]; found {
				manifestEntry.Reset()
				proto.Merge(manifestEntry, oldManifestEntry)
				dataBytesRecycled += oldManifestEntry.GetOriginalSize()
				delete(blobsNeeded, hash)
			}
		}
	}

	// Second, fill the manifest entries with data from the git checkout we just made.
	// This will only succeed if a `blob:none` filter isn't supported and we got a full
	// clone despite asking for a partial clone.
	for hash, manifestEntry := range blobsNeeded {
		if err := readGitBlob(repo, hash, manifestEntry); err == nil {
			dataBytesTransferred += manifestEntry.GetOriginalSize()
			delete(blobsNeeded, hash)
		}
	}

	// Third, if we still don't have data for some manifest entries, re-establish a git transport
	// and request the missing blobs (only) from the server.
	if len(blobsNeeded) > 0 {
		client, err := transport.Get(parsedRepoURL.Scheme)
		if err != nil {
			return nil, fmt.Errorf("git transport: %w", err)
		}

		endpoint, err := transport.NewEndpoint(repoURL)
		if err != nil {
			return nil, fmt.Errorf("git endpoint: %w", err)
		}

		session, err := client.NewSession(storer, endpoint, nil)
		if err != nil {
			return nil, fmt.Errorf("git session: %w", err)
		}

		connection, err := session.Handshake(ctx, transport.UploadPackService)
		if err != nil {
			return nil, fmt.Errorf("git connection: %w", err)
		}
		defer connection.Close()

		if err := connection.Fetch(ctx, &transport.FetchRequest{
			Wants: slices.Collect(maps.Keys(blobsNeeded)),
			Depth: 1,
			// Git CLI behaves like this, even if the wants above are references to blobs.
			Filter: "blob:none",
		}); err != nil && !errors.Is(err, transport.ErrNoChange) {
			return nil, fmt.Errorf("git blob fetch request: %w", err)
		}

		// All remaining blobs should now be available.
		for hash, manifestEntry := range blobsNeeded {
			if err := readGitBlob(repo, hash, manifestEntry); err != nil {
				return nil, err
			}
			dataBytesTransferred += manifestEntry.GetOriginalSize()
			delete(blobsNeeded, hash)
		}
	}

	logc.Printf(ctx,
		"reuse: %s recycled, %s transferred\n",
		datasize.ByteSize(dataBytesRecycled).HR(),
		datasize.ByteSize(dataBytesTransferred).HR(),
	)

	return manifest, nil
}

func readGitBlob(repo *git.Repository, hash plumbing.Hash, entry *Entry) error {
	blob, err := repo.BlobObject(hash)
	if err != nil {
		return fmt.Errorf("git blob %s: %w", hash, err)
	}

	reader, err := blob.Reader()
	if err != nil {
		return fmt.Errorf("git blob open: %w", err)
	}
	defer reader.Close()

	data, err := io.ReadAll(reader)
	if err != nil {
		return fmt.Errorf("git blob read: %w", err)
	}

	switch entry.GetType() {
	case Type_InlineFile, Type_Symlink:
		// okay
	default:
		panic(fmt.Errorf("readGitBlob encountered invalid entry: %v, %v",
			entry.GetType(), entry.GetTransform()))
	}

	entry.Data = data
	entry.Transform = Transform_Identity.Enum()
	entry.OriginalSize = proto.Int64(blob.Size)
	entry.CompressedSize = proto.Int64(blob.Size)
	return nil
}
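
A minimal sketch of how a caller might drive FetchRepository, assuming it lives in the same package. The function name refreshSite and the choice of "refs/heads/main" are illustrative only; the branch argument is assumed to be a fully qualified reference name because it is passed verbatim to plumbing.ReferenceName above, and threading the previous manifest back in is what lets unchanged blobs be recycled instead of re-transferred.

// refreshSite is a hypothetical caller, not part of this file. It assumes it
// lives in package git_pages so it can use FetchRepository and the generated
// Manifest type directly.
func refreshSite(ctx context.Context, repoURL string, oldManifest *Manifest) (*Manifest, error) {
	// `branch` is handed to plumbing.ReferenceName unchanged, so a fully
	// qualified reference name is assumed here.
	const branch = "refs/heads/main"

	// Passing the previous manifest (nil on the first run) lets FetchRepository
	// recycle blob data from it; GetContents on a nil *Manifest returns nil,
	// and ranging over a nil map is a no-op, so nil is safe.
	manifest, err := FetchRepository(ctx, repoURL, branch, oldManifest)
	if err != nil {
		return nil, fmt.Errorf("refresh %s: %w", repoURL, err)
	}
	return manifest, nil
}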