[mirror] Scalable static site server for Git forges (like GitHub Pages)
at main 7.6 kB view raw
1package git_pages 2 3import ( 4 "context" 5 "errors" 6 "fmt" 7 "io" 8 "maps" 9 "net/url" 10 "os" 11 "slices" 12 13 "github.com/c2h5oh/datasize" 14 "github.com/go-git/go-billy/v6/osfs" 15 "github.com/go-git/go-git/v6" 16 "github.com/go-git/go-git/v6/plumbing" 17 "github.com/go-git/go-git/v6/plumbing/cache" 18 "github.com/go-git/go-git/v6/plumbing/filemode" 19 "github.com/go-git/go-git/v6/plumbing/object" 20 "github.com/go-git/go-git/v6/plumbing/protocol/packp" 21 "github.com/go-git/go-git/v6/plumbing/transport" 22 "github.com/go-git/go-git/v6/storage/filesystem" 23 "google.golang.org/protobuf/proto" 24) 25 26var ErrRepositoryTooLarge = errors.New("repository too large") 27 28func FetchRepository( 29 ctx context.Context, repoURL string, branch string, oldManifest *Manifest, 30) ( 31 *Manifest, error, 32) { 33 span, ctx := ObserveFunction(ctx, "FetchRepository", 34 "git.repository", repoURL, "git.branch", branch) 35 defer span.Finish() 36 37 parsedRepoURL, err := url.Parse(repoURL) 38 if err != nil { 39 return nil, fmt.Errorf("URL parse: %w", err) 40 } 41 42 var repo *git.Repository 43 var storer *filesystem.Storage 44 for _, filter := range []packp.Filter{packp.FilterBlobNone(), packp.Filter("")} { 45 var tempDir string 46 if tempDir, err = os.MkdirTemp("", "fetchRepo"); err != nil { 47 return nil, fmt.Errorf("mkdtemp: %w", err) 48 } 49 defer os.RemoveAll(tempDir) 50 51 storer = filesystem.NewStorageWithOptions( 52 osfs.New(tempDir, osfs.WithBoundOS()), 53 cache.NewObjectLRUDefault(), 54 filesystem.Options{ 55 ExclusiveAccess: true, 56 LargeObjectThreshold: int64(config.Limits.GitLargeObjectThreshold.Bytes()), 57 }, 58 ) 59 repo, err = git.CloneContext(ctx, storer, nil, &git.CloneOptions{ 60 Bare: true, 61 URL: repoURL, 62 ReferenceName: plumbing.NewBranchReferenceName(branch), 63 SingleBranch: true, 64 Depth: 1, 65 Tags: git.NoTags, 66 Filter: filter, 67 }) 68 if err != nil { 69 logc.Printf(ctx, "clone err: %s %s filter=%q\n", repoURL, branch, filter) 70 continue 71 } else { 72 logc.Printf(ctx, "clone ok: %s %s filter=%q\n", repoURL, branch, filter) 73 break 74 } 75 } 76 if err != nil { 77 return nil, fmt.Errorf("git clone: %w", err) 78 } 79 80 ref, err := repo.Head() 81 if err != nil { 82 return nil, fmt.Errorf("git head: %w", err) 83 } 84 85 commit, err := repo.CommitObject(ref.Hash()) 86 if err != nil { 87 return nil, fmt.Errorf("git commit: %w", err) 88 } 89 90 tree, err := repo.TreeObject(commit.TreeHash) 91 if err != nil { 92 return nil, fmt.Errorf("git tree: %w", err) 93 } 94 95 walker := object.NewTreeWalker(tree, true, make(map[plumbing.Hash]bool)) 96 defer walker.Close() 97 98 // Create a manifest for the tree object corresponding to `branch`, but do not populate it 99 // with data yet; instead, record all the blobs we'll need. 100 manifest := NewManifest() 101 manifest.RepoUrl = proto.String(repoURL) 102 manifest.Branch = proto.String(branch) 103 manifest.Commit = proto.String(ref.Hash().String()) 104 blobsNeeded := map[plumbing.Hash]*Entry{} 105 for { 106 name, entry, err := walker.Next() 107 if err == io.EOF { 108 break 109 } else if err != nil { 110 return nil, fmt.Errorf("git walker: %w", err) 111 } else { 112 manifestEntry := &Entry{} 113 if existingManifestEntry, found := blobsNeeded[entry.Hash]; found { 114 // If the same blob is present twice, we only need to fetch it once (and both 115 // instances will alias the same `Entry` structure in the manifest). 116 manifestEntry = existingManifestEntry 117 } else if entry.Mode.IsFile() { 118 blobsNeeded[entry.Hash] = manifestEntry 119 if entry.Mode == filemode.Symlink { 120 manifestEntry.Type = Type_Symlink.Enum() 121 } else { 122 manifestEntry.Type = Type_InlineFile.Enum() 123 } 124 manifestEntry.GitHash = proto.String(entry.Hash.String()) 125 } else if entry.Mode == filemode.Dir { 126 manifestEntry.Type = Type_Directory.Enum() 127 } else { 128 AddProblem(manifest, name, "unsupported mode %#o", entry.Mode) 129 continue 130 } 131 manifest.Contents[name] = manifestEntry 132 } 133 } 134 135 // Collect checkout statistics. 136 var dataBytesRecycled int64 137 var dataBytesTransferred int64 138 139 // First, see if we can extract the blobs from the old manifest. This is the preferred option 140 // because it avoids both network transfers and recompression. Note that we do not request 141 // blobs from the backend under any circumstances to avoid creating a blob existence oracle. 142 for _, oldManifestEntry := range oldManifest.GetContents() { 143 if hash, ok := plumbing.FromHex(oldManifestEntry.GetGitHash()); ok { 144 if manifestEntry, found := blobsNeeded[hash]; found { 145 manifestEntry.Reset() 146 proto.Merge(manifestEntry, oldManifestEntry) 147 dataBytesRecycled += oldManifestEntry.GetOriginalSize() 148 delete(blobsNeeded, hash) 149 } 150 } 151 } 152 153 // Second, fill the manifest entries with data from the git checkout we just made. 154 // This will only succeed if a `blob:none` filter isn't supported and we got a full 155 // clone despite asking for a partial clone. 156 for hash, manifestEntry := range blobsNeeded { 157 if err := readGitBlob(repo, hash, manifestEntry, &dataBytesTransferred); err == nil { 158 delete(blobsNeeded, hash) 159 } else if errors.Is(err, ErrRepositoryTooLarge) { 160 return nil, err 161 } 162 } 163 164 // Third, if we still don't have data for some manifest entries, re-establish a git transport 165 // and request the missing blobs (only) from the server. 166 if len(blobsNeeded) > 0 { 167 client, err := transport.Get(parsedRepoURL.Scheme) 168 if err != nil { 169 return nil, fmt.Errorf("git transport: %w", err) 170 } 171 172 endpoint, err := transport.NewEndpoint(repoURL) 173 if err != nil { 174 return nil, fmt.Errorf("git endpoint: %w", err) 175 } 176 177 session, err := client.NewSession(storer, endpoint, nil) 178 if err != nil { 179 return nil, fmt.Errorf("git session: %w", err) 180 } 181 182 connection, err := session.Handshake(ctx, transport.UploadPackService) 183 if err != nil { 184 return nil, fmt.Errorf("git connection: %w", err) 185 } 186 defer connection.Close() 187 188 if err := connection.Fetch(ctx, &transport.FetchRequest{ 189 Wants: slices.Collect(maps.Keys(blobsNeeded)), 190 Depth: 1, 191 // Git CLI behaves like this, even if the wants above are references to blobs. 192 Filter: "blob:none", 193 }); err != nil && !errors.Is(err, transport.ErrNoChange) { 194 return nil, fmt.Errorf("git blob fetch request: %w", err) 195 } 196 197 // All remaining blobs should now be available. 198 for hash, manifestEntry := range blobsNeeded { 199 if err := readGitBlob(repo, hash, manifestEntry, &dataBytesTransferred); err != nil { 200 return nil, err 201 } 202 delete(blobsNeeded, hash) 203 } 204 } 205 206 logc.Printf(ctx, 207 "reuse: %s recycled, %s transferred\n", 208 datasize.ByteSize(dataBytesRecycled).HR(), 209 datasize.ByteSize(dataBytesTransferred).HR(), 210 ) 211 212 return manifest, nil 213} 214 215func readGitBlob( 216 repo *git.Repository, hash plumbing.Hash, entry *Entry, bytesTransferred *int64, 217) error { 218 blob, err := repo.BlobObject(hash) 219 if err != nil { 220 return fmt.Errorf("git blob %s: %w", hash, err) 221 } 222 223 reader, err := blob.Reader() 224 if err != nil { 225 return fmt.Errorf("git blob open: %w", err) 226 } 227 defer reader.Close() 228 229 data, err := io.ReadAll(reader) 230 if err != nil { 231 return fmt.Errorf("git blob read: %w", err) 232 } 233 234 switch entry.GetType() { 235 case Type_InlineFile, Type_Symlink: 236 // okay 237 default: 238 panic(fmt.Errorf("readGitBlob encountered invalid entry: %v, %v", 239 entry.GetType(), entry.GetTransform())) 240 } 241 242 entry.Data = data 243 entry.Transform = Transform_Identity.Enum() 244 entry.OriginalSize = proto.Int64(blob.Size) 245 entry.CompressedSize = proto.Int64(blob.Size) 246 247 *bytesTransferred += blob.Size 248 if uint64(*bytesTransferred) > config.Limits.MaxSiteSize.Bytes() { 249 return fmt.Errorf("%w: fetch exceeds %s limit", 250 ErrRepositoryTooLarge, 251 config.Limits.MaxSiteSize.HR(), 252 ) 253 } 254 255 return nil 256}