[mirror] Scalable static site server for Git forges (like GitHub Pages)
1package git_pages
2
3import (
4 "context"
5 "errors"
6 "fmt"
7 "io"
8 "maps"
9 "net/url"
10 "os"
11 "slices"
12
13 "github.com/c2h5oh/datasize"
14 "github.com/go-git/go-billy/v6/osfs"
15 "github.com/go-git/go-git/v6"
16 "github.com/go-git/go-git/v6/plumbing"
17 "github.com/go-git/go-git/v6/plumbing/cache"
18 "github.com/go-git/go-git/v6/plumbing/filemode"
19 "github.com/go-git/go-git/v6/plumbing/object"
20 "github.com/go-git/go-git/v6/plumbing/protocol/packp"
21 "github.com/go-git/go-git/v6/plumbing/transport"
22 "github.com/go-git/go-git/v6/storage/filesystem"
23 "google.golang.org/protobuf/proto"
24)
25
26var ErrRepositoryTooLarge = errors.New("repository too large")
27
// FetchRepository clones `branch` of the repository at `repoURL` and builds a
// Manifest describing its tree. Blob data is obtained, in order of preference:
// (1) recycled from `oldManifest` entries with matching git hashes, (2) read
// directly from the clone (only possible when the server ignored our partial
// clone filter and sent a full pack), or (3) requested individually from the
// remote over a fresh git transport session. Returns an error wrapping
// ErrRepositoryTooLarge if the transferred data exceeds the configured limit.
func FetchRepository(
	ctx context.Context, repoURL string, branch string, oldManifest *Manifest,
) (
	*Manifest, error,
) {
	span, ctx := ObserveFunction(ctx, "FetchRepository",
		"git.repository", repoURL, "git.branch", branch)
	defer span.Finish()

	parsedRepoURL, err := url.Parse(repoURL)
	if err != nil {
		return nil, fmt.Errorf("URL parse: %w", err)
	}

	// Try a partial (blob:none) clone first to minimize transfer; fall back to
	// an unfiltered clone for servers that do not support the filter. `err`
	// deliberately survives the loop: it is nil iff some attempt succeeded.
	var repo *git.Repository
	var storer *filesystem.Storage
	for _, filter := range []packp.Filter{packp.FilterBlobNone(), packp.Filter("")} {
		var tempDir string
		if tempDir, err = os.MkdirTemp("", "fetchRepo"); err != nil {
			return nil, fmt.Errorf("mkdtemp: %w", err)
		}
		// Function-scoped on purpose (not per-iteration): `storer` below keeps
		// reading from tempDir for the rest of this function.
		defer os.RemoveAll(tempDir)

		storer = filesystem.NewStorageWithOptions(
			osfs.New(tempDir, osfs.WithBoundOS()),
			cache.NewObjectLRUDefault(),
			filesystem.Options{
				ExclusiveAccess: true,
				LargeObjectThreshold: int64(config.Limits.GitLargeObjectThreshold.Bytes()),
			},
		)
		// Bare, shallow, single-branch clone: we only need one commit's tree.
		repo, err = git.CloneContext(ctx, storer, nil, &git.CloneOptions{
			Bare: true,
			URL: repoURL,
			ReferenceName: plumbing.NewBranchReferenceName(branch),
			SingleBranch: true,
			Depth: 1,
			Tags: git.NoTags,
			Filter: filter,
		})
		if err != nil {
			logc.Printf(ctx, "clone err: %s %s filter=%q\n", repoURL, branch, filter)
			continue
		} else {
			logc.Printf(ctx, "clone ok: %s %s filter=%q\n", repoURL, branch, filter)
			break
		}
	}
	if err != nil {
		return nil, fmt.Errorf("git clone: %w", err)
	}

	// Resolve HEAD -> commit -> root tree of the cloned branch.
	ref, err := repo.Head()
	if err != nil {
		return nil, fmt.Errorf("git head: %w", err)
	}

	commit, err := repo.CommitObject(ref.Hash())
	if err != nil {
		return nil, fmt.Errorf("git commit: %w", err)
	}

	tree, err := repo.TreeObject(commit.TreeHash)
	if err != nil {
		return nil, fmt.Errorf("git tree: %w", err)
	}

	// Recursive walk over the whole tree; the seen-map deduplicates subtrees.
	walker := object.NewTreeWalker(tree, true, make(map[plumbing.Hash]bool))
	defer walker.Close()

	// Create a manifest for the tree object corresponding to `branch`, but do not populate it
	// with data yet; instead, record all the blobs we'll need.
	manifest := NewManifest()
	manifest.RepoUrl = proto.String(repoURL)
	manifest.Branch = proto.String(branch)
	manifest.Commit = proto.String(ref.Hash().String())
	// Maps blob hash -> the manifest Entry that will receive its data.
	blobsNeeded := map[plumbing.Hash]*Entry{}
	for {
		name, entry, err := walker.Next()
		if err == io.EOF {
			break
		} else if err != nil {
			return nil, fmt.Errorf("git walker: %w", err)
		} else {
			manifestEntry := &Entry{}
			if existingManifestEntry, found := blobsNeeded[entry.Hash]; found {
				// If the same blob is present twice, we only need to fetch it once (and both
				// instances will alias the same `Entry` structure in the manifest).
				manifestEntry = existingManifestEntry
			} else if entry.Mode.IsFile() {
				blobsNeeded[entry.Hash] = manifestEntry
				if entry.Mode == filemode.Symlink {
					manifestEntry.Type = Type_Symlink.Enum()
				} else {
					manifestEntry.Type = Type_InlineFile.Enum()
				}
				manifestEntry.GitHash = proto.String(entry.Hash.String())
			} else if entry.Mode == filemode.Dir {
				manifestEntry.Type = Type_Directory.Enum()
			} else {
				// E.g. submodules (gitlinks); recorded as a problem, not fatal.
				AddProblem(manifest, name, "unsupported mode %#o", entry.Mode)
				continue
			}
			manifest.Contents[name] = manifestEntry
		}
	}

	// Collect checkout statistics.
	var dataBytesRecycled int64
	var dataBytesTransferred int64

	// First, see if we can extract the blobs from the old manifest. This is the preferred option
	// because it avoids both network transfers and recompression. Note that we do not request
	// blobs from the backend under any circumstances to avoid creating a blob existence oracle.
	for _, oldManifestEntry := range oldManifest.GetContents() {
		if hash, ok := plumbing.FromHex(oldManifestEntry.GetGitHash()); ok {
			if manifestEntry, found := blobsNeeded[hash]; found {
				// Copy the old entry wholesale (data, transform, sizes) into the
				// aliased new entry.
				manifestEntry.Reset()
				proto.Merge(manifestEntry, oldManifestEntry)
				dataBytesRecycled += oldManifestEntry.GetOriginalSize()
				delete(blobsNeeded, hash)
			}
		}
	}

	// Second, fill the manifest entries with data from the git checkout we just made.
	// This will only succeed if a `blob:none` filter isn't supported and we got a full
	// clone despite asking for a partial clone.
	// Deleting the current key during range is safe per the Go spec. Errors other
	// than ErrRepositoryTooLarge are expected here (blob absent from a partial
	// clone) and the blob is simply left in blobsNeeded for the next step.
	for hash, manifestEntry := range blobsNeeded {
		if err := readGitBlob(repo, hash, manifestEntry, &dataBytesTransferred); err == nil {
			delete(blobsNeeded, hash)
		} else if errors.Is(err, ErrRepositoryTooLarge) {
			return nil, err
		}
	}

	// Third, if we still don't have data for some manifest entries, re-establish a git transport
	// and request the missing blobs (only) from the server.
	if len(blobsNeeded) > 0 {
		client, err := transport.Get(parsedRepoURL.Scheme)
		if err != nil {
			return nil, fmt.Errorf("git transport: %w", err)
		}

		endpoint, err := transport.NewEndpoint(repoURL)
		if err != nil {
			return nil, fmt.Errorf("git endpoint: %w", err)
		}

		// Reuse the clone's storer so the fetched pack lands in the same object
		// database that readGitBlob reads from below.
		session, err := client.NewSession(storer, endpoint, nil)
		if err != nil {
			return nil, fmt.Errorf("git session: %w", err)
		}

		connection, err := session.Handshake(ctx, transport.UploadPackService)
		if err != nil {
			return nil, fmt.Errorf("git connection: %w", err)
		}
		defer connection.Close()

		if err := connection.Fetch(ctx, &transport.FetchRequest{
			Wants: slices.Collect(maps.Keys(blobsNeeded)),
			Depth: 1,
			// Git CLI behaves like this, even if the wants above are references to blobs.
			Filter: "blob:none",
		}); err != nil && !errors.Is(err, transport.ErrNoChange) {
			return nil, fmt.Errorf("git blob fetch request: %w", err)
		}

		// All remaining blobs should now be available.
		for hash, manifestEntry := range blobsNeeded {
			if err := readGitBlob(repo, hash, manifestEntry, &dataBytesTransferred); err != nil {
				return nil, err
			}
			delete(blobsNeeded, hash)
		}
	}

	logc.Printf(ctx,
		"reuse: %s recycled, %s transferred\n",
		datasize.ByteSize(dataBytesRecycled).HR(),
		datasize.ByteSize(dataBytesTransferred).HR(),
	)

	return manifest, nil
}
214
215func readGitBlob(
216 repo *git.Repository, hash plumbing.Hash, entry *Entry, bytesTransferred *int64,
217) error {
218 blob, err := repo.BlobObject(hash)
219 if err != nil {
220 return fmt.Errorf("git blob %s: %w", hash, err)
221 }
222
223 reader, err := blob.Reader()
224 if err != nil {
225 return fmt.Errorf("git blob open: %w", err)
226 }
227 defer reader.Close()
228
229 data, err := io.ReadAll(reader)
230 if err != nil {
231 return fmt.Errorf("git blob read: %w", err)
232 }
233
234 switch entry.GetType() {
235 case Type_InlineFile, Type_Symlink:
236 // okay
237 default:
238 panic(fmt.Errorf("readGitBlob encountered invalid entry: %v, %v",
239 entry.GetType(), entry.GetTransform()))
240 }
241
242 entry.Data = data
243 entry.Transform = Transform_Identity.Enum()
244 entry.OriginalSize = proto.Int64(blob.Size)
245 entry.CompressedSize = proto.Int64(blob.Size)
246
247 *bytesTransferred += blob.Size
248 if uint64(*bytesTransferred) > config.Limits.MaxSiteSize.Bytes() {
249 return fmt.Errorf("%w: fetch exceeds %s limit",
250 ErrRepositoryTooLarge,
251 config.Limits.MaxSiteSize.HR(),
252 )
253 }
254
255 return nil
256}