package git_pages

import (
	"context"
	"errors"
	"fmt"
	"io"
	"maps"
	"net/url"
	"os"
	"slices"

	"github.com/c2h5oh/datasize"
	"github.com/go-git/go-billy/v6/osfs"
	"github.com/go-git/go-git/v6"
	"github.com/go-git/go-git/v6/plumbing"
	"github.com/go-git/go-git/v6/plumbing/cache"
	"github.com/go-git/go-git/v6/plumbing/filemode"
	"github.com/go-git/go-git/v6/plumbing/object"
	"github.com/go-git/go-git/v6/plumbing/protocol/packp"
	"github.com/go-git/go-git/v6/plumbing/transport"
	"github.com/go-git/go-git/v6/storage/filesystem"
	"google.golang.org/protobuf/proto"
)

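// FetchRepository performs a shallow clone of `branch` from `repoURL` and builds a Manifest
// describing the files in its tree. Blob data is reused from `oldManifest` where the hashes
// match; any blobs still missing afterwards are fetched directly from the remote.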
func FetchRepository(
	ctx context.Context, repoURL string, branch string, oldManifest *Manifest,
) (
	*Manifest, error,
) {
	span, ctx := ObserveFunction(ctx, "FetchRepository",
		"git.repository", repoURL, "git.branch", branch)
	defer span.Finish()

	parsedRepoURL, err := url.Parse(repoURL)
	if err != nil {
		return nil, fmt.Errorf("URL parse: %w", err)
	}

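	// Try a partial clone (`blob:none`) first to keep the transfer small; if the forge
	// rejects the filter, fall back to an unfiltered shallow clone.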
	var repo *git.Repository
	var storer *filesystem.Storage
	for _, filter := range []packp.Filter{packp.FilterBlobNone(), packp.Filter("")} {
		var tempDir string
		if tempDir, err = os.MkdirTemp("", "fetchRepo"); err != nil {
			return nil, fmt.Errorf("mkdtemp: %w", err)
		}
		defer os.RemoveAll(tempDir)

		storer = filesystem.NewStorageWithOptions(
			osfs.New(tempDir, osfs.WithBoundOS()),
			cache.NewObjectLRUDefault(),
			filesystem.Options{
				ExclusiveAccess:      true,
				LargeObjectThreshold: int64(config.Limits.GitLargeObjectThreshold.Bytes()),
			},
		)
		repo, err = git.CloneContext(ctx, storer, nil, &git.CloneOptions{
			Bare:          true,
			URL:           repoURL,
			ReferenceName: plumbing.ReferenceName(branch),
			SingleBranch:  true,
			Depth:         1,
			Tags:          git.NoTags,
			Filter:        filter,
		})
		if err != nil {
			logc.Printf(ctx, "clone err: %s %s filter=%q: %s\n", repoURL, branch, filter, err)
			continue
		}
		logc.Printf(ctx, "clone ok: %s %s filter=%q\n", repoURL, branch, filter)
		break
	}
	if err != nil {
		return nil, fmt.Errorf("git clone: %w", err)
	}

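	// Resolve the commit at the fetched branch head and the tree it points to.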
	ref, err := repo.Head()
	if err != nil {
		return nil, fmt.Errorf("git head: %w", err)
	}

	commit, err := repo.CommitObject(ref.Hash())
	if err != nil {
		return nil, fmt.Errorf("git commit: %w", err)
	}

	tree, err := repo.TreeObject(commit.TreeHash)
	if err != nil {
		return nil, fmt.Errorf("git tree: %w", err)
	}

	walker := object.NewTreeWalker(tree, true, make(map[plumbing.Hash]bool))
	defer walker.Close()

	// Create a manifest for the tree object corresponding to `branch`, but do not populate it
	// with data yet; instead, record all the blobs we'll need.
	manifest := NewManifest()
	manifest.RepoUrl = proto.String(repoURL)
	manifest.Branch = proto.String(branch)
	manifest.Commit = proto.String(ref.Hash().String())
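	// blobsNeeded maps each blob hash to the manifest entry that still needs its data.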
	blobsNeeded := map[plumbing.Hash]*Entry{}
	for {
		name, entry, err := walker.Next()
		if err == io.EOF {
			break
		} else if err != nil {
			return nil, fmt.Errorf("git walker: %w", err)
		} else {
			manifestEntry := &Entry{}
			if existingManifestEntry, found := blobsNeeded[entry.Hash]; found {
				// If the same blob is present twice, we only need to fetch it once (and both
				// instances will alias the same `Entry` structure in the manifest).
				manifestEntry = existingManifestEntry
			} else if entry.Mode.IsFile() {
				blobsNeeded[entry.Hash] = manifestEntry
				if entry.Mode == filemode.Symlink {
					manifestEntry.Type = Type_Symlink.Enum()
				} else {
					manifestEntry.Type = Type_InlineFile.Enum()
				}
				manifestEntry.GitHash = proto.String(entry.Hash.String())
			} else if entry.Mode == filemode.Dir {
				manifestEntry.Type = Type_Directory.Enum()
			} else {
				AddProblem(manifest, name, "unsupported mode %#o", entry.Mode)
				continue
			}
			manifest.Contents[name] = manifestEntry
		}
	}

	// Collect checkout statistics.
	var dataBytesRecycled int64
	var dataBytesTransferred int64

	// First, see if we can extract the blobs from the old manifest. This is the preferred option
	// because it avoids both network transfers and recompression. Note that we do not request
	// blobs from the backend under any circumstances to avoid creating a blob existence oracle.
	for _, oldManifestEntry := range oldManifest.GetContents() {
		if hash, ok := plumbing.FromHex(oldManifestEntry.GetGitHash()); ok {
			if manifestEntry, found := blobsNeeded[hash]; found {
				manifestEntry.Reset()
				proto.Merge(manifestEntry, oldManifestEntry)
				dataBytesRecycled += oldManifestEntry.GetOriginalSize()
				delete(blobsNeeded, hash)
			}
		}
	}

	// Second, fill the manifest entries with data from the clone we just made. This only
	// succeeds if the `blob:none` filter wasn't honored (or we fell back to an unfiltered
	// clone), so the blobs are already present locally.
	for hash, manifestEntry := range blobsNeeded {
		if err := readGitBlob(repo, hash, manifestEntry); err == nil {
			dataBytesTransferred += manifestEntry.GetOriginalSize()
			delete(blobsNeeded, hash)
		}
	}

	// Third, if we still don't have data for some manifest entries, re-establish a git transport
	// and request the missing blobs (only) from the server.
	if len(blobsNeeded) > 0 {
		client, err := transport.Get(parsedRepoURL.Scheme)
		if err != nil {
			return nil, fmt.Errorf("git transport: %w", err)
		}

		endpoint, err := transport.NewEndpoint(repoURL)
		if err != nil {
			return nil, fmt.Errorf("git endpoint: %w", err)
		}

		session, err := client.NewSession(storer, endpoint, nil)
		if err != nil {
			return nil, fmt.Errorf("git session: %w", err)
		}

		connection, err := session.Handshake(ctx, transport.UploadPackService)
		if err != nil {
			return nil, fmt.Errorf("git connection: %w", err)
		}
		defer connection.Close()

		if err := connection.Fetch(ctx, &transport.FetchRequest{
			Wants: slices.Collect(maps.Keys(blobsNeeded)),
			Depth: 1,
			// The Git CLI behaves the same way, even when the wants above are blob hashes.
			Filter: "blob:none",
		}); err != nil && !errors.Is(err, transport.ErrNoChange) {
			return nil, fmt.Errorf("git blob fetch request: %w", err)
		}

		// All remaining blobs should now be available.
		for hash, manifestEntry := range blobsNeeded {
			if err := readGitBlob(repo, hash, manifestEntry); err != nil {
				return nil, err
			}
			dataBytesTransferred += manifestEntry.GetOriginalSize()
			delete(blobsNeeded, hash)
		}
	}

	logc.Printf(ctx,
		"reuse: %s recycled, %s transferred\n",
		datasize.ByteSize(dataBytesRecycled).HR(),
		datasize.ByteSize(dataBytesTransferred).HR(),
	)

	return manifest, nil
}

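// readGitBlob reads the blob `hash` from `repo` and stores its (uncompressed) contents into
// `entry`, recording the identity transform and the blob's size.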
func readGitBlob(repo *git.Repository, hash plumbing.Hash, entry *Entry) error {
	blob, err := repo.BlobObject(hash)
	if err != nil {
		return fmt.Errorf("git blob %s: %w", hash, err)
	}

	reader, err := blob.Reader()
	if err != nil {
		return fmt.Errorf("git blob open: %w", err)
	}
	defer reader.Close()

	data, err := io.ReadAll(reader)
	if err != nil {
		return fmt.Errorf("git blob read: %w", err)
	}

	switch entry.GetType() {
	case Type_InlineFile, Type_Symlink:
		// okay
	default:
		panic(fmt.Errorf("readGitBlob encountered invalid entry: %v, %v",
			entry.GetType(), entry.GetTransform()))
	}

	entry.Data = data
	entry.Transform = Transform_Identity.Enum()
	entry.OriginalSize = proto.Int64(blob.Size)
	entry.CompressedSize = proto.Int64(blob.Size)
	return nil
}