// Forked from whitequark.org/git-pages, with modifications for tangled.
1package git_pages
2
3import (
4 "context"
5 "errors"
6 "fmt"
7 "io"
8 "maps"
9 "net/url"
10 "os"
11 "slices"
12
13 "github.com/c2h5oh/datasize"
14 "github.com/go-git/go-billy/v6/osfs"
15 "github.com/go-git/go-git/v6"
16 "github.com/go-git/go-git/v6/plumbing"
17 "github.com/go-git/go-git/v6/plumbing/cache"
18 "github.com/go-git/go-git/v6/plumbing/filemode"
19 "github.com/go-git/go-git/v6/plumbing/object"
20 "github.com/go-git/go-git/v6/plumbing/protocol/packp"
21 "github.com/go-git/go-git/v6/plumbing/transport"
22 "github.com/go-git/go-git/v6/storage/filesystem"
23 "google.golang.org/protobuf/proto"
24)
25
26func FetchRepository(
27 ctx context.Context, repoURL string, branch string, oldManifest *Manifest,
28) (
29 *Manifest, error,
30) {
31 span, ctx := ObserveFunction(ctx, "FetchRepository",
32 "git.repository", repoURL, "git.branch", branch)
33 defer span.Finish()
34
35 parsedRepoURL, err := url.Parse(repoURL)
36 if err != nil {
37 return nil, fmt.Errorf("URL parse: %w", err)
38 }
39
40 var repo *git.Repository
41 var storer *filesystem.Storage
42 for _, filter := range []packp.Filter{packp.FilterBlobNone(), packp.Filter("")} {
43 var tempDir string
44 tempDir, err = os.MkdirTemp("", "fetchRepo")
45 if err != nil {
46 return nil, fmt.Errorf("mkdtemp: %w", err)
47 }
48 defer os.RemoveAll(tempDir)
49
50 storer = filesystem.NewStorageWithOptions(
51 osfs.New(tempDir, osfs.WithBoundOS()),
52 cache.NewObjectLRUDefault(),
53 filesystem.Options{
54 ExclusiveAccess: true,
55 LargeObjectThreshold: int64(config.Limits.GitLargeObjectThreshold.Bytes()),
56 },
57 )
58 repo, err = git.CloneContext(ctx, storer, nil, &git.CloneOptions{
59 Bare: true,
60 URL: repoURL,
61 ReferenceName: plumbing.ReferenceName(branch),
62 SingleBranch: true,
63 Depth: 1,
64 Tags: git.NoTags,
65 Filter: filter,
66 })
67 if err != nil {
68 logc.Printf(ctx, "clone err: %s %s filter=%q\n", repoURL, branch, filter)
69 continue
70 } else {
71 logc.Printf(ctx, "clone ok: %s %s filter=%q\n", repoURL, branch, filter)
72 break
73 }
74 }
75 if err != nil {
76 return nil, fmt.Errorf("git clone: %w", err)
77 }
78
79 ref, err := repo.Head()
80 if err != nil {
81 return nil, fmt.Errorf("git head: %w", err)
82 }
83
84 commit, err := repo.CommitObject(ref.Hash())
85 if err != nil {
86 return nil, fmt.Errorf("git commit: %w", err)
87 }
88
89 tree, err := repo.TreeObject(commit.TreeHash)
90 if err != nil {
91 return nil, fmt.Errorf("git tree: %w", err)
92 }
93
94 walker := object.NewTreeWalker(tree, true, make(map[plumbing.Hash]bool))
95 defer walker.Close()
96
97 // Create a manifest for the tree object corresponding to `branch`, but do not populate it
98 // with data yet; instead, record all the blobs we'll need.
99 manifest := &Manifest{
100 RepoUrl: proto.String(repoURL),
101 Branch: proto.String(branch),
102 Commit: proto.String(ref.Hash().String()),
103 Contents: map[string]*Entry{
104 "": {Type: Type_Directory.Enum()},
105 },
106 }
107 blobsNeeded := map[plumbing.Hash]*Entry{}
108 for {
109 name, entry, err := walker.Next()
110 if err == io.EOF {
111 break
112 } else if err != nil {
113 return nil, fmt.Errorf("git walker: %w", err)
114 } else {
115 manifestEntry := &Entry{}
116 if existingManifestEntry, found := blobsNeeded[entry.Hash]; found {
117 // If the same blob is present twice, we only need to fetch it once (and both
118 // instances will alias the same `Entry` structure in the manifest).
119 manifestEntry = existingManifestEntry
120 } else if entry.Mode.IsFile() {
121 blobsNeeded[entry.Hash] = manifestEntry
122 if entry.Mode == filemode.Symlink {
123 manifestEntry.Type = Type_Symlink.Enum()
124 } else {
125 manifestEntry.Type = Type_InlineFile.Enum()
126 }
127 manifestEntry.GitHash = proto.String(entry.Hash.String())
128 } else if entry.Mode == filemode.Dir {
129 manifestEntry.Type = Type_Directory.Enum()
130 } else {
131 AddProblem(manifest, name, "unsupported mode %#o", entry.Mode)
132 continue
133 }
134 manifest.Contents[name] = manifestEntry
135 }
136 }
137
138 // Collect checkout statistics.
139 var dataBytesFromOldManifest int64
140 var dataBytesFromGitCheckout int64
141 var dataBytesFromGitTransport int64
142
143 // First, see if we can extract the blobs from the old manifest. This is the preferred option
144 // because it avoids both network transfers and recompression. Note that we do not request
145 // blobs from the backend under any circumstances to avoid creating a blob existence oracle.
146 for _, oldManifestEntry := range oldManifest.GetContents() {
147 if hash, ok := plumbing.FromHex(oldManifestEntry.GetGitHash()); ok {
148 if manifestEntry, found := blobsNeeded[hash]; found {
149 manifestEntry.Reset()
150 proto.Merge(manifestEntry, oldManifestEntry)
151 dataBytesFromOldManifest += oldManifestEntry.GetOriginalSize()
152 delete(blobsNeeded, hash)
153 }
154 }
155 }
156
157 // Second, fill the manifest entries with data from the git checkout we just made.
158 // This will only succeed if a `blob:none` filter isn't supported and we got a full
159 // clone despite asking for a partial clone.
160 for hash, manifestEntry := range blobsNeeded {
161 if err := readGitBlob(repo, hash, manifestEntry); err == nil {
162 dataBytesFromGitCheckout += manifestEntry.GetOriginalSize()
163 delete(blobsNeeded, hash)
164 }
165 }
166
167 // Third, if we still don't have data for some manifest entries, re-establish a git transport
168 // and request the missing blobs (only) from the server.
169 if len(blobsNeeded) > 0 {
170 client, err := transport.Get(parsedRepoURL.Scheme)
171 if err != nil {
172 return nil, fmt.Errorf("git transport: %w", err)
173 }
174
175 endpoint, err := transport.NewEndpoint(repoURL)
176 if err != nil {
177 return nil, fmt.Errorf("git endpoint: %w", err)
178 }
179
180 session, err := client.NewSession(storer, endpoint, nil)
181 if err != nil {
182 return nil, fmt.Errorf("git session: %w", err)
183 }
184
185 connection, err := session.Handshake(ctx, transport.UploadPackService)
186 if err != nil {
187 return nil, fmt.Errorf("git connection: %w", err)
188 }
189 defer connection.Close()
190
191 if err := connection.Fetch(ctx, &transport.FetchRequest{
192 Wants: slices.Collect(maps.Keys(blobsNeeded)),
193 Depth: 1,
194 // Git CLI behaves like this, even if the wants above are references to blobs.
195 Filter: "blob:none",
196 }); err != nil && !errors.Is(err, transport.ErrNoChange) {
197 return nil, fmt.Errorf("git blob fetch request: %w", err)
198 }
199
200 // All remaining blobs should now be available.
201 for hash, manifestEntry := range blobsNeeded {
202 if err := readGitBlob(repo, hash, manifestEntry); err != nil {
203 return nil, err
204 }
205 dataBytesFromGitTransport += manifestEntry.GetOriginalSize()
206 delete(blobsNeeded, hash)
207 }
208 }
209
210 logc.Printf(ctx,
211 "fetch: %s from old manifest, %s from git checkout, %s from git transport\n",
212 datasize.ByteSize(dataBytesFromOldManifest).HR(),
213 datasize.ByteSize(dataBytesFromGitCheckout).HR(),
214 datasize.ByteSize(dataBytesFromGitTransport).HR(),
215 )
216
217 return manifest, nil
218}
219
220func readGitBlob(repo *git.Repository, hash plumbing.Hash, entry *Entry) error {
221 blob, err := repo.BlobObject(hash)
222 if err != nil {
223 return fmt.Errorf("git blob %s: %w", hash, err)
224 }
225
226 reader, err := blob.Reader()
227 if err != nil {
228 return fmt.Errorf("git blob open: %w", err)
229 }
230 defer reader.Close()
231
232 data, err := io.ReadAll(reader)
233 if err != nil {
234 return fmt.Errorf("git blob read: %w", err)
235 }
236
237 switch entry.GetType() {
238 case Type_InlineFile, Type_Symlink:
239 // okay
240 default:
241 panic(fmt.Errorf("readGitBlob encountered invalid entry: %v, %v",
242 entry.GetType(), entry.GetTransform()))
243 }
244
245 entry.Data = data
246 entry.Transform = Transform_Identity.Enum()
247 entry.OriginalSize = proto.Int64(blob.Size)
248 entry.CompressedSize = proto.Int64(blob.Size)
249 return nil
250}