[mirror] Scalable static site server for Git forges (like GitHub Pages)
1//go:generate protoc --go_out=. --go_opt=paths=source_relative schema.proto
2
3package git_pages
4
5import (
6 "bytes"
7 "context"
8 "crypto/sha256"
9 "errors"
10 "fmt"
11 "mime"
12 "net/http"
13 "path"
14 "path/filepath"
15 "strings"
16 "sync"
17 "time"
18
19 "github.com/c2h5oh/datasize"
20 "github.com/go-git/go-git/v6/plumbing"
21 format "github.com/go-git/go-git/v6/plumbing/format/config"
22 "github.com/klauspost/compress/zstd"
23 "github.com/prometheus/client_golang/prometheus"
24 "github.com/prometheus/client_golang/prometheus/promauto"
25 "google.golang.org/protobuf/encoding/protojson"
26 "google.golang.org/protobuf/proto"
27)
28
var (
	// siteCompressionSpaceSaving tracks, per processed site, the relative size
	// reduction achieved by zstd compression: (original - compressed) / original.
	// Observed once per site in CompressFiles; values can exceed 1 only if the
	// "compressed" total somehow ends up negative, so buckets above 1 are
	// effectively headroom. Registered via promauto at package init.
	siteCompressionSpaceSaving = promauto.NewHistogram(prometheus.HistogramOpts{
		Name:    "git_pages_site_compression_space_saving",
		Help:    "Reduction in site size after compression relative to the uncompressed size",
		Buckets: []float64{.01, .025, .05, .1, .25, .5, .75, 1, 1.25, 1.5, 1.75, 2, 2.5, 5, 10},

		// Native histogram settings (used when the scraper supports them).
		NativeHistogramBucketFactor:     1.1,
		NativeHistogramMaxBucketNumber:  100,
		NativeHistogramMinResetDuration: 10 * time.Minute,
	})
)
40
41func NewManifest() *Manifest {
42 return &Manifest{
43 Contents: map[string]*Entry{
44 "": {Type: Type_Directory.Enum()},
45 },
46 }
47}
48
49func IsManifestEmpty(manifest *Manifest) bool {
50 if len(manifest.Contents) > 1 {
51 return false
52 }
53 for name, entry := range manifest.Contents {
54 if name == "" && entry.GetType() == Type_Directory {
55 return true
56 }
57 }
58 panic(fmt.Errorf("malformed manifest %v", manifest))
59}
60
61// Returns `true` if `left` and `right` contain the same files with the same types and data.
62func CompareManifest(left *Manifest, right *Manifest) bool {
63 if len(left.Contents) != len(right.Contents) {
64 return false
65 }
66 for name, leftEntry := range left.Contents {
67 rightEntry := right.Contents[name]
68 if rightEntry == nil {
69 return false
70 }
71 if leftEntry.GetType() != rightEntry.GetType() {
72 return false
73 }
74 if !bytes.Equal(leftEntry.Data, rightEntry.Data) {
75 return false
76 }
77 }
78 return true
79}
80
81func EncodeManifest(manifest *Manifest) (data []byte) {
82 data, err := proto.MarshalOptions{Deterministic: true}.Marshal(manifest)
83 if err != nil {
84 panic(err)
85 }
86 return
87}
88
89func DecodeManifest(data []byte) (manifest *Manifest, err error) {
90 manifest = &Manifest{}
91 err = proto.Unmarshal(data, manifest)
92 return
93}
94
95func NewManifestEntry(type_ Type, data []byte) *Entry {
96 entry := &Entry{}
97 entry.Type = type_.Enum()
98 if data != nil {
99 entry.Data = data
100 entry.Transform = Transform_Identity.Enum()
101 entry.OriginalSize = proto.Int64(int64(len(data)))
102 entry.CompressedSize = proto.Int64(int64(len(data)))
103 }
104 return entry
105}
106
107func AddFile(manifest *Manifest, fileName string, data []byte) *Entry {
108 // Fill in `git_hash` even for files not originating from git using the SHA256 algorithm;
109 // we use this primarily for incremental archive uploads, but when support for git SHA256
110 // repositories is complete, archive uploads and git checkouts will have cross-support for
111 // incremental updates.
112 hasher := plumbing.NewHasher(format.SHA256, plumbing.BlobObject, int64(len(data)))
113 hasher.Write(data)
114 entry := NewManifestEntry(Type_InlineFile, data)
115 entry.GitHash = proto.String(hasher.Sum().String())
116 manifest.Contents[fileName] = entry
117 return entry
118}
119
120func AddSymlink(manifest *Manifest, fileName string, target string) *Entry {
121 if path.IsAbs(target) {
122 AddProblem(manifest, fileName, "absolute symlink: %s", target)
123 return nil
124 } else {
125 entry := NewManifestEntry(Type_Symlink, []byte(target))
126 manifest.Contents[fileName] = entry
127 return entry
128 }
129}
130
131func AddDirectory(manifest *Manifest, dirName string) *Entry {
132 dirName = strings.TrimSuffix(dirName, "/")
133 entry := NewManifestEntry(Type_Directory, nil)
134 manifest.Contents[dirName] = entry
135 return entry
136}
137
138func AddProblem(manifest *Manifest, pathName, format string, args ...any) error {
139 cause := fmt.Sprintf(format, args...)
140 manifest.Problems = append(manifest.Problems, &Problem{
141 Path: proto.String(pathName),
142 Cause: proto.String(cause),
143 })
144 return fmt.Errorf("%s: %s", pathName, cause)
145}
146
147// EnsureLeadingDirectories adds directory entries for any parent directories
148// that are implicitly referenced by files in the manifest but don't have
149// explicit directory entries. (This can be the case if an archive is created
150// via globs rather than including a whole directory.)
151func EnsureLeadingDirectories(manifest *Manifest) {
152 for name := range manifest.Contents {
153 for dir := path.Dir(name); dir != "." && dir != ""; dir = path.Dir(dir) {
154 if _, exists := manifest.Contents[dir]; !exists {
155 AddDirectory(manifest, dir)
156 }
157 }
158 }
159}
160
161func GetProblemReport(manifest *Manifest) []string {
162 var report []string
163 for _, problem := range manifest.Problems {
164 report = append(report,
165 fmt.Sprintf("/%s: %s", problem.GetPath(), problem.GetCause()))
166 }
167 return report
168}
169
170func ManifestJSON(manifest *Manifest) []byte {
171 json, err := protojson.MarshalOptions{
172 Multiline: true,
173 EmitDefaultValues: true,
174 }.Marshal(manifest)
175 if err != nil {
176 panic(err)
177 }
178 return json
179}
180
// ErrSymlinkLoop is returned by ExpandSymlinks when a path still contains
// symlinks after config.Limits.MaxSymlinkDepth expansions.
var ErrSymlinkLoop = errors.New("symbolic link loop")
182
// ExpandSymlinks resolves symlink entries occurring anywhere along `inPath`
// (not just the last component) and returns the fully expanded path.
// It performs at most config.Limits.MaxSymlinkDepth expansions; if the path
// still contains a symlink after that, it returns ErrSymlinkLoop.
func ExpandSymlinks(manifest *Manifest, inPath string) (string, error) {
	var levels uint
again:
	for levels = 0; levels < config.Limits.MaxSymlinkDepth; levels += 1 {
		parts := strings.Split(inPath, "/")
		// Scan every prefix of the path, shortest first, for a symlink entry.
		for i := 1; i <= len(parts); i++ {
			linkPath := path.Join(parts[:i]...)
			entry := manifest.Contents[linkPath]
			if entry != nil && entry.GetType() == Type_Symlink {
				// Splice the link target (relative to the link's directory)
				// in place of the matched prefix, then restart the scan.
				// `continue again` advances the outer loop's depth counter.
				// NOTE(review): path.Join cleans `..` in targets; a target
				// with enough `..` components could yield a path outside the
				// site root — presumably such keys never exist in Contents,
				// so the lookup simply fails. Verify against the serving layer.
				inPath = path.Join(
					path.Dir(linkPath),
					string(entry.Data),
					path.Join(parts[i:]...),
				)
				continue again
			}
		}
		// No symlink found in any prefix: the path is fully expanded.
		break
	}
	// Breaking out early leaves levels < max; exhausting the loop means we hit
	// the depth limit with symlinks still unresolved.
	if levels < config.Limits.MaxSymlinkDepth {
		return inPath, nil
	} else {
		return "", ErrSymlinkLoop
	}
}
208
209// Sniff content type using the same algorithm as `http.ServeContent`.
210func DetectContentType(manifest *Manifest) {
211 for path, entry := range manifest.Contents {
212 if entry.GetType() == Type_Directory || entry.GetType() == Type_Symlink {
213 // no Content-Type
214 } else if entry.GetType() == Type_InlineFile && entry.GetTransform() == Transform_Identity {
215 contentType := mime.TypeByExtension(filepath.Ext(path))
216 if contentType == "" {
217 contentType = http.DetectContentType(entry.Data[:min(512, len(entry.Data))])
218 }
219 entry.ContentType = proto.String(contentType)
220 } else if entry.GetContentType() == "" {
221 panic(fmt.Errorf("DetectContentType encountered invalid entry: %v, %v",
222 entry.GetType(), entry.GetTransform()))
223 }
224 }
225}
226
// The `klauspost/compress/zstd` package recommends reusing a compressor to avoid repeated
// allocations of internal buffers.
// The error from NewWriter is deliberately discarded: it can only arise from
// invalid options, and the options here are fixed at compile time.
var zstdEncoder, _ = zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedBetterCompression))
230
231// Compress contents of inline files.
232func CompressFiles(ctx context.Context, manifest *Manifest) {
233 span, _ := ObserveFunction(ctx, "CompressFiles")
234 defer span.Finish()
235
236 var originalSize int64
237 var compressedSize int64
238 for _, entry := range manifest.Contents {
239 if entry.GetType() == Type_InlineFile && entry.GetTransform() == Transform_Identity {
240 mediaType := getMediaType(entry.GetContentType())
241 if strings.HasPrefix(mediaType, "video/") || strings.HasPrefix(mediaType, "audio/") {
242 continue
243 }
244 compressedData := zstdEncoder.EncodeAll(entry.GetData(),
245 make([]byte, 0, entry.GetOriginalSize()))
246 if int64(len(compressedData)) < entry.GetOriginalSize() {
247 entry.Data = compressedData
248 entry.Transform = Transform_Zstd.Enum()
249 entry.CompressedSize = proto.Int64(int64(len(entry.Data)))
250 }
251 }
252 originalSize += entry.GetOriginalSize()
253 compressedSize += entry.GetCompressedSize()
254 }
255 manifest.OriginalSize = proto.Int64(originalSize)
256 manifest.CompressedSize = proto.Int64(compressedSize)
257
258 if originalSize != 0 {
259 spaceSaving := (float64(originalSize) - float64(compressedSize)) / float64(originalSize)
260 logc.Printf(ctx, "compress: saved %.2f percent (%s to %s)",
261 spaceSaving*100.0,
262 datasize.ByteSize(originalSize).HR(),
263 datasize.ByteSize(compressedSize).HR(),
264 )
265 siteCompressionSpaceSaving.
266 Observe(spaceSaving)
267 }
268}
269
// Apply post-processing steps to the manifest.
// At the moment, there isn't a good way to report errors except to log them on the terminal.
// (Perhaps in the future they could be exposed at `.git-pages/status.txt`?)
//
// Order matters: redirect/header files are parsed first, content types are
// sniffed next (compression needs them to skip media files), and compression
// runs last so the size totals reflect the final entry data.
// Always returns nil at present; the error return is kept for future use.
func PrepareManifest(ctx context.Context, manifest *Manifest) error {
	// Parse Netlify-style `_redirects`.
	if err := ProcessRedirectsFile(manifest); err != nil {
		logc.Printf(ctx, "redirects err: %s\n", err)
	} else if len(manifest.Redirects) > 0 {
		logc.Printf(ctx, "redirects ok: %d rules\n", len(manifest.Redirects))
	}

	// Check if any redirects are unreachable.
	LintRedirects(manifest)

	// Parse Netlify-style `_headers`.
	if err := ProcessHeadersFile(manifest); err != nil {
		logc.Printf(ctx, "headers err: %s\n", err)
	} else if len(manifest.Headers) > 0 {
		logc.Printf(ctx, "headers ok: %d rules\n", len(manifest.Headers))
	}

	// Sniff content type like `http.ServeContent`.
	DetectContentType(manifest)

	// Opportunistically compress blobs (must be done last).
	CompressFiles(ctx, manifest)

	return nil
}
299
// ErrSiteTooLarge is returned by StoreManifest when the total content size
// exceeds config.Limits.MaxSiteSize.
var ErrSiteTooLarge = errors.New("site too large")

// ErrManifestTooLarge is returned by StoreManifest when the encoded manifest
// exceeds config.Limits.MaxManifestSize.
var ErrManifestTooLarge = errors.New("manifest too large")
302
303// Uploads inline file data over certain size to the storage backend. Returns a copy of
304// the manifest updated to refer to an external content-addressable store.
305func StoreManifest(
306 ctx context.Context, name string, manifest *Manifest, opts ModifyManifestOptions,
307) (*Manifest, error) {
308 span, ctx := ObserveFunction(ctx, "StoreManifest", "manifest.name", name)
309 defer span.Finish()
310
311 // Replace inline files over certain size with references to external data.
312 extManifest := Manifest{
313 RepoUrl: manifest.RepoUrl,
314 Branch: manifest.Branch,
315 Commit: manifest.Commit,
316 Contents: make(map[string]*Entry),
317 Redirects: manifest.Redirects,
318 Headers: manifest.Headers,
319 Problems: manifest.Problems,
320 OriginalSize: manifest.OriginalSize,
321 CompressedSize: manifest.CompressedSize,
322 StoredSize: proto.Int64(0),
323 }
324 for name, entry := range manifest.Contents {
325 cannotBeInlined := entry.GetType() == Type_InlineFile &&
326 entry.GetCompressedSize() > int64(config.Limits.MaxInlineFileSize.Bytes())
327 if cannotBeInlined {
328 dataHash := sha256.Sum256(entry.Data)
329 extManifest.Contents[name] = &Entry{
330 Type: Type_ExternalFile.Enum(),
331 OriginalSize: entry.OriginalSize,
332 CompressedSize: entry.CompressedSize,
333 Data: fmt.Appendf(nil, "sha256-%x", dataHash),
334 Transform: entry.Transform,
335 ContentType: entry.ContentType,
336 GitHash: entry.GitHash,
337 }
338 } else {
339 extManifest.Contents[name] = entry
340 }
341 }
342
343 // Compute the total and deduplicated storage size.
344 totalSize := int64(0)
345 blobSizes := map[string]int64{}
346 for _, entry := range extManifest.Contents {
347 totalSize += entry.GetOriginalSize()
348 if entry.GetType() == Type_ExternalFile {
349 blobSizes[string(entry.Data)] = entry.GetCompressedSize()
350 }
351 }
352 if uint64(totalSize) > config.Limits.MaxSiteSize.Bytes() {
353 return nil, fmt.Errorf("%w: contents size %s exceeds %s limit",
354 ErrSiteTooLarge,
355 datasize.ByteSize(totalSize).HR(),
356 config.Limits.MaxSiteSize.HR(),
357 )
358 }
359 for _, blobSize := range blobSizes {
360 *extManifest.StoredSize += blobSize
361 }
362
363 // Upload the resulting manifest and the blob it references.
364 extManifestData := EncodeManifest(&extManifest)
365 if uint64(len(extManifestData)) > config.Limits.MaxManifestSize.Bytes() {
366 return nil, fmt.Errorf("%w: manifest size %s exceeds %s limit",
367 ErrManifestTooLarge,
368 datasize.ByteSize(len(extManifestData)).HR(),
369 config.Limits.MaxManifestSize,
370 )
371 }
372
373 if err := backend.StageManifest(ctx, &extManifest); err != nil {
374 return nil, fmt.Errorf("stage manifest: %w", err)
375 }
376
377 wg := sync.WaitGroup{}
378 ch := make(chan error, len(extManifest.Contents))
379 for name, entry := range extManifest.Contents {
380 // Upload external entries (those that were decided as ineligible for being stored inline).
381 // If the entry in the original manifest is already an external reference, there's no need
382 // to externalize it (and no way for us to do so, since the entry only contains the blob name).
383 if entry.GetType() == Type_ExternalFile && manifest.Contents[name].GetType() == Type_InlineFile {
384 wg.Go(func() {
385 err := backend.PutBlob(ctx, string(entry.Data), manifest.Contents[name].Data)
386 if err != nil {
387 ch <- fmt.Errorf("put blob %s: %w", name, err)
388 }
389 })
390 }
391 }
392 wg.Wait()
393 close(ch)
394 for err := range ch {
395 return nil, err // currently ignores all but 1st error
396 }
397
398 if err := backend.CommitManifest(ctx, name, &extManifest, opts); err != nil {
399 if errors.Is(err, ErrDomainFrozen) {
400 return nil, err
401 } else {
402 return nil, fmt.Errorf("commit manifest: %w", err)
403 }
404 }
405
406 return &extManifest, nil
407}