//go:generate protoc --go_out=. --go_opt=paths=source_relative schema.proto

package git_pages

import (
	"bytes"
	"context"
	"crypto/sha256"
	"errors"
	"fmt"
	"log"
	"mime"
	"net/http"
	"path"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/c2h5oh/datasize"
	"github.com/klauspost/compress/zstd"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"google.golang.org/protobuf/encoding/protojson"
	"google.golang.org/protobuf/proto"
)

var (
	siteCompressionSpaceSaving = promauto.NewHistogram(prometheus.HistogramOpts{
		Name:    "git_pages_site_compression_space_saving",
		Help:    "Reduction in site size after compression relative to the uncompressed size",
		Buckets: []float64{.01, .025, .05, .1, .25, .5, .75, 1, 1.25, 1.5, 1.75, 2, 2.5, 5, 10},

		NativeHistogramBucketFactor:     1.1,
		NativeHistogramMaxBucketNumber:  100,
		NativeHistogramMinResetDuration: 10 * time.Minute,
	})
)

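// IsManifestEmpty reports whether the manifest contains nothing but the empty root
// directory entry. It panics on a malformed manifest, i.e. one whose only entry is
// not the root directory (or that has no entries at all).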
func IsManifestEmpty(manifest *Manifest) bool {
	if len(manifest.Contents) > 1 {
		return false
	}
	for name, entry := range manifest.Contents {
		if name == "" && entry.GetType() == Type_Directory {
			return true
		}
	}
	panic(fmt.Errorf("malformed manifest %v", manifest))
}

// Returns `true` if `left` and `right` contain the same files with the same types and data.
func CompareManifest(left *Manifest, right *Manifest) bool {
	if len(left.Contents) != len(right.Contents) {
		return false
	}
	for name, leftEntry := range left.Contents {
		rightEntry := right.Contents[name]
		if rightEntry == nil {
			return false
		}
		if leftEntry.GetType() != rightEntry.GetType() {
			return false
		}
		if !bytes.Equal(leftEntry.Data, rightEntry.Data) {
			return false
		}
	}
	return true
}

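// EncodeManifest serializes the manifest with deterministic protobuf marshaling
// (stable map ordering), so repeated encodings of the same manifest are byte-identical.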
func EncodeManifest(manifest *Manifest) []byte {
	result, err := proto.MarshalOptions{Deterministic: true}.Marshal(manifest)
	if err != nil {
		panic(err)
	}
	return result
}

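// DecodeManifest parses a manifest from its protobuf wire format.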
func DecodeManifest(data []byte) (*Manifest, error) {
	manifest := Manifest{}
	err := proto.Unmarshal(data, &manifest)
	return &manifest, err
}

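// AddProblem records a problem for `path` in the manifest and also returns it as an
// error for the caller to propagate.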
func AddProblem(manifest *Manifest, path, format string, args ...any) error {
	cause := fmt.Sprintf(format, args...)
	manifest.Problems = append(manifest.Problems, &Problem{
		Path:  proto.String(path),
		Cause: proto.String(cause),
	})
	return fmt.Errorf("%s: %s", path, cause)
}

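// GetProblemReport formats the problems recorded in the manifest as human-readable
// "path: cause" lines.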
func GetProblemReport(manifest *Manifest) []string {
	var report []string
	for _, problem := range manifest.Problems {
		report = append(report,
			fmt.Sprintf("%s: %s", problem.GetPath(), problem.GetCause()))
	}
	return report
}

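// ManifestDebugJSON renders the manifest as multi-line JSON for debugging.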
func ManifestDebugJSON(manifest *Manifest) string {
	result, err := protojson.MarshalOptions{
		Multiline:         true,
		EmitDefaultValues: true,
	}.Marshal(manifest)
	if err != nil {
		panic(err)
	}
	return string(result)
}

var ErrSymlinkLoop = errors.New("symbolic link loop")

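// ExpandSymlinks resolves symlink entries along `inPath` against the manifest,
// following at most config.Limits.MaxSymlinkDepth levels before giving up with
// ErrSymlinkLoop. For example, if the entry "docs" is a symlink to "site",
// "docs/index.html" expands to "site/index.html".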
func ExpandSymlinks(manifest *Manifest, inPath string) (string, error) {
	var levels uint
again:
	for levels = 0; levels < config.Limits.MaxSymlinkDepth; levels += 1 {
		parts := strings.Split(inPath, "/")
		for i := 1; i <= len(parts); i++ {
			linkPath := path.Join(parts[:i]...)
			entry := manifest.Contents[linkPath]
			if entry != nil && entry.GetType() == Type_Symlink {
				inPath = path.Join(
					path.Dir(linkPath),
					string(entry.Data),
					path.Join(parts[i:]...),
				)
				continue again
			}
		}
		break
	}
	if levels < config.Limits.MaxSymlinkDepth {
		return inPath, nil
	} else {
		return "", ErrSymlinkLoop
	}
}

// Sniff content type using the same algorithm as `http.ServeContent`.
func DetectContentType(manifest *Manifest) {
	for name, entry := range manifest.Contents {
		if entry.GetType() == Type_Directory || entry.GetType() == Type_Symlink {
			// no Content-Type
		} else if entry.GetType() == Type_InlineFile && entry.GetTransform() == Transform_None {
			contentType := mime.TypeByExtension(filepath.Ext(name))
			if contentType == "" {
				// http.DetectContentType considers at most the first 512 bytes
				// and accepts shorter input, so the data can be passed as-is.
				contentType = http.DetectContentType(entry.Data)
			}
			entry.ContentType = proto.String(contentType)
		} else {
			panic(fmt.Errorf("DetectContentType encountered invalid entry: %v, %v",
				entry.GetType(), entry.GetTransform()))
		}
	}
}

// The `klauspost/compress/zstd` package recommends reusing a compressor to avoid repeated
// allocations of internal buffers.
var zstdEncoder, _ = zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedBetterCompression))

// Compress contents of inline files.
func CompressFiles(ctx context.Context, manifest *Manifest) {
	span, _ := ObserveFunction(ctx, "CompressFiles")
	defer span.Finish()

	var originalSize, compressedSize int64
	for _, entry := range manifest.Contents {
		if entry.GetType() == Type_InlineFile && entry.GetTransform() == Transform_None {
			mtype := getMediaType(entry.GetContentType())
			if strings.HasPrefix(mtype, "video/") || strings.HasPrefix(mtype, "audio/") {
				// already-compressed media formats rarely benefit from zstd
				continue
			}
			originalSize += entry.GetSize()
			compressedData := zstdEncoder.EncodeAll(entry.GetData(), make([]byte, 0, entry.GetSize()))
			if int64(len(compressedData)) < entry.GetSize() {
				entry.Data = compressedData
				entry.Size = proto.Int64(int64(len(entry.Data)))
				entry.Transform = Transform_Zstandard.Enum()
			}
			compressedSize += entry.GetSize()
		}
	}
	manifest.OriginalSize = proto.Int64(originalSize)
	manifest.CompressedSize = proto.Int64(compressedSize)

	if originalSize != 0 {
		spaceSaving := (float64(originalSize) - float64(compressedSize)) / float64(originalSize)
		log.Printf("compress: saved %.2f percent (%s to %s)",
			spaceSaving*100.0,
			datasize.ByteSize(originalSize).HR(),
			datasize.ByteSize(compressedSize).HR(),
		)
		siteCompressionSpaceSaving.Observe(spaceSaving)
	}
}

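// Illustrative sketch, not part of the original file: the serving path has to undo
// Transform_Zstandard before returning a blob to a client. Assuming it uses the same
// `klauspost/compress/zstd` package, a single shared decoder mirrors the shared
// encoder above; `DecodeAll` is safe for concurrent use. The names below are
// hypothetical.
var zstdDecoder, _ = zstd.NewReader(nil)

func decompressEntryData(entry *Entry, data []byte) ([]byte, error) {
	// `data` is passed separately because for external files the entry's Data field
	// holds a content hash, not the file contents.
	if entry.GetTransform() == Transform_Zstandard {
		return zstdDecoder.DecodeAll(data, nil)
	}
	return data, nil
}
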
// Apply post-processing steps to the manifest.
// At the moment, there isn't a good way to report errors except to log them to the terminal.
// (Perhaps in the future they could be exposed at `.git-pages/status.txt`?)
func PrepareManifest(ctx context.Context, manifest *Manifest) error {
	// Parse Netlify-style `_redirects`
	if err := ProcessRedirectsFile(manifest); err != nil {
		log.Printf("redirects err: %s\n", err)
	} else if len(manifest.Redirects) > 0 {
		log.Printf("redirects ok: %d rules\n", len(manifest.Redirects))
	}

	// Parse Netlify-style `_headers`
	if err := ProcessHeadersFile(manifest); err != nil {
		log.Printf("headers err: %s\n", err)
	} else if len(manifest.Headers) > 0 {
		log.Printf("headers ok: %d rules\n", len(manifest.Headers))
	}

	// Sniff content type like `http.ServeContent`
	DetectContentType(manifest)

	// Opportunistically compress blobs (must be done last)
	CompressFiles(ctx, manifest)

	return nil
}

var ErrManifestTooLarge = errors.New("manifest too large")

// Uploads inline file data over a certain size to the storage backend. Returns a copy of
// the manifest updated to refer to an external content-addressable store.
func StoreManifest(ctx context.Context, name string, manifest *Manifest) (*Manifest, error) {
	span, ctx := ObserveFunction(ctx, "StoreManifest", "manifest.name", name)
	defer span.Finish()

	// Replace inline files over a certain size with references to external data.
	extManifest := Manifest{
		RepoUrl:        manifest.RepoUrl,
		Branch:         manifest.Branch,
		Commit:         manifest.Commit,
		Contents:       make(map[string]*Entry),
		Redirects:      manifest.Redirects,
		Headers:        manifest.Headers,
		Problems:       manifest.Problems,
		OriginalSize:   manifest.OriginalSize,
		CompressedSize: manifest.CompressedSize,
		StoredSize:     proto.Int64(0),
	}
	extObjectSizes := make(map[string]int64)
	for name, entry := range manifest.Contents {
		cannotBeInlined := entry.GetType() == Type_InlineFile &&
			entry.GetSize() > int64(config.Limits.MaxInlineFileSize.Bytes())
		if cannotBeInlined {
			dataHash := sha256.Sum256(entry.Data)
			extManifest.Contents[name] = &Entry{
				Type:        Type_ExternalFile.Enum(),
				Size:        entry.Size,
				Data:        fmt.Appendf(nil, "sha256-%x", dataHash),
				Transform:   entry.Transform,
				ContentType: entry.ContentType,
			}
			extObjectSizes[string(dataHash[:])] = entry.GetSize()
		} else {
			extManifest.Contents[name] = entry
		}
	}
	// `extObjectSizes` stores each object's size once, so shared blobs are not double-counted
	for _, storedSize := range extObjectSizes {
		*extManifest.StoredSize += storedSize
	}

	// Upload the resulting manifest and the blobs it references.
	extManifestData := EncodeManifest(&extManifest)
	if uint64(len(extManifestData)) > config.Limits.MaxManifestSize.Bytes() {
		return nil, fmt.Errorf("%w: manifest size %s exceeds %s limit",
			ErrManifestTooLarge,
			datasize.ByteSize(len(extManifestData)).HR(),
			config.Limits.MaxManifestSize,
		)
	}

	if err := backend.StageManifest(ctx, &extManifest); err != nil {
		return nil, fmt.Errorf("stage manifest: %w", err)
	}

	wg := sync.WaitGroup{}
	ch := make(chan error, len(extManifest.Contents))
	for name, entry := range extManifest.Contents {
		if entry.GetType() == Type_ExternalFile {
			wg.Go(func() {
				err := backend.PutBlob(ctx, string(entry.Data), manifest.Contents[name].Data)
				if err != nil {
					ch <- fmt.Errorf("put blob %s: %w", name, err)
				}
			})
		}
	}
	wg.Wait()
	close(ch)
	for err := range ch {
		return nil, err // currently ignores all but the first error
	}

	if err := backend.CommitManifest(ctx, name, &extManifest); err != nil {
		return nil, fmt.Errorf("commit manifest: %w", err)
	}

	return &extManifest, nil
}
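
// Illustrative sketch, not part of the original file: one plausible way the pieces above
// compose when publishing a site. The manifest is assumed to have been built elsewhere
// from a Git tree; `siteName` is whatever key the backend files manifests under. The
// function name is hypothetical.
func publishManifestSketch(ctx context.Context, siteName string, manifest *Manifest) (*Manifest, error) {
	// Post-process: redirects, headers, content types, compression.
	if err := PrepareManifest(ctx, manifest); err != nil {
		return nil, err
	}
	// Offload large blobs to the storage backend and commit the manifest.
	stored, err := StoreManifest(ctx, siteName, manifest)
	if err != nil {
		return nil, fmt.Errorf("publish %s: %w", siteName, err)
	}
	log.Printf("published %s: %d entries, %d problems",
		siteName, len(stored.Contents), len(stored.Problems))
	return stored, nil
}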