[mirror] Scalable static site server for Git forges (like GitHub Pages)
at main 13 kB view raw
1//go:generate protoc --go_out=. --go_opt=paths=source_relative schema.proto 2 3package git_pages 4 5import ( 6 "bytes" 7 "context" 8 "crypto/sha256" 9 "errors" 10 "fmt" 11 "mime" 12 "net/http" 13 "path" 14 "path/filepath" 15 "strings" 16 "sync" 17 "time" 18 19 "github.com/c2h5oh/datasize" 20 "github.com/go-git/go-git/v6/plumbing" 21 format "github.com/go-git/go-git/v6/plumbing/format/config" 22 "github.com/klauspost/compress/zstd" 23 "github.com/prometheus/client_golang/prometheus" 24 "github.com/prometheus/client_golang/prometheus/promauto" 25 "google.golang.org/protobuf/encoding/protojson" 26 "google.golang.org/protobuf/proto" 27) 28 29var ( 30 siteCompressionSpaceSaving = promauto.NewHistogram(prometheus.HistogramOpts{ 31 Name: "git_pages_site_compression_space_saving", 32 Help: "Reduction in site size after compression relative to the uncompressed size", 33 Buckets: []float64{.01, .025, .05, .1, .25, .5, .75, 1, 1.25, 1.5, 1.75, 2, 2.5, 5, 10}, 34 35 NativeHistogramBucketFactor: 1.1, 36 NativeHistogramMaxBucketNumber: 100, 37 NativeHistogramMinResetDuration: 10 * time.Minute, 38 }) 39) 40 41func NewManifest() *Manifest { 42 return &Manifest{ 43 Contents: map[string]*Entry{ 44 "": {Type: Type_Directory.Enum()}, 45 }, 46 } 47} 48 49func IsManifestEmpty(manifest *Manifest) bool { 50 if len(manifest.Contents) > 1 { 51 return false 52 } 53 for name, entry := range manifest.Contents { 54 if name == "" && entry.GetType() == Type_Directory { 55 return true 56 } 57 } 58 panic(fmt.Errorf("malformed manifest %v", manifest)) 59} 60 61// Returns `true` if `left` and `right` contain the same files with the same types and data. 62func CompareManifest(left *Manifest, right *Manifest) bool { 63 if len(left.Contents) != len(right.Contents) { 64 return false 65 } 66 for name, leftEntry := range left.Contents { 67 rightEntry := right.Contents[name] 68 if rightEntry == nil { 69 return false 70 } 71 if leftEntry.GetType() != rightEntry.GetType() { 72 return false 73 } 74 if !bytes.Equal(leftEntry.Data, rightEntry.Data) { 75 return false 76 } 77 } 78 return true 79} 80 81func EncodeManifest(manifest *Manifest) (data []byte) { 82 data, err := proto.MarshalOptions{Deterministic: true}.Marshal(manifest) 83 if err != nil { 84 panic(err) 85 } 86 return 87} 88 89func DecodeManifest(data []byte) (manifest *Manifest, err error) { 90 manifest = &Manifest{} 91 err = proto.Unmarshal(data, manifest) 92 return 93} 94 95func NewManifestEntry(type_ Type, data []byte) *Entry { 96 entry := &Entry{} 97 entry.Type = type_.Enum() 98 if data != nil { 99 entry.Data = data 100 entry.Transform = Transform_Identity.Enum() 101 entry.OriginalSize = proto.Int64(int64(len(data))) 102 entry.CompressedSize = proto.Int64(int64(len(data))) 103 } 104 return entry 105} 106 107func AddFile(manifest *Manifest, fileName string, data []byte) *Entry { 108 // Fill in `git_hash` even for files not originating from git using the SHA256 algorithm; 109 // we use this primarily for incremental archive uploads, but when support for git SHA256 110 // repositories is complete, archive uploads and git checkouts will have cross-support for 111 // incremental updates. 112 hasher := plumbing.NewHasher(format.SHA256, plumbing.BlobObject, int64(len(data))) 113 hasher.Write(data) 114 entry := NewManifestEntry(Type_InlineFile, data) 115 entry.GitHash = proto.String(hasher.Sum().String()) 116 manifest.Contents[fileName] = entry 117 return entry 118} 119 120func AddSymlink(manifest *Manifest, fileName string, target string) *Entry { 121 if path.IsAbs(target) { 122 AddProblem(manifest, fileName, "absolute symlink: %s", target) 123 return nil 124 } else { 125 entry := NewManifestEntry(Type_Symlink, []byte(target)) 126 manifest.Contents[fileName] = entry 127 return entry 128 } 129} 130 131func AddDirectory(manifest *Manifest, dirName string) *Entry { 132 dirName = strings.TrimSuffix(dirName, "/") 133 entry := NewManifestEntry(Type_Directory, nil) 134 manifest.Contents[dirName] = entry 135 return entry 136} 137 138func AddProblem(manifest *Manifest, pathName, format string, args ...any) error { 139 cause := fmt.Sprintf(format, args...) 140 manifest.Problems = append(manifest.Problems, &Problem{ 141 Path: proto.String(pathName), 142 Cause: proto.String(cause), 143 }) 144 return fmt.Errorf("%s: %s", pathName, cause) 145} 146 147// EnsureLeadingDirectories adds directory entries for any parent directories 148// that are implicitly referenced by files in the manifest but don't have 149// explicit directory entries. (This can be the case if an archive is created 150// via globs rather than including a whole directory.) 151func EnsureLeadingDirectories(manifest *Manifest) { 152 for name := range manifest.Contents { 153 for dir := path.Dir(name); dir != "." && dir != ""; dir = path.Dir(dir) { 154 if _, exists := manifest.Contents[dir]; !exists { 155 AddDirectory(manifest, dir) 156 } 157 } 158 } 159} 160 161func GetProblemReport(manifest *Manifest) []string { 162 var report []string 163 for _, problem := range manifest.Problems { 164 report = append(report, 165 fmt.Sprintf("/%s: %s", problem.GetPath(), problem.GetCause())) 166 } 167 return report 168} 169 170func ManifestJSON(manifest *Manifest) []byte { 171 json, err := protojson.MarshalOptions{ 172 Multiline: true, 173 EmitDefaultValues: true, 174 }.Marshal(manifest) 175 if err != nil { 176 panic(err) 177 } 178 return json 179} 180 181var ErrSymlinkLoop = errors.New("symbolic link loop") 182 183func ExpandSymlinks(manifest *Manifest, inPath string) (string, error) { 184 var levels uint 185again: 186 for levels = 0; levels < config.Limits.MaxSymlinkDepth; levels += 1 { 187 parts := strings.Split(inPath, "/") 188 for i := 1; i <= len(parts); i++ { 189 linkPath := path.Join(parts[:i]...) 190 entry := manifest.Contents[linkPath] 191 if entry != nil && entry.GetType() == Type_Symlink { 192 inPath = path.Join( 193 path.Dir(linkPath), 194 string(entry.Data), 195 path.Join(parts[i:]...), 196 ) 197 continue again 198 } 199 } 200 break 201 } 202 if levels < config.Limits.MaxSymlinkDepth { 203 return inPath, nil 204 } else { 205 return "", ErrSymlinkLoop 206 } 207} 208 209// Sniff content type using the same algorithm as `http.ServeContent`. 210func DetectContentType(manifest *Manifest) { 211 for path, entry := range manifest.Contents { 212 if entry.GetType() == Type_Directory || entry.GetType() == Type_Symlink { 213 // no Content-Type 214 } else if entry.GetType() == Type_InlineFile && entry.GetTransform() == Transform_Identity { 215 contentType := mime.TypeByExtension(filepath.Ext(path)) 216 if contentType == "" { 217 contentType = http.DetectContentType(entry.Data[:min(512, len(entry.Data))]) 218 } 219 entry.ContentType = proto.String(contentType) 220 } else if entry.GetContentType() == "" { 221 panic(fmt.Errorf("DetectContentType encountered invalid entry: %v, %v", 222 entry.GetType(), entry.GetTransform())) 223 } 224 } 225} 226 227// The `klauspost/compress/zstd` package recommends reusing a compressor to avoid repeated 228// allocations of internal buffers. 229var zstdEncoder, _ = zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedBetterCompression)) 230 231// Compress contents of inline files. 232func CompressFiles(ctx context.Context, manifest *Manifest) { 233 span, _ := ObserveFunction(ctx, "CompressFiles") 234 defer span.Finish() 235 236 var originalSize int64 237 var compressedSize int64 238 for _, entry := range manifest.Contents { 239 if entry.GetType() == Type_InlineFile && entry.GetTransform() == Transform_Identity { 240 mediaType := getMediaType(entry.GetContentType()) 241 if strings.HasPrefix(mediaType, "video/") || strings.HasPrefix(mediaType, "audio/") { 242 continue 243 } 244 compressedData := zstdEncoder.EncodeAll(entry.GetData(), 245 make([]byte, 0, entry.GetOriginalSize())) 246 if int64(len(compressedData)) < entry.GetOriginalSize() { 247 entry.Data = compressedData 248 entry.Transform = Transform_Zstd.Enum() 249 entry.CompressedSize = proto.Int64(int64(len(entry.Data))) 250 } 251 } 252 originalSize += entry.GetOriginalSize() 253 compressedSize += entry.GetCompressedSize() 254 } 255 manifest.OriginalSize = proto.Int64(originalSize) 256 manifest.CompressedSize = proto.Int64(compressedSize) 257 258 if originalSize != 0 { 259 spaceSaving := (float64(originalSize) - float64(compressedSize)) / float64(originalSize) 260 logc.Printf(ctx, "compress: saved %.2f percent (%s to %s)", 261 spaceSaving*100.0, 262 datasize.ByteSize(originalSize).HR(), 263 datasize.ByteSize(compressedSize).HR(), 264 ) 265 siteCompressionSpaceSaving. 266 Observe(spaceSaving) 267 } 268} 269 270// Apply post-processing steps to the manifest. 271// At the moment, there isn't a good way to report errors except to log them on the terminal. 272// (Perhaps in the future they could be exposed at `.git-pages/status.txt`?) 273func PrepareManifest(ctx context.Context, manifest *Manifest) error { 274 // Parse Netlify-style `_redirects`. 275 if err := ProcessRedirectsFile(manifest); err != nil { 276 logc.Printf(ctx, "redirects err: %s\n", err) 277 } else if len(manifest.Redirects) > 0 { 278 logc.Printf(ctx, "redirects ok: %d rules\n", len(manifest.Redirects)) 279 } 280 281 // Check if any redirects are unreachable. 282 LintRedirects(manifest) 283 284 // Parse Netlify-style `_headers`. 285 if err := ProcessHeadersFile(manifest); err != nil { 286 logc.Printf(ctx, "headers err: %s\n", err) 287 } else if len(manifest.Headers) > 0 { 288 logc.Printf(ctx, "headers ok: %d rules\n", len(manifest.Headers)) 289 } 290 291 // Sniff content type like `http.ServeContent`. 292 DetectContentType(manifest) 293 294 // Opportunistically compress blobs (must be done last). 295 CompressFiles(ctx, manifest) 296 297 return nil 298} 299 300var ErrSiteTooLarge = errors.New("site too large") 301var ErrManifestTooLarge = errors.New("manifest too large") 302 303// Uploads inline file data over certain size to the storage backend. Returns a copy of 304// the manifest updated to refer to an external content-addressable store. 305func StoreManifest( 306 ctx context.Context, name string, manifest *Manifest, opts ModifyManifestOptions, 307) (*Manifest, error) { 308 span, ctx := ObserveFunction(ctx, "StoreManifest", "manifest.name", name) 309 defer span.Finish() 310 311 // Replace inline files over certain size with references to external data. 312 extManifest := Manifest{ 313 RepoUrl: manifest.RepoUrl, 314 Branch: manifest.Branch, 315 Commit: manifest.Commit, 316 Contents: make(map[string]*Entry), 317 Redirects: manifest.Redirects, 318 Headers: manifest.Headers, 319 Problems: manifest.Problems, 320 OriginalSize: manifest.OriginalSize, 321 CompressedSize: manifest.CompressedSize, 322 StoredSize: proto.Int64(0), 323 } 324 for name, entry := range manifest.Contents { 325 cannotBeInlined := entry.GetType() == Type_InlineFile && 326 entry.GetCompressedSize() > int64(config.Limits.MaxInlineFileSize.Bytes()) 327 if cannotBeInlined { 328 dataHash := sha256.Sum256(entry.Data) 329 extManifest.Contents[name] = &Entry{ 330 Type: Type_ExternalFile.Enum(), 331 OriginalSize: entry.OriginalSize, 332 CompressedSize: entry.CompressedSize, 333 Data: fmt.Appendf(nil, "sha256-%x", dataHash), 334 Transform: entry.Transform, 335 ContentType: entry.ContentType, 336 GitHash: entry.GitHash, 337 } 338 } else { 339 extManifest.Contents[name] = entry 340 } 341 } 342 343 // Compute the total and deduplicated storage size. 344 totalSize := int64(0) 345 blobSizes := map[string]int64{} 346 for _, entry := range extManifest.Contents { 347 totalSize += entry.GetOriginalSize() 348 if entry.GetType() == Type_ExternalFile { 349 blobSizes[string(entry.Data)] = entry.GetCompressedSize() 350 } 351 } 352 if uint64(totalSize) > config.Limits.MaxSiteSize.Bytes() { 353 return nil, fmt.Errorf("%w: contents size %s exceeds %s limit", 354 ErrSiteTooLarge, 355 datasize.ByteSize(totalSize).HR(), 356 config.Limits.MaxSiteSize.HR(), 357 ) 358 } 359 for _, blobSize := range blobSizes { 360 *extManifest.StoredSize += blobSize 361 } 362 363 // Upload the resulting manifest and the blob it references. 364 extManifestData := EncodeManifest(&extManifest) 365 if uint64(len(extManifestData)) > config.Limits.MaxManifestSize.Bytes() { 366 return nil, fmt.Errorf("%w: manifest size %s exceeds %s limit", 367 ErrManifestTooLarge, 368 datasize.ByteSize(len(extManifestData)).HR(), 369 config.Limits.MaxManifestSize, 370 ) 371 } 372 373 if err := backend.StageManifest(ctx, &extManifest); err != nil { 374 return nil, fmt.Errorf("stage manifest: %w", err) 375 } 376 377 wg := sync.WaitGroup{} 378 ch := make(chan error, len(extManifest.Contents)) 379 for name, entry := range extManifest.Contents { 380 // Upload external entries (those that were decided as ineligible for being stored inline). 381 // If the entry in the original manifest is already an external reference, there's no need 382 // to externalize it (and no way for us to do so, since the entry only contains the blob name). 383 if entry.GetType() == Type_ExternalFile && manifest.Contents[name].GetType() == Type_InlineFile { 384 wg.Go(func() { 385 err := backend.PutBlob(ctx, string(entry.Data), manifest.Contents[name].Data) 386 if err != nil { 387 ch <- fmt.Errorf("put blob %s: %w", name, err) 388 } 389 }) 390 } 391 } 392 wg.Wait() 393 close(ch) 394 for err := range ch { 395 return nil, err // currently ignores all but 1st error 396 } 397 398 if err := backend.CommitManifest(ctx, name, &extManifest, opts); err != nil { 399 if errors.Is(err, ErrDomainFrozen) { 400 return nil, err 401 } else { 402 return nil, fmt.Errorf("commit manifest: %w", err) 403 } 404 } 405 406 return &extManifest, nil 407}