[mirror] Scalable static site server for Git forges (like GitHub Pages)

Allow downloading entire site via CLI or HTTP.

The HTTP endpoint is `/.git-pages/archive.tar`, and it is gated behind
the `archive-site` feature flag. It downloads every blob serially and
writes it to the client in a chunked response, optionally compressed
with gzip or zstd as negotiated via `Accept-Encoding:`. It is authorized
the same way as `/.git-pages/manifest.json`, for the same reasons.

The CLI operation is `-get-archive <site-name>` and it writes a tar
archive to stdout. This could be useful for an administrator to review
the contents of a site in response to a report.

Both `_headers` and `_redirects` files are present in the output,
reconstituted from the manifest.

+1 -1
flake.nix
··· 43 43 "-s -w" 44 44 ]; 45 45 46 - vendorHash = "sha256-UQl8AeijqJd2qpVZBDuHT/+Dtd3+Uwrf4w4yAOaFs98="; 46 + vendorHash = "sha256-oVXELOXbRTzzU8pUGNE4K552thlZXGAX7qpv6ETwz6o="; 47 47 }; 48 48 in 49 49 {
+1 -1
go.mod
··· 3 3 go 1.25.0 4 4 5 5 require ( 6 - codeberg.org/git-pages/go-headers v1.0.0 6 + codeberg.org/git-pages/go-headers v1.1.0 7 7 github.com/KimMachineGun/automemlimit v0.7.5 8 8 github.com/c2h5oh/datasize v0.0.0-20231215233829-aa82cc1e6500 9 9 github.com/creasty/defaults v1.8.0
+2
go.sum
··· 1 1 codeberg.org/git-pages/go-headers v1.0.0 h1:hvGU97hQdXaT5HwCpZJWQdg7akvtOBCSUNL4u2a5uTs= 2 2 codeberg.org/git-pages/go-headers v1.0.0/go.mod h1:N4gwH0U3YPwmuyxqH7xBA8j44fTPX+vOEP7ejJVBPts= 3 + codeberg.org/git-pages/go-headers v1.1.0 h1:rk7/SOSsn+XuL7PUQZFYUaWKHEaj6K8mXmUV9rF2VxE= 4 + codeberg.org/git-pages/go-headers v1.1.0/go.mod h1:N4gwH0U3YPwmuyxqH7xBA8j44fTPX+vOEP7ejJVBPts= 3 5 github.com/KimMachineGun/automemlimit v0.7.5 h1:RkbaC0MwhjL1ZuBKunGDjE/ggwAX43DwZrJqVwyveTk= 4 6 github.com/KimMachineGun/automemlimit v0.7.5/go.mod h1:QZxpHaGOQoYvFhv/r4u3U0JTC2ZcOwbSr11UZF46UBM= 5 7 github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
+1
src/auth.go
··· 310 310 } 311 311 } 312 312 313 + // Checks whether an operation that enables enumerating site contents is allowed. 313 314 func AuthorizeMetadataRetrieval(r *http.Request) (*Authorization, error) { 314 315 causes := []error{AuthError{http.StatusUnauthorized, "unauthorized"}} 315 316
+126
src/collect.go
··· 1 + package git_pages 2 + 3 + import ( 4 + "archive/tar" 5 + "context" 6 + "fmt" 7 + "io" 8 + "time" 9 + ) 10 + 11 + type Flusher interface { 12 + Flush() error 13 + } 14 + 15 + // Inverse of `ExtractTar`. 16 + func CollectTar( 17 + context context.Context, writer io.Writer, manifest *Manifest, manifestMtime time.Time, 18 + ) ( 19 + err error, 20 + ) { 21 + archive := tar.NewWriter(writer) 22 + 23 + appendFile := func(header *tar.Header, data []byte, transform Transform) (err error) { 24 + switch transform { 25 + case Transform_None: 26 + case Transform_Zstandard: 27 + data, err = zstdDecoder.DecodeAll(data, []byte{}) 28 + if err != nil { 29 + return err 30 + } 31 + default: 32 + return fmt.Errorf("unexpected transform") 33 + } 34 + header.Size = int64(len(data)) 35 + 36 + err = archive.WriteHeader(header) 37 + if err != nil { 38 + return 39 + } 40 + _, err = archive.Write(data) 41 + return 42 + } 43 + 44 + for fileName, entry := range manifest.Contents { 45 + var header tar.Header 46 + if fileName == "" { 47 + continue 48 + } 49 + header.Name = fileName 50 + 51 + switch entry.GetType() { 52 + case Type_Directory: 53 + header.Typeflag = tar.TypeDir 54 + header.Mode = 0755 55 + header.ModTime = manifestMtime 56 + err = appendFile(&header, nil, Transform_None) 57 + 58 + case Type_InlineFile: 59 + header.Typeflag = tar.TypeReg 60 + header.Mode = 0644 61 + header.ModTime = manifestMtime 62 + err = appendFile(&header, entry.GetData(), entry.GetTransform()) 63 + 64 + case Type_ExternalFile: 65 + var blobReader io.Reader 66 + var blobMtime time.Time 67 + var blobData []byte 68 + blobReader, _, blobMtime, err = backend.GetBlob(context, string(entry.Data)) 69 + if err != nil { 70 + return 71 + } 72 + blobData, _ = io.ReadAll(blobReader) 73 + header.Typeflag = tar.TypeReg 74 + header.Mode = 0644 75 + header.ModTime = blobMtime 76 + err = appendFile(&header, blobData, entry.GetTransform()) 77 + 78 + case Type_Symlink: 79 + header.Typeflag = tar.TypeSymlink 80 + 
header.Mode = 0644 81 + header.ModTime = manifestMtime 82 + err = appendFile(&header, entry.GetData(), Transform_None) 83 + 84 + default: 85 + return fmt.Errorf("unexpected entry type") 86 + } 87 + if err != nil { 88 + return err 89 + } 90 + } 91 + 92 + if redirects := CollectRedirectsFile(manifest); redirects != "" { 93 + err = appendFile(&tar.Header{ 94 + Name: RedirectsFileName, 95 + Typeflag: tar.TypeReg, 96 + Mode: 0644, 97 + ModTime: manifestMtime, 98 + }, []byte(redirects), Transform_None) 99 + if err != nil { 100 + return err 101 + } 102 + } 103 + 104 + if headers := CollectHeadersFile(manifest); headers != "" { 105 + err = appendFile(&tar.Header{ 106 + Name: HeadersFileName, 107 + Typeflag: tar.TypeReg, 108 + Mode: 0644, 109 + ModTime: manifestMtime, 110 + }, []byte(headers), Transform_None) 111 + if err != nil { 112 + return err 113 + } 114 + } 115 + 116 + err = archive.Flush() 117 + if err != nil { 118 + return err 119 + } 120 + 121 + flusher, ok := writer.(Flusher) 122 + if ok { 123 + err = flusher.Flush() 124 + } 125 + return err 126 + }
+21 -6
src/headers.go
··· 15 15 16 16 var ErrHeaderNotAllowed = errors.New("custom header not allowed") 17 17 18 - const headersFileName string = "_headers" 18 + const HeadersFileName string = "_headers" 19 19 20 20 // Lifted from https://docs.netlify.com/manage/routing/headers/, except for `Set-Cookie` 21 21 // the rationale for which does not apply in our environment. ··· 86 86 87 87 // Parses redirects file and injects rules into the manifest. 88 88 func ProcessHeadersFile(manifest *Manifest) error { 89 - headersEntry := manifest.Contents[headersFileName] 90 - delete(manifest.Contents, headersFileName) 89 + headersEntry := manifest.Contents[HeadersFileName] 90 + delete(manifest.Contents, HeadersFileName) 91 91 if headersEntry == nil { 92 92 return nil 93 93 } else if headersEntry.GetType() != Type_InlineFile { 94 - return AddProblem(manifest, headersFileName, 94 + return AddProblem(manifest, HeadersFileName, 95 95 "not a regular file") 96 96 } 97 97 98 98 rules, err := headers.ParseString(string(headersEntry.GetData())) 99 99 if err != nil { 100 - return AddProblem(manifest, headersFileName, 100 + return AddProblem(manifest, HeadersFileName, 101 101 "syntax error: %s", err) 102 102 } 103 103 104 104 for index, rule := range rules { 105 105 if err := validateHeaderRule(rule); err != nil { 106 - AddProblem(manifest, headersFileName, 106 + AddProblem(manifest, HeadersFileName, 107 107 "rule #%d %q: %s", index+1, rule.Path, err) 108 108 continue 109 109 } ··· 120 120 }) 121 121 } 122 122 return nil 123 + } 124 + 125 + func CollectHeadersFile(manifest *Manifest) string { 126 + var headersRules []headers.Rule 127 + for _, manifestRule := range manifest.GetHeaders() { 128 + headersRule := headers.Rule{ 129 + Path: manifestRule.GetPath(), 130 + Headers: http.Header{}, 131 + } 132 + for _, manifestHeader := range manifestRule.GetHeaderMap() { 133 + headersRule.Headers[manifestHeader.GetName()] = manifestHeader.GetValues() 134 + } 135 + headersRules = append(headersRules, headersRule) 136 + } 
137 + return headers.Must(headers.UnparseString(headersRules)) 123 138 } 124 139 125 140 func ApplyHeaderRules(manifest *Manifest, url *url.URL) (headers http.Header, err error) {
+47 -14
src/main.go
··· 69 69 } 70 70 } 71 71 72 + func webRootArg(arg string) string { 73 + switch strings.Count(arg, "/") { 74 + case 0: 75 + return arg + "/.index" 76 + case 1: 77 + return arg 78 + default: 79 + log.Fatalf("webroot argument must be either 'domain.tld' or 'domain.tld/dir") 80 + return "" 81 + } 82 + } 83 + 72 84 func Main() { 73 85 printConfigEnvVars := flag.Bool("print-config-env-vars", false, 74 86 "print every recognized configuration environment variable and exit") ··· 80 92 "run without configuration file (configure via environment variables)") 81 93 runMigration := flag.String("run-migration", "", 82 94 "run a specific store migration (available: \"create-domain-markers\")") 95 + getBlob := flag.String("get-blob", "", 96 + "write contents of `blob-ref` ('sha256-xxxxxxx...xxx') to stdout") 83 97 getManifest := flag.String("get-manifest", "", 84 - "write manifest for `webroot` (either 'domain.tld' or 'domain.tld/dir') to stdout as ProtoJSON") 85 - getBlob := flag.String("get-blob", "", 86 - "write `blob` ('sha256-xxxxxxx...xxx') to stdout") 98 + "write manifest for `site-name` (either 'domain.tld' or 'domain.tld/dir') to stdout as ProtoJSON") 99 + getArchive := flag.String("get-archive", "", 100 + "write archive for `site-name` (either 'domain.tld' or 'domain.tld/dir') to stdout in tar format") 87 101 updateSite := flag.String("update-site", "", 88 - "update site for `webroot` (either 'domain.tld' or 'domain.tld/dir') from archive or repository URL") 102 + "update site for `site-name` (either 'domain.tld' or 'domain.tld/dir') from archive or repository URL") 89 103 flag.Parse() 90 104 91 - if *getManifest != "" && *getBlob != "" { 92 - log.Fatalln("-get-manifest and -get-blob are mutually exclusive") 105 + var cliOperations int 106 + if *getBlob != "" { 107 + cliOperations += 1 108 + } 109 + if *getManifest != "" { 110 + cliOperations += 1 111 + } 112 + if *getArchive != "" { 113 + cliOperations += 1 114 + } 115 + if cliOperations > 1 { 116 + 
log.Fatalln("-get-blob, -get-manifest, and -get-archive are mutually exclusive") 93 117 } 94 118 95 119 if *configTomlPath != "" && *noConfig { ··· 150 174 log.Fatalln(err) 151 175 } 152 176 153 - case *getManifest != "": 177 + case *getBlob != "": 154 178 if err := ConfigureBackend(&config.Storage); err != nil { 155 179 log.Fatalln(err) 156 180 } 157 181 158 - webRoot := *getManifest 159 - if !strings.Contains(webRoot, "/") { 160 - webRoot += "/.index" 182 + reader, _, _, err := backend.GetBlob(context.Background(), *getBlob) 183 + if err != nil { 184 + log.Fatalln(err) 185 + } 186 + 187 + io.Copy(os.Stdout, reader) 188 + 189 + case *getManifest != "": 190 + if err := ConfigureBackend(&config.Storage); err != nil { 191 + log.Fatalln(err) 161 192 } 162 193 194 + webRoot := webRootArg(*getManifest) 163 195 manifest, _, err := backend.GetManifest(context.Background(), webRoot, GetManifestOptions{}) 164 196 if err != nil { 165 197 log.Fatalln(err) 166 198 } 167 199 fmt.Println(ManifestDebugJSON(manifest)) 168 200 169 - case *getBlob != "": 201 + case *getArchive != "": 170 202 if err := ConfigureBackend(&config.Storage); err != nil { 171 203 log.Fatalln(err) 172 204 } 173 205 174 - reader, _, _, err := backend.GetBlob(context.Background(), *getBlob) 206 + webRoot := webRootArg(*getArchive) 207 + manifest, manifestMtime, err := 208 + backend.GetManifest(context.Background(), webRoot, GetManifestOptions{}) 175 209 if err != nil { 176 210 log.Fatalln(err) 177 211 } 178 - 179 - io.Copy(os.Stdout, reader) 212 + CollectTar(context.Background(), os.Stdout, manifest, manifestMtime) 180 213 181 214 case *updateSite != "": 182 215 if err := ConfigureBackend(&config.Storage); err != nil {
+38 -4
src/pages.go
··· 2 2 3 3 import ( 4 4 "bytes" 5 + "compress/gzip" 5 6 "context" 6 7 "encoding/json" 7 8 "errors" ··· 159 160 } 160 161 if metadataPath, found := strings.CutPrefix(sitePath, ".git-pages/"); found { 161 162 lastModified := manifestMtime.UTC().Format(http.TimeFormat) 162 - switch metadataPath { 163 - case "health": 163 + switch { 164 + case metadataPath == "health": 164 165 w.Header().Add("Last-Modified", lastModified) 165 166 w.WriteHeader(http.StatusOK) 166 167 fmt.Fprintf(w, "ok\n") 168 + return nil 167 169 168 - case "manifest.json": 170 + case metadataPath == "manifest.json": 169 171 // metadata requests require authorization to avoid making pushes from private 170 172 // repositories enumerable 171 173 _, err := AuthorizeMetadataRetrieval(r) ··· 177 179 w.Header().Add("Last-Modified", lastModified) 178 180 w.WriteHeader(http.StatusOK) 179 181 w.Write([]byte(ManifestDebugJSON(manifest))) 182 + return nil 183 + 184 + case metadataPath == "archive.tar" && config.Feature("archive-site"): 185 + // same as above 186 + _, err := AuthorizeMetadataRetrieval(r) 187 + if err != nil { 188 + return err 189 + } 190 + 191 + // we only offer `/.git-pages/archive.tar` and not the `.tar.gz`/`.tar.zst` variants 192 + // because HTTP can already request compression using the `Content-Encoding` mechanism 193 + acceptedEncodings := parseHTTPEncodings(r.Header.Get("Accept-Encoding")) 194 + negotiated := acceptedEncodings.Negotiate("zstd", "gzip", "identity") 195 + if negotiated != "" { 196 + w.Header().Set("Content-Encoding", negotiated) 197 + } 198 + w.Header().Add("Content-Type", "application/x-tar") 199 + w.Header().Add("Last-Modified", lastModified) 200 + w.Header().Add("Transfer-Encoding", "chunked") 201 + w.WriteHeader(http.StatusOK) 202 + var iow io.Writer 203 + switch negotiated { 204 + case "", "identity": 205 + iow = w 206 + case "gzip": 207 + iow = gzip.NewWriter(w) 208 + case "zstd": 209 + iow, _ = zstd.NewWriter(w) 210 + } 211 + return CollectTar(r.Context(), iow, 
manifest, manifestMtime) 180 212 181 213 default: 182 214 w.WriteHeader(http.StatusNotFound) 183 215 fmt.Fprintf(w, "not found\n") 216 + return nil 184 217 } 185 - return nil 186 218 } 187 219 188 220 entryPath := sitePath ··· 297 329 default: 298 330 negotiatedEncoding = false 299 331 } 332 + default: 333 + return fmt.Errorf("unexpected transform") 300 334 } 301 335 if !negotiatedEncoding { 302 336 w.WriteHeader(http.StatusNotAcceptable)
+19 -6
src/redirects.go
··· 11 11 "google.golang.org/protobuf/proto" 12 12 ) 13 13 14 - const redirectsFileName string = "_redirects" 14 + const RedirectsFileName string = "_redirects" 15 15 16 16 func unparseRule(rule redirects.Rule) string { 17 17 var statusPart string ··· 87 87 88 88 // Parses redirects file and injects rules into the manifest. 89 89 func ProcessRedirectsFile(manifest *Manifest) error { 90 - redirectsEntry := manifest.Contents[redirectsFileName] 91 - delete(manifest.Contents, redirectsFileName) 90 + redirectsEntry := manifest.Contents[RedirectsFileName] 91 + delete(manifest.Contents, RedirectsFileName) 92 92 if redirectsEntry == nil { 93 93 return nil 94 94 } else if redirectsEntry.GetType() != Type_InlineFile { 95 - return AddProblem(manifest, redirectsFileName, 95 + return AddProblem(manifest, RedirectsFileName, 96 96 "not a regular file") 97 97 } 98 98 99 99 rules, err := redirects.ParseString(string(redirectsEntry.GetData())) 100 100 if err != nil { 101 - return AddProblem(manifest, redirectsFileName, 101 + return AddProblem(manifest, RedirectsFileName, 102 102 "syntax error: %s", err) 103 103 } 104 104 105 105 for index, rule := range rules { 106 106 if err := validateRedirectRule(rule); err != nil { 107 - AddProblem(manifest, redirectsFileName, 107 + AddProblem(manifest, RedirectsFileName, 108 108 "rule #%d %q: %s", index+1, unparseRule(rule), err) 109 109 continue 110 110 } ··· 116 116 }) 117 117 } 118 118 return nil 119 + } 120 + 121 + func CollectRedirectsFile(manifest *Manifest) string { 122 + var rules []string 123 + for _, rule := range manifest.GetRedirects() { 124 + rules = append(rules, unparseRule(redirects.Rule{ 125 + From: rule.GetFrom(), 126 + To: rule.GetTo(), 127 + Status: int(rule.GetStatus()), 128 + Force: rule.GetForce(), 129 + })+"\n") 130 + } 131 + return strings.Join(rules, "") 119 132 } 120 133 121 134 func pathSegments(path string) []string {