[mirror] Scalable static site server for Git forges (like GitHub Pages)

Collect statistics on blob reuse during archive upload, and align the log line with the existing reuse statistics in the git fetch path.
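Both paths end up logging the same one-line summary through logc.Printf. With the human-readable datasize formatting it should come out roughly like this (the figures here are made up for illustration):

    reuse: 12.4 MB recycled, 356.0 KB transferred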

Changed files: +61 -29

src/extract.go (+43 -10)
···
     "archive/zip"
     "bytes"
     "compress/gzip"
+    "context"
     "errors"
     "fmt"
     "io"
···
         fmt.Errorf("%w: %s limit exceeded", ErrArchiveTooLarge, config.Limits.MaxSiteSize.HR()))
 }
 
-func ExtractGzip(reader io.Reader, next func(io.Reader) (*Manifest, error)) (*Manifest, error) {
+func ExtractGzip(
+    ctx context.Context, reader io.Reader,
+    next func(context.Context, io.Reader) (*Manifest, error),
+) (*Manifest, error) {
     stream, err := gzip.NewReader(reader)
     if err != nil {
         return nil, err
     }
     defer stream.Close()
 
-    return next(boundArchiveStream(stream))
+    return next(ctx, boundArchiveStream(stream))
 }
 
-func ExtractZstd(reader io.Reader, next func(io.Reader) (*Manifest, error)) (*Manifest, error) {
+func ExtractZstd(
+    ctx context.Context, reader io.Reader,
+    next func(context.Context, io.Reader) (*Manifest, error),
+) (*Manifest, error) {
     stream, err := zstd.NewReader(reader)
     if err != nil {
         return nil, err
     }
     defer stream.Close()
 
-    return next(boundArchiveStream(stream))
+    return next(ctx, boundArchiveStream(stream))
 }
 
 // Returns a map of git hash to entry. If `manifest` is nil, returns an empty map.
···
 func addSymlinkOrBlobReference(
     manifest *Manifest, fileName string, target string, index map[string]*Entry,
-) {
+) *Entry {
     if hash, found := strings.CutPrefix(target, BlobReferencePrefix); found {
         if entry, found := index[hash]; found {
             manifest.Contents[fileName] = entry
+            return entry
         } else {
             AddProblem(manifest, fileName, "unresolved reference: %s", target)
+            return nil
         }
     } else {
-        AddSymlink(manifest, fileName, target)
+        return AddSymlink(manifest, fileName, target)
     }
 }
 
-func ExtractTar(reader io.Reader, oldManifest *Manifest) (*Manifest, error) {
+func ExtractTar(ctx context.Context, reader io.Reader, oldManifest *Manifest) (*Manifest, error) {
     archive := tar.NewReader(reader)
+
+    var dataBytesRecycled int64
+    var dataBytesTransferred int64
 
     index := indexManifestByGitHash(oldManifest)
     manifest := NewManifest()
···
                 return nil, fmt.Errorf("tar: %s: %w", fileName, err)
             }
             AddFile(manifest, fileName, fileData)
+            dataBytesTransferred += int64(len(fileData))
         case tar.TypeSymlink:
-            addSymlinkOrBlobReference(manifest, fileName, header.Linkname, index)
+            entry := addSymlinkOrBlobReference(manifest, fileName, header.Linkname, index)
+            dataBytesRecycled += entry.GetOriginalSize()
         case tar.TypeDir:
             AddDirectory(manifest, fileName)
         default:
···
             continue
         }
     }
+
+    logc.Printf(ctx,
+        "reuse: %s recycled, %s transferred\n",
+        datasize.ByteSize(dataBytesRecycled).HR(),
+        datasize.ByteSize(dataBytesTransferred).HR(),
+    )
+
     return manifest, nil
 }
 
-func ExtractZip(reader io.Reader, oldManifest *Manifest) (*Manifest, error) {
+func ExtractZip(ctx context.Context, reader io.Reader, oldManifest *Manifest) (*Manifest, error) {
     data, err := io.ReadAll(reader)
     if err != nil {
         return nil, err
···
         )
     }
 
+    var dataBytesRecycled int64
+    var dataBytesTransferred int64
+
     index := indexManifestByGitHash(oldManifest)
     manifest := NewManifest()
     for _, file := range archive.File {
···
             }
 
             if file.Mode()&os.ModeSymlink != 0 {
-                addSymlinkOrBlobReference(manifest, file.Name, string(fileData), index)
+                entry := addSymlinkOrBlobReference(manifest, file.Name, string(fileData), index)
+                dataBytesRecycled += entry.GetOriginalSize()
             } else {
                 AddFile(manifest, file.Name, fileData)
+                dataBytesTransferred += int64(len(fileData))
             }
         }
     }
+
+    logc.Printf(ctx,
+        "reuse: %s recycled, %s transferred\n",
+        datasize.ByteSize(dataBytesRecycled).HR(),
+        datasize.ByteSize(dataBytesTransferred).HR(),
+    )
+
     return manifest, nil
 }
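One subtlety in the tar and zip paths above: addSymlinkOrBlobReference returns nil for an unresolved blob reference, and the accounting calls entry.GetOriginalSize() on that result unconditionally. Assuming Entry is a protobuf-generated message (the proto.Merge and Reset calls in fetch.go suggest it is), this is fine, because generated getters tolerate nil receivers, along these lines:

    // Sketch of a protobuf-style nil-safe getter (field name assumed); an unresolved
    // reference then simply contributes 0 bytes to dataBytesRecycled instead of panicking.
    func (x *Entry) GetOriginalSize() int64 {
        if x != nil {
            return x.OriginalSize
        }
        return 0
    }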

src/fetch.go (+8 -9)
···
     }
 
     // Collect checkout statistics.
-    var dataBytesFromOldManifest int64
-    var dataBytesFromGitCheckout int64
-    var dataBytesFromGitTransport int64
+    var dataBytesRecycled int64
+    var dataBytesTransferred int64
 
     // First, see if we can extract the blobs from the old manifest. This is the preferred option
     // because it avoids both network transfers and recompression. Note that we do not request
···
         if manifestEntry, found := blobsNeeded[hash]; found {
             manifestEntry.Reset()
             proto.Merge(manifestEntry, oldManifestEntry)
-            dataBytesFromOldManifest += oldManifestEntry.GetOriginalSize()
+            dataBytesRecycled += oldManifestEntry.GetOriginalSize()
             delete(blobsNeeded, hash)
         }
     }
···
     // clone despite asking for a partial clone.
     for hash, manifestEntry := range blobsNeeded {
         if err := readGitBlob(repo, hash, manifestEntry); err == nil {
-            dataBytesFromGitCheckout += manifestEntry.GetOriginalSize()
+            dataBytesTransferred += manifestEntry.GetOriginalSize()
             delete(blobsNeeded, hash)
         }
     }
···
             if err := readGitBlob(repo, hash, manifestEntry); err != nil {
                 return nil, err
             }
-            dataBytesFromGitTransport += manifestEntry.GetOriginalSize()
+            dataBytesTransferred += manifestEntry.GetOriginalSize()
             delete(blobsNeeded, hash)
         }
     }
 
     logc.Printf(ctx,
-        "fetch: %s reused, %s received\n",
-        datasize.ByteSize(dataBytesFromOldManifest).HR(),
-        datasize.ByteSize(dataBytesFromGitCheckout+dataBytesFromGitTransport).HR(),
+        "reuse: %s recycled, %s transferred\n",
+        datasize.ByteSize(dataBytesRecycled).HR(),
+        datasize.ByteSize(dataBytesTransferred).HR(),
     )
 
     return manifest, nil

src/update.go (+10 -10)
···
     // Ignore errors; worst case we have to re-fetch all of the blobs.
     oldManifest, _, _ := backend.GetManifest(ctx, webRoot, GetManifestOptions{})
 
-    extractTar := func(reader io.Reader) (*Manifest, error) {
-        return ExtractTar(reader, oldManifest)
+    extractTar := func(ctx context.Context, reader io.Reader) (*Manifest, error) {
+        return ExtractTar(ctx, reader, oldManifest)
     }
 
     var newManifest *Manifest
     switch contentType {
     case "application/x-tar":
         logc.Printf(ctx, "update %s: (tar)", webRoot)
-        newManifest, err = extractTar(reader) // yellow?
+        newManifest, err = extractTar(ctx, reader) // yellow?
     case "application/x-tar+gzip":
         logc.Printf(ctx, "update %s: (tar.gz)", webRoot)
-        newManifest, err = ExtractGzip(reader, extractTar) // definitely yellow.
+        newManifest, err = ExtractGzip(ctx, reader, extractTar) // definitely yellow.
     case "application/x-tar+zstd":
         logc.Printf(ctx, "update %s: (tar.zst)", webRoot)
-        newManifest, err = ExtractZstd(reader, extractTar)
+        newManifest, err = ExtractZstd(ctx, reader, extractTar)
     case "application/zip":
         logc.Printf(ctx, "update %s: (zip)", webRoot)
-        newManifest, err = ExtractZip(reader, oldManifest)
+        newManifest, err = ExtractZip(ctx, reader, oldManifest)
     default:
         err = errArchiveFormat
     }
···
         return UpdateResult{UpdateError, nil, err}
     }
 
-    applyTarPatch := func(reader io.Reader) (*Manifest, error) {
+    applyTarPatch := func(ctx context.Context, reader io.Reader) (*Manifest, error) {
         // Clone the manifest before starting to mutate it. `GetManifest` may return cached
         // `*Manifest` objects, which should never be mutated.
         newManifest := &Manifest{}
···
     switch contentType {
     case "application/x-tar":
         logc.Printf(ctx, "patch %s: (tar)", webRoot)
-        newManifest, err = applyTarPatch(reader)
+        newManifest, err = applyTarPatch(ctx, reader)
     case "application/x-tar+gzip":
         logc.Printf(ctx, "patch %s: (tar.gz)", webRoot)
-        newManifest, err = ExtractGzip(reader, applyTarPatch)
+        newManifest, err = ExtractGzip(ctx, reader, applyTarPatch)
     case "application/x-tar+zstd":
         logc.Printf(ctx, "patch %s: (tar.zst)", webRoot)
-        newManifest, err = ExtractZstd(reader, applyTarPatch)
+        newManifest, err = ExtractZstd(ctx, reader, applyTarPatch)
     default:
         err = errArchiveFormat
     }
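For callers outside the request path (tests, one-off tooling), the new parameters are cheap to satisfy: as far as this diff shows, ctx is only consumed by the statistics logc.Printf, and indexManifestByGitHash explicitly accepts a nil manifest. A hypothetical standalone call might look like this (tarStream is a placeholder for any io.Reader yielding tar data):

    // Hypothetical direct call, e.g. from a test: nil means there is no previous manifest
    // to recycle blobs from, and context.Background() is enough because ctx only feeds
    // the statistics log line.
    manifest, err := ExtractTar(context.Background(), tarStream, nil)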