# Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver),
# taken at branch "python-updates" (433 lines, 14 kB in the original view).
"""
This script generates a Docker image from a set of store paths. Uses
Docker Image Specification v1.2 as reference [1].

It expects a JSON file with the following properties and writes the
image as an uncompressed tarball to stdout:

* "architecture", "config", "os", "created", "repo_tag" correspond to
  the fields with the same name on the image spec [2].
* "created" can be "now".
* "created" is also used as mtime for files added to the image.
* "uid", "gid", "uname", "gname" is the file ownership, for example,
  0, 0, "root", "root".
* "store_layers" is a list of layers in ascending order, where each
  layer is the list of store paths to include in that layer.

The main challenge for this script is to create the final image in a
streaming fashion, without dumping any intermediate data to disk
for performance.

A docker image has each layer contents archived as separate tarballs,
and they later all get enveloped into a single big tarball in a
content addressed fashion. However, because how "tar" format works,
we have to know about the name (which includes the checksum in our
case) and the size of the tarball before we can start adding it to the
outer tarball. We achieve that by creating the layer tarballs twice;
on the first iteration we calculate the file size and the checksum,
and on the second one we actually stream the contents. 'add_layer_dir'
function does all this.

[1]: https://github.com/moby/moby/blob/master/image/spec/v1.2.md
[2]: https://github.com/moby/moby/blob/4fb59c20a4fb54f944fe170d0ff1d00eb4a24d6f/image/spec/v1.2.md#image-json-field-descriptions
"""  # noqa: E501

import argparse
import io
import os
import re
import sys
import json
import hashlib
import pathlib
import tarfile
import itertools
import threading
from datetime import datetime, timezone
from collections import namedtuple


def archive_paths_to(obj, paths, mtime, uid, gid, uname, gname):
    """
    Writes the given store paths as a tar file to the given stream.

    obj: Stream to write to. Should have a 'write' method.
    paths: List of store paths.
    mtime: POSIX timestamp applied to every archived entry.
    uid, gid, uname, gname: Ownership applied to every archived entry.
    """

    # gettarinfo makes the paths relative, this makes them
    # absolute again
    def append_root(ti):
        ti.name = "/" + ti.name
        return ti

    def apply_filters(ti):
        ti.mtime = mtime
        ti.uid = uid
        ti.gid = gid
        ti.uname = uname
        ti.gname = gname
        return ti

    def nix_root(ti):
        ti.mode = 0o0755  # rwxr-xr-x
        return ti

    def dir(path):
        ti = tarfile.TarInfo(path)
        ti.type = tarfile.DIRTYPE
        return ti

    # mode="w|" streams the tar to the target without seeking, which is
    # what allows writing to a pipe / checksum sink.
    with tarfile.open(fileobj=obj, mode="w|") as tar:
        # To be consistent with the docker utilities, we need to have
        # these directories first when building layer tarballs.
        tar.addfile(apply_filters(nix_root(dir("/nix"))))
        tar.addfile(apply_filters(nix_root(dir("/nix/store"))))

        for path in paths:
            path = pathlib.Path(path)
            if path.is_symlink():
                files = [path]
            else:
                files = itertools.chain([path], path.rglob("*"))

            # sorted() keeps the two passes over the same paths
            # byte-identical, so checksum and streamed contents agree.
            for filename in sorted(files):
                ti = append_root(tar.gettarinfo(filename))

                # copy hardlinks as regular files
                if ti.islnk():
                    ti.type = tarfile.REGTYPE
                    ti.linkname = ""
                    ti.size = filename.stat().st_size

                ti = apply_filters(ti)
                if ti.isfile():
                    with open(filename, "rb") as f:
                        tar.addfile(ti, f)
                else:
                    tar.addfile(ti)


class ExtractChecksum:
    """
    A writable stream which only calculates the final file size and
    sha256sum, while discarding the actual contents.
    """

    def __init__(self):
        self._digest = hashlib.sha256()
        self._size = 0

    def write(self, data):
        self._digest.update(data)
        self._size += len(data)

    def extract(self):
        """
        Returns: Hex-encoded sha256sum and size as a tuple.
        """
        return (self._digest.hexdigest(), self._size)


FromImage = namedtuple("FromImage", ["tar", "manifest_json", "image_json"])
# Some metadata for a layer
LayerInfo = namedtuple("LayerInfo", ["size", "checksum", "path", "paths"])


def load_from_image(from_image_str):
    """
    Loads the given base image, if any.

    from_image_str: Path to the base image archive.

    Returns: A 'FromImage' object with references to the loaded base image,
             or 'None' if no base image was provided.
    """
    if from_image_str is None:
        return None

    base_tar = tarfile.open(from_image_str)

    manifest_json_tarinfo = base_tar.getmember("manifest.json")
    with base_tar.extractfile(manifest_json_tarinfo) as f:
        manifest_json = json.load(f)

    image_json_tarinfo = base_tar.getmember(manifest_json[0]["Config"])
    with base_tar.extractfile(image_json_tarinfo) as f:
        image_json = json.load(f)

    return FromImage(base_tar, manifest_json, image_json)


def add_base_layers(tar, from_image):
    """
    Adds the layers from the given base image to the final image.

    tar: 'tarfile.TarFile' object for new layers to be added to.
    from_image: 'FromImage' object with references to the loaded base image.

    Yields: A 'LayerInfo' per base layer copied into 'tar'.
    """
    if from_image is None:
        print("No 'fromImage' provided", file=sys.stderr)
        return []

    layers = from_image.manifest_json[0]["Layers"]
    checksums = from_image.image_json["rootfs"]["diff_ids"]
    layers_checksums = zip(layers, checksums)

    for num, (layer, checksum) in enumerate(layers_checksums, start=1):
        layer_tarinfo = from_image.tar.getmember(layer)
        checksum = re.sub(r"^sha256:", "", checksum)

        tar.addfile(layer_tarinfo, from_image.tar.extractfile(layer_tarinfo))
        path = layer_tarinfo.path
        size = layer_tarinfo.size

        print("Adding base layer", num, "from", path, file=sys.stderr)
        yield LayerInfo(size=size, checksum=checksum, path=path, paths=[path])

    from_image.tar.close()


def overlay_base_config(from_image, final_config):
    """
    Overlays the final image 'config' JSON on top of selected defaults from the
    base image 'config' JSON.

    from_image: 'FromImage' object with references to the loaded base image.
    final_config: 'dict' object of the final image 'config' JSON.

    Returns: The merged 'config' dict (mutated 'final_config').
    """
    if from_image is None:
        return final_config

    base_config = from_image.image_json["config"]

    # Preserve environment from base image
    final_env = base_config.get("Env", []) + final_config.get("Env", [])
    if final_env:
        # Resolve duplicates (last one wins) and format back as list
        resolved_env = {entry.split("=", 1)[0]: entry for entry in final_env}
        final_config["Env"] = list(resolved_env.values())
    return final_config


def add_layer_dir(tar, paths, store_dir, mtime, uid, gid, uname, gname):
    """
    Appends given store paths to a TarFile object as a new layer.

    tar: 'tarfile.TarFile' object for the new layer to be added to.
    paths: List of store paths.
    store_dir: the root directory of the nix store
    mtime: 'mtime' of the added files and the layer tarball.
           Should be an integer representing a POSIX time.
    uid, gid, uname, gname: Ownership applied to archived entries.

    Returns: A 'LayerInfo' object containing some metadata of
             the layer added.
    """

    invalid_paths = [i for i in paths if not i.startswith(store_dir)]
    assert (
        len(invalid_paths) == 0
    ), f"Expecting absolute paths from {store_dir}, but got: {invalid_paths}"

    # First, calculate the tarball checksum and the size.
    extract_checksum = ExtractChecksum()
    archive_paths_to(extract_checksum, paths, mtime, uid, gid, uname, gname)
    (checksum, size) = extract_checksum.extract()

    path = f"{checksum}/layer.tar"
    layer_tarinfo = tarfile.TarInfo(path)
    layer_tarinfo.size = size
    layer_tarinfo.mtime = mtime

    # Then actually stream the contents to the outer tarball.
    read_fd, write_fd = os.pipe()
    with open(read_fd, "rb") as read, open(write_fd, "wb") as write:

        def producer():
            archive_paths_to(write, paths, mtime, uid, gid, uname, gname)
            write.close()

        # Closing the write end of the fifo also closes the read end,
        # so we don't need to wait until this thread is finished.
        #
        # Any exception from the thread will get printed by the default
        # exception handler, and the 'addfile' call will fail since it
        # won't be able to read required amount of bytes.
        threading.Thread(target=producer).start()
        tar.addfile(layer_tarinfo, read)

    return LayerInfo(size=size, checksum=checksum, path=path, paths=paths)


def add_customisation_layer(target_tar, customisation_layer, mtime):
    """
    Adds the customisation layer as a new layer. This layer is structured
    differently; given store path has the 'layer.tar' and corresponding
    sha256sum ready.

    target_tar: 'tarfile.TarFile' object for the new layer to be added to.
    customisation_layer: Path containing the layer archive.
    mtime: 'mtime' of the added layer tarball.

    Returns: A 'LayerInfo' for the added layer ('size' is 'None' here).
    """

    checksum_path = os.path.join(customisation_layer, "checksum")
    with open(checksum_path) as f:
        checksum = f.read().strip()
    # Fixed: the message previously read "${checksum_path}" (a Nix-style
    # interpolation leaked into the f-string), rendering a stray '$'.
    assert len(checksum) == 64, f"Invalid sha256 at {checksum_path}."

    layer_path = os.path.join(customisation_layer, "layer.tar")

    path = f"{checksum}/layer.tar"
    tarinfo = target_tar.gettarinfo(layer_path)
    tarinfo.name = path
    tarinfo.mtime = mtime

    with open(layer_path, "rb") as f:
        target_tar.addfile(tarinfo, f)

    return LayerInfo(
        size=None, checksum=checksum, path=path, paths=[customisation_layer]
    )


def add_bytes(tar, path, content, mtime):
    """
    Adds a file to the tarball with given path and contents.

    tar: 'tarfile.TarFile' object.
    path: Path of the file as a string.
    content: Contents of the file.
    mtime: 'mtime' of the file. Should be an integer representing a POSIX time.
    """
    assert type(content) is bytes

    ti = tarfile.TarInfo(path)
    ti.size = len(content)
    ti.mtime = mtime
    tar.addfile(ti, io.BytesIO(content))


# Captured once so every "now" in the run resolves to the same instant.
now = datetime.now(tz=timezone.utc)


def parse_time(s):
    """
    Parses an ISO-8601 timestamp string; the literal "now" maps to the
    module-level 'now' captured at startup.
    """
    if s == "now":
        return now
    return datetime.fromisoformat(s)


def main():
    """
    Reads the JSON configuration given on the command line and streams the
    assembled image tarball to stdout; progress goes to stderr.
    """
    arg_parser = argparse.ArgumentParser(
        description="""
This script generates a Docker image from a set of store paths. Uses
Docker Image Specification v1.2 as reference [1].

[1]: https://github.com/moby/moby/blob/master/image/spec/v1.2.md
        """
    )
    arg_parser.add_argument(
        "conf",
        type=str,
        help="""
        JSON file with the following properties and writes the
        image as an uncompressed tarball to stdout:

        * "architecture", "config", "os", "created", "repo_tag" correspond to
          the fields with the same name on the image spec [2].
        * "created" can be "now".
        * "created" is also used as mtime for files added to the image.
        * "uid", "gid", "uname", "gname" is the file ownership, for example,
          0, 0, "root", "root".
        * "store_layers" is a list of layers in ascending order, where each
          layer is the list of store paths to include in that layer.
        """,
    )
    arg_parser.add_argument(
        "--repo_tag", "-t", type=str,
        help="Override the RepoTags from the configuration"
    )

    args = arg_parser.parse_args()
    with open(args.conf, "r") as f:
        conf = json.load(f)

    created = parse_time(conf["created"])
    mtime = int(parse_time(conf["mtime"]).timestamp())
    uid = int(conf["uid"])
    gid = int(conf["gid"])
    uname = conf["uname"]
    gname = conf["gname"]
    store_dir = conf["store_dir"]

    from_image = load_from_image(conf["from_image"])

    with tarfile.open(mode="w|", fileobj=sys.stdout.buffer) as tar:
        layers = []
        layers.extend(add_base_layers(tar, from_image))

        start = len(layers) + 1
        for num, store_layer in enumerate(conf["store_layers"], start=start):
            print(
                "Creating layer",
                num,
                "from paths:",
                store_layer,
                file=sys.stderr,
            )
            info = add_layer_dir(
                tar, store_layer, store_dir, mtime, uid, gid, uname, gname
            )
            layers.append(info)

        print(
            "Creating layer",
            len(layers) + 1,
            "with customisation...",
            file=sys.stderr,
        )
        layers.append(
            add_customisation_layer(
                tar, conf["customisation_layer"], mtime=mtime
            )
        )

        print("Adding manifests...", file=sys.stderr)

        image_json = {
            "created": datetime.isoformat(created),
            "architecture": conf["architecture"],
            "os": "linux",
            "config": overlay_base_config(from_image, conf["config"]),
            "rootfs": {
                "diff_ids": [f"sha256:{layer.checksum}" for layer in layers],
                "type": "layers",
            },
            "history": [
                {
                    "created": datetime.isoformat(created),
                    "comment": f"store paths: {layer.paths}",
                }
                for layer in layers
            ],
        }

        # The image config file is content-addressed by its own sha256.
        image_json = json.dumps(image_json, indent=4).encode("utf-8")
        image_json_checksum = hashlib.sha256(image_json).hexdigest()
        image_json_path = f"{image_json_checksum}.json"
        add_bytes(tar, image_json_path, image_json, mtime=mtime)

        manifest_json = [
            {
                "Config": image_json_path,
                "RepoTags": [args.repo_tag or conf["repo_tag"]],
                "Layers": [layer.path for layer in layers],
            }
        ]
        manifest_json = json.dumps(manifest_json, indent=4).encode("utf-8")
        add_bytes(tar, "manifest.json", manifest_json, mtime=mtime)

        print("Done.", file=sys.stderr)


if __name__ == "__main__":
    main()