Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
1"""
2This script generates a Docker image from a set of store paths. Uses
3Docker Image Specification v1.2 as reference [1].
4
5It expects a JSON file with the following properties and writes the
6image as an uncompressed tarball to stdout:
7
8* "architecture", "config", "os", "created", "repo_tag" correspond to
9 the fields with the same name on the image spec [2].
10* "created" can be "now".
11* "created" is also used as mtime for files added to the image.
12* "uid", "gid", "uname", "gname" is the file ownership, for example,
13 0, 0, "root", "root".
14* "store_layers" is a list of layers in ascending order, where each
15 layer is the list of store paths to include in that layer.
16
17The main challenge for this script to create the final image in a
18streaming fashion, without dumping any intermediate data to disk
19for performance.
20
21A docker image has each layer contents archived as separate tarballs,
22and they later all get enveloped into a single big tarball in a
23content addressed fashion. However, because how "tar" format works,
24we have to know about the name (which includes the checksum in our
25case) and the size of the tarball before we can start adding it to the
26outer tarball. We achieve that by creating the layer tarballs twice;
27on the first iteration we calculate the file size and the checksum,
28and on the second one we actually stream the contents. 'add_layer_dir'
29function does all this.
30
31[1]: https://github.com/moby/moby/blob/master/image/spec/v1.2.md
32[2]: https://github.com/moby/moby/blob/4fb59c20a4fb54f944fe170d0ff1d00eb4a24d6f/image/spec/v1.2.md#image-json-field-descriptions
33""" # noqa: E501
34
35import argparse
36import io
37import os
38import re
39import sys
40import json
41import hashlib
42import pathlib
43import tarfile
44import itertools
45import threading
46from datetime import datetime, timezone
47from collections import namedtuple
48
49
def archive_paths_to(obj, paths, mtime, uid, gid, uname, gname):
    """
    Writes the given store paths as a tar file to the given stream.

    obj: Stream to write to. Should have a 'write' method.
    paths: List of store paths.
    mtime: POSIX timestamp applied to every archive member.
    uid, gid, uname, gname: ownership applied to every archive member.
    """

    # gettarinfo makes the paths relative, this makes them
    # absolute again
    def append_root(ti):
        ti.name = "/" + ti.name
        return ti

    def apply_filters(ti):
        ti.mtime = mtime
        ti.uid = uid
        ti.gid = gid
        ti.uname = uname
        ti.gname = gname
        return ti

    def nix_root(ti):
        ti.mode = 0o0755  # rwxr-xr-x
        return ti

    # Renamed from 'dir' so the builtin of the same name is not shadowed.
    def dir_entry(path):
        ti = tarfile.TarInfo(path)
        ti.type = tarfile.DIRTYPE
        return ti

    with tarfile.open(fileobj=obj, mode="w|") as tar:
        # To be consistent with the docker utilities, we need to have
        # these directories first when building layer tarballs.
        tar.addfile(apply_filters(nix_root(dir_entry("/nix"))))
        tar.addfile(apply_filters(nix_root(dir_entry("/nix/store"))))

        for path in paths:
            path = pathlib.Path(path)
            if path.is_symlink():
                # rglob on a symlink would follow it; archive just the link.
                files = [path]
            else:
                files = itertools.chain([path], path.rglob("*"))

            for filename in sorted(files):
                ti = append_root(tar.gettarinfo(filename))

                # copy hardlinks as regular files
                if ti.islnk():
                    ti.type = tarfile.REGTYPE
                    ti.linkname = ""
                    ti.size = filename.stat().st_size

                ti = apply_filters(ti)
                if ti.isfile():
                    with open(filename, "rb") as f:
                        tar.addfile(ti, f)
                else:
                    tar.addfile(ti)
109
110
class ExtractChecksum:
    """
    Write-only sink that records the sha256 digest and byte count of
    everything written to it, discarding the data itself.
    """

    def __init__(self):
        self._hasher = hashlib.sha256()
        self._length = 0

    def write(self, chunk):
        # Feed the hash and keep a running byte total; nothing is stored.
        self._length += len(chunk)
        self._hasher.update(chunk)

    def extract(self):
        """Return a (hex sha256, total size) tuple for the bytes seen so far."""
        return (self._hasher.hexdigest(), self._length)
130
131
# Handle onto a loaded base image: the open tar archive plus its parsed
# manifest.json and image config JSON (see load_from_image).
FromImage = namedtuple("FromImage", ["tar", "manifest_json", "image_json"])
# Some metadata for a layer: tar size in bytes (None if unknown), hex
# sha256 of the layer tar, its path inside the image, and the store
# paths it contains.
LayerInfo = namedtuple("LayerInfo", ["size", "checksum", "path", "paths"])
135
136
def load_from_image(from_image_str):
    """
    Open a base image archive and parse its metadata, if one was given.

    from_image_str: Path to the base image archive, or None.

    Returns: A 'FromImage' with the open tar archive and the parsed
    manifest.json plus image config JSON, or None when no base image
    was provided.
    """
    if from_image_str is None:
        return None

    base_tar = tarfile.open(from_image_str)

    def read_json_member(member_name):
        # Parse one JSON file straight out of the archive.
        with base_tar.extractfile(base_tar.getmember(member_name)) as f:
            return json.load(f)

    manifest_json = read_json_member("manifest.json")
    image_json = read_json_member(manifest_json[0]["Config"])

    return FromImage(base_tar, manifest_json, image_json)
160
161
def add_base_layers(tar, from_image):
    """
    Copies every layer of the base image into the final image.

    tar: 'tarfile.TarFile' object for new layers to be added to.
    from_image: 'FromImage' object with references to the loaded base image.

    Yields a 'LayerInfo' per copied base layer; yields nothing when no
    base image was provided.
    """
    if from_image is None:
        print("No 'fromImage' provided", file=sys.stderr)
        return []

    pairs = zip(
        from_image.manifest_json[0]["Layers"],
        from_image.image_json["rootfs"]["diff_ids"],
    )

    for num, (layer, diff_id) in enumerate(pairs, start=1):
        member = from_image.tar.getmember(layer)
        tar.addfile(member, from_image.tar.extractfile(member))

        print("Adding base layer", num, "from", member.path, file=sys.stderr)
        yield LayerInfo(
            size=member.size,
            # diff_ids carry a "sha256:" prefix; LayerInfo stores bare hex.
            checksum=re.sub(r"^sha256:", "", diff_id),
            path=member.path,
            paths=[member.path],
        )

    from_image.tar.close()
189
190
def overlay_base_config(from_image, final_config):
    """
    Overlays the final image 'config' JSON on top of selected defaults from
    the base image 'config' JSON.

    Only the environment is inherited: base-image "Env" entries come first
    and entries from the final image override duplicates by variable name.

    from_image: 'FromImage' object with references to the loaded base image.
    final_config: 'dict' object of the final image 'config' JSON.
    """
    if from_image is None:
        return final_config

    base_env = from_image.image_json["config"].get("Env", [])
    combined_env = base_env + final_config.get("Env", [])
    if combined_env:
        # Last assignment of each variable wins; keep "NAME=value" list form.
        by_name = {}
        for entry in combined_env:
            name = entry.split("=", 1)[0]
            by_name[name] = entry
        final_config["Env"] = list(by_name.values())
    return final_config
211
212
def add_layer_dir(tar, paths, store_dir, mtime, uid, gid, uname, gname):
    """
    Appends given store paths to a TarFile object as a new layer.

    tar: 'tarfile.TarFile' object for the new layer to be added to.
    paths: List of store paths.
    store_dir: the root directory of the nix store
    mtime: 'mtime' of the added files and the layer tarball.
        Should be an integer representing a POSIX time.
    uid, gid, uname, gname: ownership of the files added to the layer.

    Returns: A 'LayerInfo' object containing some metadata of
    the layer added.
    """

    invalid_paths = [i for i in paths if not i.startswith(store_dir)]
    assert (
        len(invalid_paths) == 0
    ), f"Expecting absolute paths from {store_dir}, but got: {invalid_paths}"

    # First pass: stream the layer into a checksum/size sink, because the
    # tar member name (content-addressed) and size must be known before
    # the layer can be added to the outer tarball.
    extract_checksum = ExtractChecksum()
    archive_paths_to(extract_checksum, paths, mtime, uid, gid, uname, gname)
    (checksum, size) = extract_checksum.extract()

    path = f"{checksum}/layer.tar"
    layer_tarinfo = tarfile.TarInfo(path)
    layer_tarinfo.size = size
    layer_tarinfo.mtime = mtime

    # Second pass: actually stream the contents to the outer tarball,
    # through an OS pipe so nothing is buffered on disk. A separate
    # thread produces the layer while 'addfile' consumes it.
    read_fd, write_fd = os.pipe()
    with open(read_fd, "rb") as read, open(write_fd, "wb") as write:

        def producer():
            archive_paths_to(write, paths, mtime, uid, gid, uname, gname)
            write.close()

        # Closing the write end of the fifo also closes the read end,
        # so we don't need to wait until this thread is finished.
        #
        # Any exception from the thread will get printed by the default
        # exception handler, and the 'addfile' call will fail since it
        # won't be able to read required amount of bytes.
        threading.Thread(target=producer).start()
        tar.addfile(layer_tarinfo, read)

    return LayerInfo(size=size, checksum=checksum, path=path, paths=paths)
260
261
def add_customisation_layer(target_tar, customisation_layer, mtime):
    """
    Adds the customisation layer as a new layer. This layer is structured
    differently; the given store path has the 'layer.tar' and corresponding
    sha256sum ready.

    target_tar: 'tarfile.TarFile' object for the new layer to be added to.
    customisation_layer: Path containing the layer archive.
    mtime: 'mtime' of the added layer tarball.

    Returns: A 'LayerInfo' for the added layer ('size' is left as None).

    Raises: AssertionError when the checksum file does not hold a valid
    sha256 hex digest.
    """

    checksum_path = os.path.join(customisation_layer, "checksum")
    with open(checksum_path) as f:
        checksum = f.read().strip()
    # Fixed: the message used shell-style "${...}" inside a Python f-string,
    # which printed a stray literal "$" instead of interpolating.
    assert len(checksum) == 64, f"Invalid sha256 at {checksum_path}."

    layer_path = os.path.join(customisation_layer, "layer.tar")

    # The layer tar is stored under its own checksum, like generated layers.
    path = f"{checksum}/layer.tar"
    tarinfo = target_tar.gettarinfo(layer_path)
    tarinfo.name = path
    tarinfo.mtime = mtime

    with open(layer_path, "rb") as f:
        target_tar.addfile(tarinfo, f)

    return LayerInfo(
        size=None, checksum=checksum, path=path, paths=[customisation_layer]
    )
291
292
def add_bytes(tar, path, content, mtime):
    """
    Adds an in-memory blob to the tarball under the given path.

    tar: 'tarfile.TarFile' object.
    path: Path of the file as a string.
    content: Contents of the file, as 'bytes'.
    mtime: 'mtime' of the file. Should be an integer representing a POSIX time.
    """
    assert type(content) is bytes

    info = tarfile.TarInfo(path)
    info.mtime = mtime
    info.size = len(content)
    tar.addfile(info, io.BytesIO(content))
308
309
# Captured once at import time so every "now" in the config resolves to
# the same instant.
now = datetime.now(tz=timezone.utc)


def parse_time(s):
    """Parse an ISO-8601 timestamp; the literal "now" means script start."""
    return now if s == "now" else datetime.fromisoformat(s)
317
318
def main():
    # Entry point: parse the CLI arguments and the JSON config file, then
    # stream the assembled image as an uncompressed tarball to stdout.
    arg_parser = argparse.ArgumentParser(
        description="""
This script generates a Docker image from a set of store paths. Uses
Docker Image Specification v1.2 as reference [1].

[1]: https://github.com/moby/moby/blob/master/image/spec/v1.2.md
    """
    )
    arg_parser.add_argument(
        "conf",
        type=str,
        help="""
        JSON file with the following properties and writes the
        image as an uncompressed tarball to stdout:

        * "architecture", "config", "os", "created", "repo_tag" correspond to
          the fields with the same name on the image spec [2].
        * "created" can be "now".
        * "created" is also used as mtime for files added to the image.
        * "uid", "gid", "uname", "gname" is the file ownership, for example,
          0, 0, "root", "root".
        * "store_layers" is a list of layers in ascending order, where each
          layer is the list of store paths to include in that layer.
        """,
    )
    arg_parser.add_argument(
        "--repo_tag", "-t", type=str,
        help="Override the RepoTags from the configuration"
    )

    args = arg_parser.parse_args()
    with open(args.conf, "r") as f:
        conf = json.load(f)

    # "created" goes into the image metadata; "mtime" stamps every file.
    created = parse_time(conf["created"])
    mtime = int(parse_time(conf["mtime"]).timestamp())
    uid = int(conf["uid"])
    gid = int(conf["gid"])
    uname = conf["uname"]
    gname = conf["gname"]
    store_dir = conf["store_dir"]

    from_image = load_from_image(conf["from_image"])

    # Stream the outer tarball straight to stdout; progress goes to stderr.
    with tarfile.open(mode="w|", fileobj=sys.stdout.buffer) as tar:
        layers = []
        # Base-image layers (if any) come first, then the store layers,
        # then the customisation layer — order matters for diff_ids below.
        layers.extend(add_base_layers(tar, from_image))

        start = len(layers) + 1
        for num, store_layer in enumerate(conf["store_layers"], start=start):
            print(
                "Creating layer",
                num,
                "from paths:",
                store_layer,
                file=sys.stderr,
            )
            info = add_layer_dir(
                tar, store_layer, store_dir, mtime, uid, gid, uname, gname
            )
            layers.append(info)

        print(
            "Creating layer",
            len(layers) + 1,
            "with customisation...",
            file=sys.stderr,
        )
        layers.append(
            add_customisation_layer(
                tar, conf["customisation_layer"], mtime=mtime
            )
        )

        print("Adding manifests...", file=sys.stderr)

        # Image config JSON [2]; its own sha256 becomes its file name.
        image_json = {
            "created": datetime.isoformat(created),
            "architecture": conf["architecture"],
            "os": "linux",
            "config": overlay_base_config(from_image, conf["config"]),
            "rootfs": {
                "diff_ids": [f"sha256:{layer.checksum}" for layer in layers],
                "type": "layers",
            },
            "history": [
                {
                    "created": datetime.isoformat(created),
                    "comment": f"store paths: {layer.paths}",
                }
                for layer in layers
            ],
        }

        image_json = json.dumps(image_json, indent=4).encode("utf-8")
        image_json_checksum = hashlib.sha256(image_json).hexdigest()
        image_json_path = f"{image_json_checksum}.json"
        add_bytes(tar, image_json_path, image_json, mtime=mtime)

        # Top-level manifest tying the config and layer paths together.
        manifest_json = [
            {
                "Config": image_json_path,
                "RepoTags": [args.repo_tag or conf["repo_tag"]],
                "Layers": [layer.path for layer in layers],
            }
        ]
        manifest_json = json.dumps(manifest_json, indent=4).encode("utf-8")
        add_bytes(tar, "manifest.json", manifest_json, mtime=mtime)

        print("Done.", file=sys.stderr)


if __name__ == "__main__":
    main()