#!/usr/bin/env python3

# usage: auto-layer.py graph_file [ignore_file] [layer_limit]
# graph_file:  Path to a JSON file as generated by writeReferencesGraph
# ignore_file: Path to a file with a list of store paths that should not appear in the output
# layer_limit: Maximum number of layers to generate, default 100

# This module tries to split a dependency graph of Nix store paths into a
# limited set of layers that together cover all mentioned paths. It tries to
# choose the layers such that different inputs often have the largest layers in
# common, so most layers can be shared, while the differences in the results
# end up in smaller layers.
#
# It does this by splitting off the N largest store paths (by nar size) into
# their own layers, including some of their dependencies.
# Specifically, for a large store path L, it creates a layer with L and any
# store path D that L depends on and for which there is no store path in the
# input that depends on D but not on L.
# Then, if any store paths are depended on by multiple of the chosen large
# store paths, those common dependencies get their own layer, one per set of
# large store paths that depends on them.
# N is iteratively increased until the layer limit is reached.
#
# The reasoning for this algorithm is as follows:
# Most closures contain a few large store paths and many small store paths. If
# we want to share as many bytes as possible with other layered images, we
# should focus on putting the largest paths in their own layers.
# If we had data on how much each store path is used and how likely each
# combination of store paths is, we might be able to infer which large store
# paths are better off being combined into a single layer. However, getting
# that information, let alone keeping it up to date, is very difficult. If we
# can't tell that two large store paths are often going to appear together,
# we're better off giving each of them its own layer.
# This leaves a lot of smaller store paths to be assigned to layers. Anything
# that depends on a large store path L will also depend on all the store paths
# that L depends on, so it makes sense to move the dependencies of L into the
# same layer as L.
#
# Possible improvements:
# - Specifying a size limit below which the algorithm stops using large store
#   paths as new layer roots might further improve sharing, as the layer
#   boundaries would depend less on the number of large store paths in the
#   input.

import json
import os
import sys


def layer_count(layer_split):
    """ The number of distinct layers in a layer split. """
    return len(set(layer_split.values()))


def path_key(path):
    """ Sort key that orders store paths by their name rather than their hash. """
    hash, name = path.split('-', 1)
    return name, hash


def closure(*todo, key):
    """ Find all dependencies of the arguments including the arguments themselves. """
    todo = set(todo)
    done = set()
    while todo:
        x = todo.pop()
        if x not in done:
            done.add(x)
            todo.update(key(x))
    return done


def dependencies(*todo, key):
    """ Find all dependencies of the arguments excluding the arguments themselves. """
    return closure(*todo, key=key) - set(todo)


def minimal_cover(paths, key):
    """ The minimal set of paths that together cover all input paths with their closure.
    None of the result paths depend on each other.
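
    For example, with a hypothetical dependency map where 'a' -> 'b' -> 'c' and 'd' has
    no dependencies, only 'a' and 'd' are needed to cover everything:

    >>> deps = {'a': {'b'}, 'b': {'c'}, 'c': set(), 'd': set()}
    >>> sorted(minimal_cover({'a', 'b', 'c', 'd'}, key=deps.get))
    ['a', 'd']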
""" paths = set(paths) paths_deps = set.union(*(dependencies(d, key=key) for d in paths)) return paths - paths_deps def auto_layer(graph, ignore_paths, layer_limit): # Compute all direct users of each path nodes = {x["path"]: x | {"users": set()} for x in graph} for user in nodes: for ref in nodes[user]["references"]: nodes[ref]["users"] |= {user} def node_deps(path): nonlocal nodes return nodes[path]["references"] def node_users(path): nonlocal nodes return nodes[path]["users"] nodes_by_size = sorted(graph, key=lambda node: node["narSize"]) # Here starts the main algorithm: # The goal is to split the set of store paths into layers such that the layers are likely to be # reusable and that the closure size is spread out over the layers. We do this by iteratively taking # the largest store path and giving it its own layer. This primary store path becomes the identity # of the layer. We also add every dependency of the identifying store path to the same layer unless # it is also used by something that doesn't depend on the identifying store path. More generally, we # put store paths together in the same layer when the set of other layers that depend on it is the # same. # layer_split defines how the layers are currently split. We start with a single layer with no # dependencies. This is encoded as every store path mapped to the empty set of dependencies. # In general, layer_split maps each store path to the set of primary paths that depend on it and # that set defines and identifies the layer. layer_split = {path: frozenset() for path in nodes} primary_paths = set() while nodes_by_size: # Every iteration, we choose the next biggest path to be the root of a new layer. new_primary_path = nodes_by_size.pop()["path"] primary_paths.add(new_primary_path) new_layer_split = layer_split.copy() new_layer_split[new_primary_path] = frozenset({new_primary_path}) new_primary_path_deps = dependencies(new_primary_path, key=node_deps) new_primary_path_users = dependencies(new_primary_path, key=node_users) # Update the set of primary users for every dependency of the new primary path. for dep in new_primary_path_deps: new_layer_split[dep] -= new_primary_path_users if not new_layer_split[dep] & new_primary_path_deps: new_layer_split[dep] |= {new_primary_path} # If we exceed the layer limit, we give up. The previous split should be good enough. if layer_count(new_layer_split) > layer_limit: break layer_split = new_layer_split # Main algorithm done, the layers have been chosen. # Now, let's give each layer some metadata, mostly for debugging. def layer_info(layer_id): nonlocal nodes nonlocal layer_split # The full set of paths in this layer is all the paths that were assigned to it. paths = {path for path, layer_id_2 in layer_split.items() if layer_id == layer_id_2} layerSize = sum(nodes[path]["narSize"] for path in paths) return { "usedBy": sorted(layer_id, key=path_key), "paths": sorted(paths, key=path_key), "layerSize": layerSize, "closureSize": sum(nodes[path]["narSize"] for path in closure(*paths, key=node_deps)), } layers = {layer_id: layer_info(layer_id) for layer_id in set(layer_split.values())} # The layer order doesn't actually matter for docker but it's still kind of neat to have layers come # after all of their dependencies. The easiest way to do that is to order by closure size since a # layer is necessarily always larger than each of its dependencies since it includes them. 
    layer_order = sorted(layers.values(), key=lambda info: info["closureSize"])
    if os.environ.get("DEBUG"):
        print(json.dumps(layer_order, indent=2), file=sys.stderr)

    # Sanity check that every store path ends up in exactly one layer:
    # the layer sizes must add up to the total nar size of the input.
    total_layer_size = sum(node["layerSize"] for node in layer_order)
    total_nar_size = sum(node["narSize"] for node in graph)
    assert total_layer_size == total_nar_size, (total_layer_size, total_nar_size)

    # Format as a list of layers, each defined as a list of store paths.
    # Ignored paths are dropped, and layers left empty by that are dropped entirely.
    return [
        [path for path in layer["paths"] if path not in ignore_paths]
        for layer in layer_order
        if set(layer["paths"]) - ignore_paths
    ]


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        prog='auto-layer',
        description='Split store paths into Docker layers.',
    )
    parser.add_argument('graph_file')
    # nargs='?' makes these positional arguments optional so the defaults actually apply.
    parser.add_argument('ignore_file', nargs='?', default="/dev/null")
    parser.add_argument('layer_limit', nargs='?', type=int, default=100)
    args = parser.parse_args()
    with open(args.graph_file) as f:
        graph = json.load(f)
    with open(args.ignore_file) as f:
        ignore_paths = {line.strip() for line in f}
    print(json.dumps(auto_layer(graph, ignore_paths, args.layer_limit)))
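
# Example (illustrative; the paths and sizes below are made up): the graph file is a JSON
# list of objects with at least "path", "references" and "narSize", e.g.
#
#   [
#     {"path": "/nix/store/aaaa-glibc-2.38", "references": [], "narSize": 30000000},
#     {"path": "/nix/store/bbbb-hello-2.12",
#      "references": ["/nix/store/aaaa-glibc-2.38"], "narSize": 200000}
#   ]
#
# With that input, `./auto-layer.py graph.json` prints the layers, dependencies first:
#
#   [["/nix/store/aaaa-glibc-2.38"], ["/nix/store/bbbb-hello-2.12"]]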