1# Using a simple algorithm, convert the references to a path in to a
2# sorted list of dependent paths based on how often they're referenced
3# and how deep in the tree they live. Equally-"popular" paths are then
4# sorted by name.
5#
6# The existing writeClosure prints the paths in a simple ascii-based sorting of the paths.
7#
8# Sorting the paths by graph improves the chances that the difference
9# between two builds appear near the end of the list, instead of near
10# the beginning. This makes a difference for Nix builds which export a
11# closure for another program to consume, if that program implements its
12# own level of binary diffing.
13#
14# For an example, Docker Images. If each store path is a separate layer
15# then Docker Images can be very efficiently transfered between systems,
16# and we get very good cache reuse between images built with the same
17# version of Nixpkgs. However, since Docker only reliably supports a
18# small number of layers (42) it is important to pick the individual
19# layers carefully. By storing very popular store paths in the first 40
20# layers, we improve the chances that the next Docker image will share
21# many of those layers.*
22#
23# Given the dependency tree:
24#
25# A - B - C - D -\
26# \ \ \ \
27# \ \ \ \
28# \ \ - E ---- F
29# \- G
30#
31# Nodes which have multiple references are duplicated:
32#
33# A - B - C - D - F
34# \ \ \
35# \ \ \- E - F
36# \ \
37# \ \- E - F
38# \
39# \- G
40#
41# Each leaf node is now replaced by a counter defaulted to 1:
42#
43# A - B - C - D - (F:1)
44# \ \ \
45# \ \ \- E - (F:1)
46# \ \
47# \ \- E - (F:1)
48# \
49# \- (G:1)
50#
51# Then each leaf counter is merged with its parent node, replacing the
52# parent node with a counter of 1, and each existing counter being
53# incremented by 1. That is to say `- D - (F:1)` becomes `- (D:1, F:2)`:
54#
55# A - B - C - (D:1, F:2)
56# \ \ \
57# \ \ \- (E:1, F:2)
58# \ \
59# \ \- (E:1, F:2)
60# \
61# \- (G:1)
62#
63# Then each leaf counter is merged with its parent node again, merging
64# any counters, then incrementing each:
65#
66# A - B - (C:1, D:2, E:2, F:5)
67# \ \
68# \ \- (E:1, F:2)
69# \
70# \- (G:1)
71#
72# And again:
73#
74# A - (B:1, C:2, D:3, E:4, F:8)
75# \
76# \- (G:1)
77#
78# And again:
79#
80# (A:1, B:2, C:3, D:4, E:5, F:9, G:2)
81#
82# and then paths have the following "popularity":
83#
84# A 1
85# B 2
86# C 3
87# D 4
88# E 5
89# F 9
90# G 2
91#
92# and the popularity contest would result in the paths being printed as:
93#
94# F
95# E
96# D
97# C
98# B
99# G
100# A
101#
102# * Note: People who have used a Dockerfile before assume Docker's
103# Layers are inherently ordered. However, this is not true -- Docker
104# layers are content-addressable and are not explicitly layered until
105# they are composed in to an Image.
106
107import igraph as igraph
108
109from collections import defaultdict
110from operator import eq
111from toolz import curried as tlz
112from toolz import curry
113
114from .lib import (
115 debug,
116 directed_graph,
117 igraph_to_reference_graph,
118 over,
119 pick_keys,
120 reference_graph_node_keys_to_keep
121)
122
123eq = curry(eq)
124
125pick_keys_to_keep = pick_keys(reference_graph_node_keys_to_keep)
126
127
128# Find paths in the original dataset which are never referenced by
129# any other paths
130def find_roots(closures):
131 debug('closures', closures)
132 roots = []
133
134 for closure in closures:
135 path = closure['path']
136 if not any_refer_to(path, closures):
137 roots.append(path)
138
139 return roots
140
141
142def any_refer_to(path, closures):
143 for closure in closures:
144 if path != closure['path']:
145 if path in closure['references']:
146 return True
147 return False
148
149
150def all_paths(closures):
151 paths = []
152 for closure in closures:
153 paths.append(closure['path'])
154 paths.extend(closure['references'])
155 paths.sort()
156 return list(set(paths))
157
158
159# Convert:
160#
161# [
162# { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] }, # noqa: E501
163# { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
164# { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
165# { path: /nix/store/tux, references: [ /nix/store/tux ] }
166# ]
167#
168# To:
169# {
170# /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
171# /nix/store/bar: [ /nix/store/baz ],
172# /nix/store/baz: [ /nix/store/tux ] },
173# /nix/store/tux: [ ]
174# }
175#
176# Note that it drops self-references to avoid loops.
177
178
179def make_lookup(closures):
180 return {
181 # remove self reference
182 node["path"]: over("references", tlz.remove(eq(node["path"])), node)
183 for node in closures
184 }
185
186
187# Convert:
188#
189# /nix/store/foo with
190# {
191# /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
192# /nix/store/bar: [ /nix/store/baz ],
193# /nix/store/baz: [ /nix/store/tux ] },
194# /nix/store/tux: [ ]
195# }
196#
197# To:
198#
199# {
200# /nix/store/bar: {
201# /nix/store/baz: {
202# /nix/store/tux: {}
203# }
204# },
205# /nix/store/baz: {
206# /nix/store/tux: {}
207# }
208# }
209
210
211def make_graph_segment_from_root(subgraphs_cache, root, lookup):
212 children = {}
213 for ref in lookup[root]:
214 # make_graph_segment_from_root is a pure function, and will
215 # always return the same result based on a given input. Thus,
216 # cache computation.
217 #
218 # Python's assignment will use a pointer, preventing memory
219 # bloat for large graphs.
220 if ref not in subgraphs_cache:
221 debug("Subgraph Cache miss on {}".format(ref))
222 subgraphs_cache[ref] = make_graph_segment_from_root(
223 subgraphs_cache, ref, lookup
224 )
225 else:
226 debug("Subgraph Cache hit on {}".format(ref))
227 children[ref] = subgraphs_cache[ref]
228 return children
229
230
231# Convert a graph segment in to a popularity-counted dictionary:
232#
233# From:
234# {
235# /nix/store/foo: {
236# /nix/store/bar: {
237# /nix/store/baz: {
238# /nix/store/tux: {}
239# }
240# }
241# /nix/store/baz: {
242# /nix/store/tux: {}
243# }
244# }
245# }
246#
247# to:
248# [
249# /nix/store/foo: 1
250# /nix/store/bar: 2
251# /nix/store/baz: 4
252# /nix/store/tux: 6
253# ]
254
255def graph_popularity_contest(popularity_cache, full_graph):
256 popularity = defaultdict(int)
257 for path, subgraph in full_graph.items():
258 popularity[path] += 1
259 # graph_popularity_contest is a pure function, and will
260 # always return the same result based on a given input. Thus,
261 # cache computation.
262 #
263 # Python's assignment will use a pointer, preventing memory
264 # bloat for large graphs.
265 if path not in popularity_cache:
266 debug("Popularity Cache miss on", path)
267 popularity_cache[path] = graph_popularity_contest(
268 popularity_cache, subgraph
269 )
270 else:
271 debug("Popularity Cache hit on", path)
272
273 subcontest = popularity_cache[path]
274 for subpath, subpopularity in subcontest.items():
275 debug("Calculating popularity for", subpath)
276 popularity[subpath] += subpopularity + 1
277
278 return popularity
279
280# Emit a list of packages by popularity, most first:
281#
282# From:
283# [
284# /nix/store/foo: 1
285# /nix/store/bar: 1
286# /nix/store/baz: 2
287# /nix/store/tux: 2
288# ]
289#
290# To:
291# [ /nix/store/baz /nix/store/tux /nix/store/bar /nix/store/foo ]
292
293
294def order_by_popularity(paths):
295 paths_by_popularity = defaultdict(list)
296 popularities = []
297 for path, popularity in paths.items():
298 popularities.append(popularity)
299 paths_by_popularity[popularity].append(path)
300
301 popularities = sorted(set(popularities))
302
303 flat_ordered = []
304 for popularity in popularities:
305 paths = paths_by_popularity[popularity]
306 paths.sort(key=package_name)
307
308 flat_ordered.extend(reversed(paths))
309 return list(reversed(flat_ordered))
310
311
312def package_name(path):
313 parts = path.split('-')
314 start = parts.pop(0)
315 # don't throw away any data, so the order is always the same.
316 # even in cases where only the hash at the start has changed.
317 parts.append(start)
318 return '-'.join(parts)
319
320
321@curry
322def popularity_contest(graph):
323 # Data comes in as an igraph directed graph or in the format produced
324 # by nix's exportReferencesGraph:
325 # [
326 # { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] }, # noqa: E501
327 # { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] }, # noqa: E501
328 # { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] }, # noqa: E501
329 # { path: /nix/store/tux, references: [ /nix/store/tux ] }
330 # ]
331 #
332 # We want to get out a list of paths ordered by how universally,
333 # important they are, ie: tux is referenced by every path, transitively
334 # so it should be #1
335 #
336 # [
337 # /nix/store/tux,
338 # /nix/store/baz,
339 # /nix/store/bar,
340 # /nix/store/foo,
341 # ]
342 #
343 # NOTE: the output is actually a list of igraph graphs with a single vertex
344 # with v["name"] == path, and some properties (defined in
345 # reference_graph_node_keys_to_keep) from the nodes of the input graph
346 # copied as vertex attributes.
347 debug('graph', graph)
348
349 if isinstance(graph, igraph.Graph):
350 graph = igraph_to_reference_graph(graph)
351
352 debug("Finding roots")
353 roots = find_roots(graph)
354
355 debug("Making lookup")
356 lookup = make_lookup(graph)
357
358 full_graph = {}
359 subgraphs_cache = {}
360 for root in roots:
361 debug("Making full graph for", root)
362 full_graph[root] = make_graph_segment_from_root(
363 subgraphs_cache,
364 root,
365 tlz.valmap(
366 tlz.get("references"),
367 lookup
368 )
369 )
370
371 debug("Running contest")
372 contest = graph_popularity_contest({}, full_graph)
373
374 debug("Ordering by popularity")
375 ordered = order_by_popularity(contest)
376
377 debug("Checking for missing paths")
378 missing = []
379
380 for path in all_paths(graph):
381 if path not in ordered:
382 missing.append(path)
383
384 ordered.extend(missing)
385
386 return map(
387 # Turn each path into a graph with 1 vertex.
388 lambda path: directed_graph(
389 # No edges
390 [],
391 # One vertex, with name=path
392 [path],
393 # Setting desired attributes on the vertex.
394 [(path, pick_keys_to_keep(lookup[path]))]
395 ),
396 ordered
397 )