Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
1# IMPORTANT: Making changes?
2#
3# Validate your changes with python3 ./closure-graph.py --test
4
5
6# Using a simple algorithm, convert the references to a path in to a
7# sorted list of dependent paths based on how often they're referenced
8# and how deep in the tree they live. Equally-"popular" paths are then
9# sorted by name.
10#
11# The existing writeClosure prints the paths in a simple ascii-based
12# sorting of the paths.
13#
14# Sorting the paths by graph improves the chances that the difference
15# between two builds appear near the end of the list, instead of near
16# the beginning. This makes a difference for Nix builds which export a
17# closure for another program to consume, if that program implements its
18# own level of binary diffing.
19#
20# For an example, Docker Images. If each store path is a separate layer
21# then Docker Images can be very efficiently transfered between systems,
22# and we get very good cache reuse between images built with the same
23# version of Nixpkgs. However, since Docker only reliably supports a
24# small number of layers (42) it is important to pick the individual
25# layers carefully. By storing very popular store paths in the first 40
26# layers, we improve the chances that the next Docker image will share
27# many of those layers.*
28#
29# Given the dependency tree:
30#
31# A - B - C - D -\
32# \ \ \ \
33# \ \ \ \
34# \ \ - E ---- F
35# \- G
36#
37# Nodes which have multiple references are duplicated:
38#
39# A - B - C - D - F
40# \ \ \
41# \ \ \- E - F
42# \ \
43# \ \- E - F
44# \
45# \- G
46#
47# Each leaf node is now replaced by a counter defaulted to 1:
48#
49# A - B - C - D - (F:1)
50# \ \ \
51# \ \ \- E - (F:1)
52# \ \
53# \ \- E - (F:1)
54# \
55# \- (G:1)
56#
57# Then each leaf counter is merged with its parent node, replacing the
58# parent node with a counter of 1, and each existing counter being
59# incremented by 1. That is to say `- D - (F:1)` becomes `- (D:1, F:2)`:
60#
61# A - B - C - (D:1, F:2)
62# \ \ \
63# \ \ \- (E:1, F:2)
64# \ \
65# \ \- (E:1, F:2)
66# \
67# \- (G:1)
68#
69# Then each leaf counter is merged with its parent node again, merging
70# any counters, then incrementing each:
71#
72# A - B - (C:1, D:2, E:2, F:5)
73# \ \
74# \ \- (E:1, F:2)
75# \
76# \- (G:1)
77#
78# And again:
79#
80# A - (B:1, C:2, D:3, E:4, F:8)
81# \
82# \- (G:1)
83#
84# And again:
85#
86# (A:1, B:2, C:3, D:4, E:5, F:9, G:2)
87#
88# and then paths have the following "popularity":
89#
90# A 1
91# B 2
92# C 3
93# D 4
94# E 5
95# F 9
96# G 2
97#
98# and the popularity contest would result in the paths being printed as:
99#
100# F
101# E
102# D
103# C
104# B
105# G
106# A
107#
108# * Note: People who have used a Dockerfile before assume Docker's
109# Layers are inherently ordered. However, this is not true -- Docker
110# layers are content-addressable and are not explicitly layered until
111# they are composed in to an Image.
112
113import sys
114import json
115import unittest
116
117from pprint import pprint
118from collections import defaultdict
119
120
121def debug(msg, *args, **kwargs):
122 if False:
123 print(
124 "DEBUG: {}".format(
125 msg.format(*args, **kwargs)
126 ),
127 file=sys.stderr
128 )
129
130
131# Find paths in the original dataset which are never referenced by
132# any other paths
133def find_roots(closures):
134 roots = [];
135
136 for closure in closures:
137 path = closure['path']
138 if not any_refer_to(path, closures):
139 roots.append(path)
140
141 return roots
142
143class TestFindRoots(unittest.TestCase):
144 def test_find_roots(self):
145 self.assertCountEqual(
146 find_roots([
147 {
148 "path": "/nix/store/foo",
149 "references": [
150 "/nix/store/foo",
151 "/nix/store/bar"
152 ]
153 },
154 {
155 "path": "/nix/store/bar",
156 "references": [
157 "/nix/store/bar",
158 "/nix/store/tux"
159 ]
160 },
161 {
162 "path": "/nix/store/hello",
163 "references": [
164 ]
165 }
166 ]),
167 ["/nix/store/foo", "/nix/store/hello"]
168 )
169
170
171def any_refer_to(path, closures):
172 for closure in closures:
173 if path != closure['path']:
174 if path in closure['references']:
175 return True
176 return False
177
178class TestAnyReferTo(unittest.TestCase):
179 def test_has_references(self):
180 self.assertTrue(
181 any_refer_to(
182 "/nix/store/bar",
183 [
184 {
185 "path": "/nix/store/foo",
186 "references": [
187 "/nix/store/bar"
188 ]
189 },
190 ]
191 ),
192 )
193 def test_no_references(self):
194 self.assertFalse(
195 any_refer_to(
196 "/nix/store/foo",
197 [
198 {
199 "path": "/nix/store/foo",
200 "references": [
201 "/nix/store/foo",
202 "/nix/store/bar"
203 ]
204 },
205 ]
206 ),
207 )
208
209def all_paths(closures):
210 paths = []
211 for closure in closures:
212 paths.append(closure['path'])
213 paths.extend(closure['references'])
214 paths.sort()
215 return list(set(paths))
216
217
218class TestAllPaths(unittest.TestCase):
219 def test_returns_all_paths(self):
220 self.assertCountEqual(
221 all_paths([
222 {
223 "path": "/nix/store/foo",
224 "references": [
225 "/nix/store/foo",
226 "/nix/store/bar"
227 ]
228 },
229 {
230 "path": "/nix/store/bar",
231 "references": [
232 "/nix/store/bar",
233 "/nix/store/tux"
234 ]
235 },
236 {
237 "path": "/nix/store/hello",
238 "references": [
239 ]
240 }
241 ]),
242 ["/nix/store/foo", "/nix/store/bar", "/nix/store/hello", "/nix/store/tux",]
243 )
244 def test_no_references(self):
245 self.assertFalse(
246 any_refer_to(
247 "/nix/store/foo",
248 [
249 {
250 "path": "/nix/store/foo",
251 "references": [
252 "/nix/store/foo",
253 "/nix/store/bar"
254 ]
255 },
256 ]
257 ),
258 )
259
260# Convert:
261#
262# [
263# { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
264# { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
265# { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
266# { path: /nix/store/tux, references: [ /nix/store/tux ] }
267# ]
268#
269# To:
270# {
271# /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
272# /nix/store/bar: [ /nix/store/baz ],
273# /nix/store/baz: [ /nix/store/tux ] },
274# /nix/store/tux: [ ]
275# }
276#
277# Note that it drops self-references to avoid loops.
278def make_lookup(closures):
279 lookup = {}
280
281 for closure in closures:
282 # paths often self-refer
283 nonreferential_paths = [ref for ref in closure['references'] if ref != closure['path']]
284 lookup[closure['path']] = nonreferential_paths
285
286 return lookup
287
288class TestMakeLookup(unittest.TestCase):
289 def test_returns_lookp(self):
290 self.assertDictEqual(
291 make_lookup([
292 {
293 "path": "/nix/store/foo",
294 "references": [
295 "/nix/store/foo",
296 "/nix/store/bar"
297 ]
298 },
299 {
300 "path": "/nix/store/bar",
301 "references": [
302 "/nix/store/bar",
303 "/nix/store/tux"
304 ]
305 },
306 {
307 "path": "/nix/store/hello",
308 "references": [
309 ]
310 }
311 ]),
312 {
313 "/nix/store/foo": [ "/nix/store/bar" ],
314 "/nix/store/bar": [ "/nix/store/tux" ],
315 "/nix/store/hello": [ ],
316 }
317 )
318
319# Convert:
320#
321# /nix/store/foo with
322# {
323# /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
324# /nix/store/bar: [ /nix/store/baz ],
325# /nix/store/baz: [ /nix/store/tux ] },
326# /nix/store/tux: [ ]
327# }
328#
329# To:
330#
331# {
332# /nix/store/bar: {
333# /nix/store/baz: {
334# /nix/store/tux: {}
335# }
336# },
337# /nix/store/baz: {
338# /nix/store/tux: {}
339# }
340# }
341subgraphs_cache = {}
342def make_graph_segment_from_root(root, lookup):
343 global subgraphs_cache
344 children = {}
345 for ref in lookup[root]:
346 # make_graph_segment_from_root is a pure function, and will
347 # always return the same result based on a given input. Thus,
348 # cache computation.
349 #
350 # Python's assignment will use a pointer, preventing memory
351 # bloat for large graphs.
352 if ref not in subgraphs_cache:
353 debug("Subgraph Cache miss on {}".format(ref))
354 subgraphs_cache[ref] = make_graph_segment_from_root(ref, lookup)
355 else:
356 debug("Subgraph Cache hit on {}".format(ref))
357 children[ref] = subgraphs_cache[ref]
358 return children
359
360class TestMakeGraphSegmentFromRoot(unittest.TestCase):
361 def test_returns_graph(self):
362 self.assertDictEqual(
363 make_graph_segment_from_root("/nix/store/foo", {
364 "/nix/store/foo": [ "/nix/store/bar" ],
365 "/nix/store/bar": [ "/nix/store/tux" ],
366 "/nix/store/tux": [ ],
367 "/nix/store/hello": [ ],
368 }),
369 {
370 "/nix/store/bar": {
371 "/nix/store/tux": {}
372 }
373 }
374 )
375 def test_returns_graph_tiny(self):
376 self.assertDictEqual(
377 make_graph_segment_from_root("/nix/store/tux", {
378 "/nix/store/foo": [ "/nix/store/bar" ],
379 "/nix/store/bar": [ "/nix/store/tux" ],
380 "/nix/store/tux": [ ],
381 }),
382 {}
383 )
384
385# Convert a graph segment in to a popularity-counted dictionary:
386#
387# From:
388# {
389# /nix/store/foo: {
390# /nix/store/bar: {
391# /nix/store/baz: {
392# /nix/store/tux: {}
393# }
394# }
395# /nix/store/baz: {
396# /nix/store/tux: {}
397# }
398# }
399# }
400#
401# to:
402# [
403# /nix/store/foo: 1
404# /nix/store/bar: 2
405# /nix/store/baz: 4
406# /nix/store/tux: 6
407# ]
408popularity_cache = {}
409def graph_popularity_contest(full_graph):
410 global popularity_cache
411 popularity = defaultdict(int)
412 for path, subgraph in full_graph.items():
413 popularity[path] += 1
414 # graph_popularity_contest is a pure function, and will
415 # always return the same result based on a given input. Thus,
416 # cache computation.
417 #
418 # Python's assignment will use a pointer, preventing memory
419 # bloat for large graphs.
420 if path not in popularity_cache:
421 debug("Popularity Cache miss on {}", path)
422 popularity_cache[path] = graph_popularity_contest(subgraph)
423 else:
424 debug("Popularity Cache hit on {}", path)
425
426 subcontest = popularity_cache[path]
427 for subpath, subpopularity in subcontest.items():
428 debug("Calculating popularity for {}", subpath)
429 popularity[subpath] += subpopularity + 1
430
431 return popularity
432
433class TestGraphPopularityContest(unittest.TestCase):
434 def test_counts_popularity(self):
435 self.assertDictEqual(
436 graph_popularity_contest({
437 "/nix/store/foo": {
438 "/nix/store/bar": {
439 "/nix/store/baz": {
440 "/nix/store/tux": {}
441 }
442 },
443 "/nix/store/baz": {
444 "/nix/store/tux": {}
445 }
446 }
447 }),
448 {
449 "/nix/store/foo": 1,
450 "/nix/store/bar": 2,
451 "/nix/store/baz": 4,
452 "/nix/store/tux": 6,
453 }
454 )
455
456# Emit a list of packages by popularity, most first:
457#
458# From:
459# [
460# /nix/store/foo: 1
461# /nix/store/bar: 1
462# /nix/store/baz: 2
463# /nix/store/tux: 2
464# ]
465#
466# To:
467# [ /nix/store/baz /nix/store/tux /nix/store/bar /nix/store/foo ]
468def order_by_popularity(paths):
469 paths_by_popularity = defaultdict(list)
470 popularities = []
471 for path, popularity in paths.items():
472 popularities.append(popularity)
473 paths_by_popularity[popularity].append(path)
474
475 popularities = list(set(popularities))
476 popularities.sort()
477
478 flat_ordered = []
479 for popularity in popularities:
480 paths = paths_by_popularity[popularity]
481 paths.sort(key=package_name)
482
483 flat_ordered.extend(reversed(paths))
484 return list(reversed(flat_ordered))
485
486
487class TestOrderByPopularity(unittest.TestCase):
488 def test_returns_in_order(self):
489 self.assertEqual(
490 order_by_popularity({
491 "/nix/store/foo": 1,
492 "/nix/store/bar": 1,
493 "/nix/store/baz": 2,
494 "/nix/store/tux": 2,
495 }),
496 [
497 "/nix/store/baz",
498 "/nix/store/tux",
499 "/nix/store/bar",
500 "/nix/store/foo"
501 ]
502 )
503
504def package_name(path):
505 parts = path.split('-')
506 start = parts.pop(0)
507 # don't throw away any data, so the order is always the same.
508 # even in cases where only the hash at the start has changed.
509 parts.append(start)
510 return '-'.join(parts)
511
512def main():
513 filename = sys.argv[1]
514 key = sys.argv[2]
515
516 debug("Loading from {}", filename)
517 with open(filename) as f:
518 data = json.load(f)
519
520 # Data comes in as:
521 # [
522 # { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
523 # { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
524 # { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
525 # { path: /nix/store/tux, references: [ /nix/store/tux ] }
526 # ]
527 #
528 # and we want to get out a list of paths ordered by how universally,
529 # important they are, ie: tux is referenced by every path, transitively
530 # so it should be #1
531 #
532 # [
533 # /nix/store/tux,
534 # /nix/store/baz,
535 # /nix/store/bar,
536 # /nix/store/foo,
537 # ]
538 graph = data[key]
539
540 debug("Finding roots from {}", key)
541 roots = find_roots(graph);
542 debug("Making lookup for {}", key)
543 lookup = make_lookup(graph)
544
545 full_graph = {}
546 for root in roots:
547 debug("Making full graph for {}", root)
548 full_graph[root] = make_graph_segment_from_root(root, lookup)
549
550 debug("Running contest")
551 contest = graph_popularity_contest(full_graph)
552 debug("Ordering by popularity")
553 ordered = order_by_popularity(contest)
554 debug("Checking for missing paths")
555 missing = []
556 for path in all_paths(graph):
557 if path not in ordered:
558 missing.append(path)
559
560 ordered.extend(missing)
561 print("\n".join(ordered))
562
563if "--test" in sys.argv:
564 # Don't pass --test otherwise unittest gets mad
565 unittest.main(argv = [f for f in sys.argv if f != "--test" ])
566else:
567 main()