pkgs/build-support/references-by-popularity/closure-graph.py at python-updates

Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
nixpkgs / pkgs / build-support / references-by-popularity / closure-graph.py
at python-updates 567 lines 16 kB view raw
wrap content
  1# IMPORTANT: Making changes?
  2#
  3# Validate your changes with python3 ./closure-graph.py --test
  4
  5
  6# Using a simple algorithm, convert the references to a path in to a
  7# sorted list of dependent paths based on how often they're referenced
  8# and how deep in the tree they live. Equally-"popular" paths are then
  9# sorted by name.
 10#
 11# The existing writeClosure prints the paths in a simple ascii-based
 12# sorting of the paths.
 13#
 14# Sorting the paths by graph improves the chances that the difference
 15# between two builds appear near the end of the list, instead of near
 16# the beginning. This makes a difference for Nix builds which export a
 17# closure for another program to consume, if that program implements its
 18# own level of binary diffing.
 19#
 20# For an example, Docker Images. If each store path is a separate layer
 21# then Docker Images can be very efficiently transfered between systems,
 22# and we get very good cache reuse between images built with the same
 23# version of Nixpkgs. However, since Docker only reliably supports a
 24# small number of layers (42) it is important to pick the individual
 25# layers carefully. By storing very popular store paths in the first 40
 26# layers, we improve the chances that the next Docker image will share
 27# many of those layers.*
 28#
 29# Given the dependency tree:
 30#
 31#     A - B - C - D -\
 32#      \   \   \      \
 33#       \   \   \      \
 34#        \   \ - E ---- F
 35#         \- G
 36#
 37# Nodes which have multiple references are duplicated:
 38#
 39#     A - B - C - D - F
 40#      \   \   \
 41#       \   \   \- E - F
 42#        \   \
 43#         \   \- E - F
 44#          \
 45#           \- G
 46#
 47# Each leaf node is now replaced by a counter defaulted to 1:
 48#
 49#     A - B - C - D - (F:1)
 50#      \   \   \
 51#       \   \   \- E - (F:1)
 52#        \   \
 53#         \   \- E - (F:1)
 54#          \
 55#           \- (G:1)
 56#
 57# Then each leaf counter is merged with its parent node, replacing the
 58# parent node with a counter of 1, and each existing counter being
 59# incremented by 1. That is to say `- D - (F:1)` becomes `- (D:1, F:2)`:
 60#
 61#     A - B - C - (D:1, F:2)
 62#      \   \   \
 63#       \   \   \- (E:1, F:2)
 64#        \   \
 65#         \   \- (E:1, F:2)
 66#          \
 67#           \- (G:1)
 68#
 69# Then each leaf counter is merged with its parent node again, merging
 70# any counters, then incrementing each:
 71#
 72#     A - B - (C:1, D:2, E:2, F:5)
 73#      \   \
 74#       \   \- (E:1, F:2)
 75#        \
 76#         \- (G:1)
 77#
 78# And again:
 79#
 80#     A - (B:1, C:2, D:3, E:4, F:8)
 81#      \
 82#       \- (G:1)
 83#
 84# And again:
 85#
 86#     (A:1, B:2, C:3, D:4, E:5, F:9, G:2)
 87#
 88# and then paths have the following "popularity":
 89#
 90#     A     1
 91#     B     2
 92#     C     3
 93#     D     4
 94#     E     5
 95#     F     9
 96#     G     2
 97#
 98# and the popularity contest would result in the paths being printed as:
 99#
100#     F
101#     E
102#     D
103#     C
104#     B
105#     G
106#     A
107#
108# * Note: People who have used a Dockerfile before assume Docker's
109# Layers are inherently ordered. However, this is not true -- Docker
110# layers are content-addressable and are not explicitly layered until
111# they are composed in to an Image.
112
113import sys
114import json
115import unittest
116
117from pprint import pprint
118from collections import defaultdict
119
120
121def debug(msg, *args, **kwargs):
122    if False:
123        print(
124            "DEBUG: {}".format(
125                msg.format(*args, **kwargs)
126            ),
127            file=sys.stderr
128        )
129
130
131# Find paths in the original dataset which are never referenced by
132# any other paths
133def find_roots(closures):
134    roots = [];
135
136    for closure in closures:
137        path = closure['path']
138        if not any_refer_to(path, closures):
139            roots.append(path)
140
141    return roots
142
143class TestFindRoots(unittest.TestCase):
144    def test_find_roots(self):
145        self.assertCountEqual(
146            find_roots([
147                {
148                    "path": "/nix/store/foo",
149                    "references": [
150                        "/nix/store/foo",
151                        "/nix/store/bar"
152                    ]
153                },
154                {
155                    "path": "/nix/store/bar",
156                    "references": [
157                        "/nix/store/bar",
158                        "/nix/store/tux"
159                    ]
160                },
161                {
162                    "path": "/nix/store/hello",
163                    "references": [
164                    ]
165                }
166            ]),
167            ["/nix/store/foo", "/nix/store/hello"]
168        )
169
170
171def any_refer_to(path, closures):
172    for closure in closures:
173        if path != closure['path']:
174            if path in closure['references']:
175                return True
176    return False
177
178class TestAnyReferTo(unittest.TestCase):
179    def test_has_references(self):
180        self.assertTrue(
181            any_refer_to(
182                "/nix/store/bar",
183                [
184                    {
185                        "path": "/nix/store/foo",
186                        "references": [
187                            "/nix/store/bar"
188                        ]
189                    },
190                ]
191            ),
192        )
193    def test_no_references(self):
194        self.assertFalse(
195            any_refer_to(
196                "/nix/store/foo",
197                [
198                    {
199                        "path": "/nix/store/foo",
200                        "references": [
201                            "/nix/store/foo",
202                            "/nix/store/bar"
203                        ]
204                    },
205                ]
206            ),
207        )
208
209def all_paths(closures):
210    paths = []
211    for closure in closures:
212        paths.append(closure['path'])
213        paths.extend(closure['references'])
214    paths.sort()
215    return list(set(paths))
216
217
218class TestAllPaths(unittest.TestCase):
219    def test_returns_all_paths(self):
220        self.assertCountEqual(
221            all_paths([
222                {
223                    "path": "/nix/store/foo",
224                    "references": [
225                        "/nix/store/foo",
226                        "/nix/store/bar"
227                    ]
228                },
229                {
230                    "path": "/nix/store/bar",
231                    "references": [
232                        "/nix/store/bar",
233                        "/nix/store/tux"
234                    ]
235                },
236                {
237                    "path": "/nix/store/hello",
238                    "references": [
239                    ]
240                }
241            ]),
242            ["/nix/store/foo", "/nix/store/bar", "/nix/store/hello", "/nix/store/tux",]
243        )
244    def test_no_references(self):
245        self.assertFalse(
246            any_refer_to(
247                "/nix/store/foo",
248                [
249                    {
250                        "path": "/nix/store/foo",
251                        "references": [
252                            "/nix/store/foo",
253                            "/nix/store/bar"
254                        ]
255                    },
256                ]
257            ),
258        )
259
260# Convert:
261#
262# [
263#    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
264#    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
265#    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
266#    { path: /nix/store/tux, references: [ /nix/store/tux ] }
267#  ]
268#
269# To:
270#    {
271#      /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
272#      /nix/store/bar: [ /nix/store/baz ],
273#      /nix/store/baz: [ /nix/store/tux ] },
274#      /nix/store/tux: [ ]
275#    }
276#
277# Note that it drops self-references to avoid loops.
278def make_lookup(closures):
279    lookup = {}
280
281    for closure in closures:
282        # paths often self-refer
283        nonreferential_paths = [ref for ref in closure['references'] if ref != closure['path']]
284        lookup[closure['path']] = nonreferential_paths
285
286    return lookup
287
288class TestMakeLookup(unittest.TestCase):
289    def test_returns_lookp(self):
290        self.assertDictEqual(
291            make_lookup([
292                {
293                    "path": "/nix/store/foo",
294                    "references": [
295                        "/nix/store/foo",
296                        "/nix/store/bar"
297                    ]
298                },
299                {
300                    "path": "/nix/store/bar",
301                    "references": [
302                        "/nix/store/bar",
303                        "/nix/store/tux"
304                    ]
305                },
306                {
307                    "path": "/nix/store/hello",
308                    "references": [
309                    ]
310                }
311            ]),
312            {
313                "/nix/store/foo": [ "/nix/store/bar" ],
314                "/nix/store/bar": [ "/nix/store/tux" ],
315                "/nix/store/hello": [ ],
316            }
317        )
318
319# Convert:
320#
321# /nix/store/foo with
322#  {
323#    /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
324#    /nix/store/bar: [ /nix/store/baz ],
325#    /nix/store/baz: [ /nix/store/tux ] },
326#    /nix/store/tux: [ ]
327#  }
328#
329# To:
330#
331# {
332#   /nix/store/bar: {
333#                    /nix/store/baz: {
334#                                     /nix/store/tux: {}
335#                    }
336#   },
337#   /nix/store/baz: {
338#                   /nix/store/tux: {}
339#   }
340# }
341subgraphs_cache = {}
342def make_graph_segment_from_root(root, lookup):
343    global subgraphs_cache
344    children = {}
345    for ref in lookup[root]:
346        # make_graph_segment_from_root is a pure function, and will
347        # always return the same result based on a given input. Thus,
348        # cache computation.
349        #
350        # Python's assignment will use a pointer, preventing memory
351        # bloat for large graphs.
352        if ref not in subgraphs_cache:
353            debug("Subgraph Cache miss on {}".format(ref))
354            subgraphs_cache[ref] = make_graph_segment_from_root(ref, lookup)
355        else:
356            debug("Subgraph Cache hit on {}".format(ref))
357        children[ref] = subgraphs_cache[ref]
358    return children
359
360class TestMakeGraphSegmentFromRoot(unittest.TestCase):
361    def test_returns_graph(self):
362        self.assertDictEqual(
363            make_graph_segment_from_root("/nix/store/foo", {
364                "/nix/store/foo": [ "/nix/store/bar" ],
365                "/nix/store/bar": [ "/nix/store/tux" ],
366                "/nix/store/tux": [ ],
367                "/nix/store/hello": [ ],
368            }),
369            {
370                "/nix/store/bar": {
371                    "/nix/store/tux": {}
372                }
373            }
374        )
375    def test_returns_graph_tiny(self):
376        self.assertDictEqual(
377            make_graph_segment_from_root("/nix/store/tux", {
378                "/nix/store/foo": [ "/nix/store/bar" ],
379                "/nix/store/bar": [ "/nix/store/tux" ],
380                "/nix/store/tux": [ ],
381            }),
382            {}
383        )
384
385# Convert a graph segment in to a popularity-counted dictionary:
386#
387# From:
388# {
389#    /nix/store/foo: {
390#                      /nix/store/bar: {
391#                                        /nix/store/baz: {
392#                                                           /nix/store/tux: {}
393#                                        }
394#                      }
395#                      /nix/store/baz: {
396#                                         /nix/store/tux: {}
397#                      }
398#    }
399# }
400#
401# to:
402# [
403#   /nix/store/foo: 1
404#   /nix/store/bar: 2
405#   /nix/store/baz: 4
406#   /nix/store/tux: 6
407# ]
408popularity_cache = {}
409def graph_popularity_contest(full_graph):
410    global popularity_cache
411    popularity = defaultdict(int)
412    for path, subgraph in full_graph.items():
413        popularity[path] += 1
414        # graph_popularity_contest is a pure function, and will
415        # always return the same result based on a given input. Thus,
416        # cache computation.
417        #
418        # Python's assignment will use a pointer, preventing memory
419        # bloat for large graphs.
420        if path not in popularity_cache:
421            debug("Popularity Cache miss on {}", path)
422            popularity_cache[path] = graph_popularity_contest(subgraph)
423        else:
424            debug("Popularity Cache hit on {}", path)
425
426        subcontest = popularity_cache[path]
427        for subpath, subpopularity in subcontest.items():
428            debug("Calculating popularity for {}", subpath)
429            popularity[subpath] += subpopularity + 1
430
431    return popularity
432
433class TestGraphPopularityContest(unittest.TestCase):
434    def test_counts_popularity(self):
435        self.assertDictEqual(
436            graph_popularity_contest({
437                "/nix/store/foo": {
438                    "/nix/store/bar": {
439                        "/nix/store/baz": {
440                            "/nix/store/tux": {}
441                        }
442                    },
443                    "/nix/store/baz": {
444                        "/nix/store/tux": {}
445                    }
446                }
447            }),
448            {
449                   "/nix/store/foo": 1,
450                   "/nix/store/bar": 2,
451                   "/nix/store/baz": 4,
452                   "/nix/store/tux": 6,
453            }
454        )
455
456# Emit a list of packages by popularity, most first:
457#
458# From:
459# [
460#   /nix/store/foo: 1
461#   /nix/store/bar: 1
462#   /nix/store/baz: 2
463#   /nix/store/tux: 2
464# ]
465#
466# To:
467# [ /nix/store/baz /nix/store/tux /nix/store/bar /nix/store/foo ]
468def order_by_popularity(paths):
469    paths_by_popularity = defaultdict(list)
470    popularities = []
471    for path, popularity in paths.items():
472        popularities.append(popularity)
473        paths_by_popularity[popularity].append(path)
474
475    popularities = list(set(popularities))
476    popularities.sort()
477
478    flat_ordered = []
479    for popularity in popularities:
480        paths = paths_by_popularity[popularity]
481        paths.sort(key=package_name)
482
483        flat_ordered.extend(reversed(paths))
484    return list(reversed(flat_ordered))
485
486
487class TestOrderByPopularity(unittest.TestCase):
488    def test_returns_in_order(self):
489        self.assertEqual(
490            order_by_popularity({
491                   "/nix/store/foo": 1,
492                   "/nix/store/bar": 1,
493                   "/nix/store/baz": 2,
494                   "/nix/store/tux": 2,
495            }),
496            [
497                "/nix/store/baz",
498                "/nix/store/tux",
499                "/nix/store/bar",
500                "/nix/store/foo"
501            ]
502        )
503
504def package_name(path):
505    parts = path.split('-')
506    start = parts.pop(0)
507    # don't throw away any data, so the order is always the same.
508    # even in cases where only the hash at the start has changed.
509    parts.append(start)
510    return '-'.join(parts)
511
512def main():
513    filename = sys.argv[1]
514    key = sys.argv[2]
515
516    debug("Loading from {}", filename)
517    with open(filename) as f:
518        data = json.load(f)
519
520    # Data comes in as:
521    # [
522    #    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
523    #    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
524    #    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
525    #    { path: /nix/store/tux, references: [ /nix/store/tux ] }
526    #  ]
527    #
528    # and we want to get out a list of paths ordered by how universally,
529    # important they are, ie: tux is referenced by every path, transitively
530    # so it should be #1
531    #
532    # [
533    #   /nix/store/tux,
534    #   /nix/store/baz,
535    #   /nix/store/bar,
536    #   /nix/store/foo,
537    # ]
538    graph = data[key]
539
540    debug("Finding roots from {}", key)
541    roots = find_roots(graph);
542    debug("Making lookup for {}", key)
543    lookup = make_lookup(graph)
544
545    full_graph = {}
546    for root in roots:
547        debug("Making full graph for {}", root)
548        full_graph[root] = make_graph_segment_from_root(root, lookup)
549
550    debug("Running contest")
551    contest = graph_popularity_contest(full_graph)
552    debug("Ordering by popularity")
553    ordered = order_by_popularity(contest)
554    debug("Checking for missing paths")
555    missing = []
556    for path in all_paths(graph):
557        if path not in ordered:
558            missing.append(path)
559
560    ordered.extend(missing)
561    print("\n".join(ordered))
562
563if "--test" in sys.argv:
564    # Don't pass --test otherwise unittest gets mad
565    unittest.main(argv = [f for f in sys.argv if f != "--test" ])
566else:
567    main()