pkgs/by-name/fl/flattenReferencesGraph/src/flatten_references_graph/popularity_contest.py at devShellTools-shell · tjh.dev/nixpkgs

tjh.dev / nixpkgs
Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
nixpkgs / pkgs / by-name / fl / flattenReferencesGraph / src / flatten_references_graph / popularity_contest.py
at devShellTools-shell 397 lines 11 kB view raw
  1# Using a simple algorithm, convert the references to a path in to a
  2# sorted list of dependent paths based on how often they're referenced
  3# and how deep in the tree they live. Equally-"popular" paths are then
  4# sorted by name.
  5#
  6# The existing writeClosure prints the paths in a simple ascii-based sorting of the paths.
  7#
  8# Sorting the paths by graph improves the chances that the difference
  9# between two builds appear near the end of the list, instead of near
 10# the beginning. This makes a difference for Nix builds which export a
 11# closure for another program to consume, if that program implements its
 12# own level of binary diffing.
 13#
 14# For an example, Docker Images. If each store path is a separate layer
 15# then Docker Images can be very efficiently transfered between systems,
 16# and we get very good cache reuse between images built with the same
 17# version of Nixpkgs. However, since Docker only reliably supports a
 18# small number of layers (42) it is important to pick the individual
 19# layers carefully. By storing very popular store paths in the first 40
 20# layers, we improve the chances that the next Docker image will share
 21# many of those layers.*
 22#
 23# Given the dependency tree:
 24#
 25#     A - B - C - D -\
 26#      \   \   \      \
 27#       \   \   \      \
 28#        \   \ - E ---- F
 29#         \- G
 30#
 31# Nodes which have multiple references are duplicated:
 32#
 33#     A - B - C - D - F
 34#      \   \   \
 35#       \   \   \- E - F
 36#        \   \
 37#         \   \- E - F
 38#          \
 39#           \- G
 40#
 41# Each leaf node is now replaced by a counter defaulted to 1:
 42#
 43#     A - B - C - D - (F:1)
 44#      \   \   \
 45#       \   \   \- E - (F:1)
 46#        \   \
 47#         \   \- E - (F:1)
 48#          \
 49#           \- (G:1)
 50#
 51# Then each leaf counter is merged with its parent node, replacing the
 52# parent node with a counter of 1, and each existing counter being
 53# incremented by 1. That is to say `- D - (F:1)` becomes `- (D:1, F:2)`:
 54#
 55#     A - B - C - (D:1, F:2)
 56#      \   \   \
 57#       \   \   \- (E:1, F:2)
 58#        \   \
 59#         \   \- (E:1, F:2)
 60#          \
 61#           \- (G:1)
 62#
 63# Then each leaf counter is merged with its parent node again, merging
 64# any counters, then incrementing each:
 65#
 66#     A - B - (C:1, D:2, E:2, F:5)
 67#      \   \
 68#       \   \- (E:1, F:2)
 69#        \
 70#         \- (G:1)
 71#
 72# And again:
 73#
 74#     A - (B:1, C:2, D:3, E:4, F:8)
 75#      \
 76#       \- (G:1)
 77#
 78# And again:
 79#
 80#     (A:1, B:2, C:3, D:4, E:5, F:9, G:2)
 81#
 82# and then paths have the following "popularity":
 83#
 84#     A     1
 85#     B     2
 86#     C     3
 87#     D     4
 88#     E     5
 89#     F     9
 90#     G     2
 91#
 92# and the popularity contest would result in the paths being printed as:
 93#
 94#     F
 95#     E
 96#     D
 97#     C
 98#     B
 99#     G
100#     A
101#
102# * Note: People who have used a Dockerfile before assume Docker's
103# Layers are inherently ordered. However, this is not true -- Docker
104# layers are content-addressable and are not explicitly layered until
105# they are composed in to an Image.
106
107import igraph as igraph
108
109from collections import defaultdict
110from operator import eq
111from toolz import curried as tlz
112from toolz import curry
113
114from .lib import (
115    debug,
116    directed_graph,
117    igraph_to_reference_graph,
118    over,
119    pick_keys,
120    reference_graph_node_keys_to_keep
121)
122
123eq = curry(eq)
124
125pick_keys_to_keep = pick_keys(reference_graph_node_keys_to_keep)
126
127
128# Find paths in the original dataset which are never referenced by
129# any other paths
130def find_roots(closures):
131    debug('closures', closures)
132    roots = []
133
134    for closure in closures:
135        path = closure['path']
136        if not any_refer_to(path, closures):
137            roots.append(path)
138
139    return roots
140
141
142def any_refer_to(path, closures):
143    for closure in closures:
144        if path != closure['path']:
145            if path in closure['references']:
146                return True
147    return False
148
149
150def all_paths(closures):
151    paths = []
152    for closure in closures:
153        paths.append(closure['path'])
154        paths.extend(closure['references'])
155    paths.sort()
156    return list(set(paths))
157
158
159# Convert:
160#
161# [
162#    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },      # noqa: E501
163#    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
164#    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
165#    { path: /nix/store/tux, references: [ /nix/store/tux ] }
166#  ]
167#
168# To:
169#    {
170#      /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
171#      /nix/store/bar: [ /nix/store/baz ],
172#      /nix/store/baz: [ /nix/store/tux ] },
173#      /nix/store/tux: [ ]
174#    }
175#
176# Note that it drops self-references to avoid loops.
177
178
179def make_lookup(closures):
180    return {
181        # remove self reference
182        node["path"]: over("references", tlz.remove(eq(node["path"])), node)
183        for node in closures
184    }
185
186
187# Convert:
188#
189# /nix/store/foo with
190#  {
191#    /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
192#    /nix/store/bar: [ /nix/store/baz ],
193#    /nix/store/baz: [ /nix/store/tux ] },
194#    /nix/store/tux: [ ]
195#  }
196#
197# To:
198#
199# {
200#   /nix/store/bar: {
201#                    /nix/store/baz: {
202#                                     /nix/store/tux: {}
203#                    }
204#   },
205#   /nix/store/baz: {
206#                   /nix/store/tux: {}
207#   }
208# }
209
210
211def make_graph_segment_from_root(subgraphs_cache, root, lookup):
212    children = {}
213    for ref in lookup[root]:
214        # make_graph_segment_from_root is a pure function, and will
215        # always return the same result based on a given input. Thus,
216        # cache computation.
217        #
218        # Python's assignment will use a pointer, preventing memory
219        # bloat for large graphs.
220        if ref not in subgraphs_cache:
221            debug("Subgraph Cache miss on {}".format(ref))
222            subgraphs_cache[ref] = make_graph_segment_from_root(
223                subgraphs_cache, ref, lookup
224            )
225        else:
226            debug("Subgraph Cache hit on {}".format(ref))
227        children[ref] = subgraphs_cache[ref]
228    return children
229
230
231# Convert a graph segment in to a popularity-counted dictionary:
232#
233# From:
234# {
235#    /nix/store/foo: {
236#                      /nix/store/bar: {
237#                                        /nix/store/baz: {
238#                                                           /nix/store/tux: {}
239#                                        }
240#                      }
241#                      /nix/store/baz: {
242#                                         /nix/store/tux: {}
243#                      }
244#    }
245# }
246#
247# to:
248# [
249#   /nix/store/foo: 1
250#   /nix/store/bar: 2
251#   /nix/store/baz: 4
252#   /nix/store/tux: 6
253# ]
254
255def graph_popularity_contest(popularity_cache, full_graph):
256    popularity = defaultdict(int)
257    for path, subgraph in full_graph.items():
258        popularity[path] += 1
259        # graph_popularity_contest is a pure function, and will
260        # always return the same result based on a given input. Thus,
261        # cache computation.
262        #
263        # Python's assignment will use a pointer, preventing memory
264        # bloat for large graphs.
265        if path not in popularity_cache:
266            debug("Popularity Cache miss on", path)
267            popularity_cache[path] = graph_popularity_contest(
268                popularity_cache, subgraph
269            )
270        else:
271            debug("Popularity Cache hit on", path)
272
273        subcontest = popularity_cache[path]
274        for subpath, subpopularity in subcontest.items():
275            debug("Calculating popularity for", subpath)
276            popularity[subpath] += subpopularity + 1
277
278    return popularity
279
280# Emit a list of packages by popularity, most first:
281#
282# From:
283# [
284#   /nix/store/foo: 1
285#   /nix/store/bar: 1
286#   /nix/store/baz: 2
287#   /nix/store/tux: 2
288# ]
289#
290# To:
291# [ /nix/store/baz /nix/store/tux /nix/store/bar /nix/store/foo ]
292
293
294def order_by_popularity(paths):
295    paths_by_popularity = defaultdict(list)
296    popularities = []
297    for path, popularity in paths.items():
298        popularities.append(popularity)
299        paths_by_popularity[popularity].append(path)
300
301    popularities = sorted(set(popularities))
302
303    flat_ordered = []
304    for popularity in popularities:
305        paths = paths_by_popularity[popularity]
306        paths.sort(key=package_name)
307
308        flat_ordered.extend(reversed(paths))
309    return list(reversed(flat_ordered))
310
311
312def package_name(path):
313    parts = path.split('-')
314    start = parts.pop(0)
315    # don't throw away any data, so the order is always the same.
316    # even in cases where only the hash at the start has changed.
317    parts.append(start)
318    return '-'.join(parts)
319
320
321@curry
322def popularity_contest(graph):
323    # Data comes in as an igraph directed graph or in the format produced
324    # by nix's exportReferencesGraph:
325    # [
326    #    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },  # noqa: E501
327    #    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },                  # noqa: E501
328    #    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },                  # noqa: E501
329    #    { path: /nix/store/tux, references: [ /nix/store/tux ] }
330    #  ]
331    #
332    # We want to get out a list of paths ordered by how universally,
333    # important they are, ie: tux is referenced by every path, transitively
334    # so it should be #1
335    #
336    # [
337    #   /nix/store/tux,
338    #   /nix/store/baz,
339    #   /nix/store/bar,
340    #   /nix/store/foo,
341    # ]
342    #
343    # NOTE: the output is actually a list of igraph graphs with a single vertex
344    # with v["name"] == path, and some properties (defined in
345    # reference_graph_node_keys_to_keep) from the nodes of the input graph
346    # copied as vertex attributes.
347    debug('graph', graph)
348
349    if isinstance(graph, igraph.Graph):
350        graph = igraph_to_reference_graph(graph)
351
352    debug("Finding roots")
353    roots = find_roots(graph)
354
355    debug("Making lookup")
356    lookup = make_lookup(graph)
357
358    full_graph = {}
359    subgraphs_cache = {}
360    for root in roots:
361        debug("Making full graph for", root)
362        full_graph[root] = make_graph_segment_from_root(
363            subgraphs_cache,
364            root,
365            tlz.valmap(
366                tlz.get("references"),
367                lookup
368            )
369        )
370
371    debug("Running contest")
372    contest = graph_popularity_contest({}, full_graph)
373
374    debug("Ordering by popularity")
375    ordered = order_by_popularity(contest)
376
377    debug("Checking for missing paths")
378    missing = []
379
380    for path in all_paths(graph):
381        if path not in ordered:
382            missing.append(path)
383
384    ordered.extend(missing)
385
386    return map(
387        # Turn each path into a graph with 1 vertex.
388        lambda path: directed_graph(
389            # No edges
390            [],
391            # One vertex, with name=path
392            [path],
393            # Setting desired attributes on the vertex.
394            [(path, pick_keys_to_keep(lookup[path]))]
395        ),
396        ordered
397    )