Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
at devShellTools-shell 397 lines 11 kB view raw
1# Using a simple algorithm, convert the references to a path in to a 2# sorted list of dependent paths based on how often they're referenced 3# and how deep in the tree they live. Equally-"popular" paths are then 4# sorted by name. 5# 6# The existing writeClosure prints the paths in a simple ascii-based sorting of the paths. 7# 8# Sorting the paths by graph improves the chances that the difference 9# between two builds appear near the end of the list, instead of near 10# the beginning. This makes a difference for Nix builds which export a 11# closure for another program to consume, if that program implements its 12# own level of binary diffing. 13# 14# For an example, Docker Images. If each store path is a separate layer 15# then Docker Images can be very efficiently transfered between systems, 16# and we get very good cache reuse between images built with the same 17# version of Nixpkgs. However, since Docker only reliably supports a 18# small number of layers (42) it is important to pick the individual 19# layers carefully. By storing very popular store paths in the first 40 20# layers, we improve the chances that the next Docker image will share 21# many of those layers.* 22# 23# Given the dependency tree: 24# 25# A - B - C - D -\ 26# \ \ \ \ 27# \ \ \ \ 28# \ \ - E ---- F 29# \- G 30# 31# Nodes which have multiple references are duplicated: 32# 33# A - B - C - D - F 34# \ \ \ 35# \ \ \- E - F 36# \ \ 37# \ \- E - F 38# \ 39# \- G 40# 41# Each leaf node is now replaced by a counter defaulted to 1: 42# 43# A - B - C - D - (F:1) 44# \ \ \ 45# \ \ \- E - (F:1) 46# \ \ 47# \ \- E - (F:1) 48# \ 49# \- (G:1) 50# 51# Then each leaf counter is merged with its parent node, replacing the 52# parent node with a counter of 1, and each existing counter being 53# incremented by 1. That is to say `- D - (F:1)` becomes `- (D:1, F:2)`: 54# 55# A - B - C - (D:1, F:2) 56# \ \ \ 57# \ \ \- (E:1, F:2) 58# \ \ 59# \ \- (E:1, F:2) 60# \ 61# \- (G:1) 62# 63# Then each leaf counter is merged with its parent node again, merging 64# any counters, then incrementing each: 65# 66# A - B - (C:1, D:2, E:2, F:5) 67# \ \ 68# \ \- (E:1, F:2) 69# \ 70# \- (G:1) 71# 72# And again: 73# 74# A - (B:1, C:2, D:3, E:4, F:8) 75# \ 76# \- (G:1) 77# 78# And again: 79# 80# (A:1, B:2, C:3, D:4, E:5, F:9, G:2) 81# 82# and then paths have the following "popularity": 83# 84# A 1 85# B 2 86# C 3 87# D 4 88# E 5 89# F 9 90# G 2 91# 92# and the popularity contest would result in the paths being printed as: 93# 94# F 95# E 96# D 97# C 98# B 99# G 100# A 101# 102# * Note: People who have used a Dockerfile before assume Docker's 103# Layers are inherently ordered. However, this is not true -- Docker 104# layers are content-addressable and are not explicitly layered until 105# they are composed in to an Image. 106 107import igraph as igraph 108 109from collections import defaultdict 110from operator import eq 111from toolz import curried as tlz 112from toolz import curry 113 114from .lib import ( 115 debug, 116 directed_graph, 117 igraph_to_reference_graph, 118 over, 119 pick_keys, 120 reference_graph_node_keys_to_keep 121) 122 123eq = curry(eq) 124 125pick_keys_to_keep = pick_keys(reference_graph_node_keys_to_keep) 126 127 128# Find paths in the original dataset which are never referenced by 129# any other paths 130def find_roots(closures): 131 debug('closures', closures) 132 roots = [] 133 134 for closure in closures: 135 path = closure['path'] 136 if not any_refer_to(path, closures): 137 roots.append(path) 138 139 return roots 140 141 142def any_refer_to(path, closures): 143 for closure in closures: 144 if path != closure['path']: 145 if path in closure['references']: 146 return True 147 return False 148 149 150def all_paths(closures): 151 paths = [] 152 for closure in closures: 153 paths.append(closure['path']) 154 paths.extend(closure['references']) 155 paths.sort() 156 return list(set(paths)) 157 158 159# Convert: 160# 161# [ 162# { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] }, # noqa: E501 163# { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] }, 164# { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] }, 165# { path: /nix/store/tux, references: [ /nix/store/tux ] } 166# ] 167# 168# To: 169# { 170# /nix/store/foo: [ /nix/store/bar, /nix/store/baz ], 171# /nix/store/bar: [ /nix/store/baz ], 172# /nix/store/baz: [ /nix/store/tux ] }, 173# /nix/store/tux: [ ] 174# } 175# 176# Note that it drops self-references to avoid loops. 177 178 179def make_lookup(closures): 180 return { 181 # remove self reference 182 node["path"]: over("references", tlz.remove(eq(node["path"])), node) 183 for node in closures 184 } 185 186 187# Convert: 188# 189# /nix/store/foo with 190# { 191# /nix/store/foo: [ /nix/store/bar, /nix/store/baz ], 192# /nix/store/bar: [ /nix/store/baz ], 193# /nix/store/baz: [ /nix/store/tux ] }, 194# /nix/store/tux: [ ] 195# } 196# 197# To: 198# 199# { 200# /nix/store/bar: { 201# /nix/store/baz: { 202# /nix/store/tux: {} 203# } 204# }, 205# /nix/store/baz: { 206# /nix/store/tux: {} 207# } 208# } 209 210 211def make_graph_segment_from_root(subgraphs_cache, root, lookup): 212 children = {} 213 for ref in lookup[root]: 214 # make_graph_segment_from_root is a pure function, and will 215 # always return the same result based on a given input. Thus, 216 # cache computation. 217 # 218 # Python's assignment will use a pointer, preventing memory 219 # bloat for large graphs. 220 if ref not in subgraphs_cache: 221 debug("Subgraph Cache miss on {}".format(ref)) 222 subgraphs_cache[ref] = make_graph_segment_from_root( 223 subgraphs_cache, ref, lookup 224 ) 225 else: 226 debug("Subgraph Cache hit on {}".format(ref)) 227 children[ref] = subgraphs_cache[ref] 228 return children 229 230 231# Convert a graph segment in to a popularity-counted dictionary: 232# 233# From: 234# { 235# /nix/store/foo: { 236# /nix/store/bar: { 237# /nix/store/baz: { 238# /nix/store/tux: {} 239# } 240# } 241# /nix/store/baz: { 242# /nix/store/tux: {} 243# } 244# } 245# } 246# 247# to: 248# [ 249# /nix/store/foo: 1 250# /nix/store/bar: 2 251# /nix/store/baz: 4 252# /nix/store/tux: 6 253# ] 254 255def graph_popularity_contest(popularity_cache, full_graph): 256 popularity = defaultdict(int) 257 for path, subgraph in full_graph.items(): 258 popularity[path] += 1 259 # graph_popularity_contest is a pure function, and will 260 # always return the same result based on a given input. Thus, 261 # cache computation. 262 # 263 # Python's assignment will use a pointer, preventing memory 264 # bloat for large graphs. 265 if path not in popularity_cache: 266 debug("Popularity Cache miss on", path) 267 popularity_cache[path] = graph_popularity_contest( 268 popularity_cache, subgraph 269 ) 270 else: 271 debug("Popularity Cache hit on", path) 272 273 subcontest = popularity_cache[path] 274 for subpath, subpopularity in subcontest.items(): 275 debug("Calculating popularity for", subpath) 276 popularity[subpath] += subpopularity + 1 277 278 return popularity 279 280# Emit a list of packages by popularity, most first: 281# 282# From: 283# [ 284# /nix/store/foo: 1 285# /nix/store/bar: 1 286# /nix/store/baz: 2 287# /nix/store/tux: 2 288# ] 289# 290# To: 291# [ /nix/store/baz /nix/store/tux /nix/store/bar /nix/store/foo ] 292 293 294def order_by_popularity(paths): 295 paths_by_popularity = defaultdict(list) 296 popularities = [] 297 for path, popularity in paths.items(): 298 popularities.append(popularity) 299 paths_by_popularity[popularity].append(path) 300 301 popularities = sorted(set(popularities)) 302 303 flat_ordered = [] 304 for popularity in popularities: 305 paths = paths_by_popularity[popularity] 306 paths.sort(key=package_name) 307 308 flat_ordered.extend(reversed(paths)) 309 return list(reversed(flat_ordered)) 310 311 312def package_name(path): 313 parts = path.split('-') 314 start = parts.pop(0) 315 # don't throw away any data, so the order is always the same. 316 # even in cases where only the hash at the start has changed. 317 parts.append(start) 318 return '-'.join(parts) 319 320 321@curry 322def popularity_contest(graph): 323 # Data comes in as an igraph directed graph or in the format produced 324 # by nix's exportReferencesGraph: 325 # [ 326 # { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] }, # noqa: E501 327 # { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] }, # noqa: E501 328 # { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] }, # noqa: E501 329 # { path: /nix/store/tux, references: [ /nix/store/tux ] } 330 # ] 331 # 332 # We want to get out a list of paths ordered by how universally, 333 # important they are, ie: tux is referenced by every path, transitively 334 # so it should be #1 335 # 336 # [ 337 # /nix/store/tux, 338 # /nix/store/baz, 339 # /nix/store/bar, 340 # /nix/store/foo, 341 # ] 342 # 343 # NOTE: the output is actually a list of igraph graphs with a single vertex 344 # with v["name"] == path, and some properties (defined in 345 # reference_graph_node_keys_to_keep) from the nodes of the input graph 346 # copied as vertex attributes. 347 debug('graph', graph) 348 349 if isinstance(graph, igraph.Graph): 350 graph = igraph_to_reference_graph(graph) 351 352 debug("Finding roots") 353 roots = find_roots(graph) 354 355 debug("Making lookup") 356 lookup = make_lookup(graph) 357 358 full_graph = {} 359 subgraphs_cache = {} 360 for root in roots: 361 debug("Making full graph for", root) 362 full_graph[root] = make_graph_segment_from_root( 363 subgraphs_cache, 364 root, 365 tlz.valmap( 366 tlz.get("references"), 367 lookup 368 ) 369 ) 370 371 debug("Running contest") 372 contest = graph_popularity_contest({}, full_graph) 373 374 debug("Ordering by popularity") 375 ordered = order_by_popularity(contest) 376 377 debug("Checking for missing paths") 378 missing = [] 379 380 for path in all_paths(graph): 381 if path not in ordered: 382 missing.append(path) 383 384 ordered.extend(missing) 385 386 return map( 387 # Turn each path into a graph with 1 vertex. 388 lambda path: directed_graph( 389 # No edges 390 [], 391 # One vertex, with name=path 392 [path], 393 # Setting desired attributes on the vertex. 394 [(path, pick_keys_to_keep(lookup[path]))] 395 ), 396 ordered 397 )