Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
at python-updates 567 lines 16 kB view raw
1# IMPORTANT: Making changes? 2# 3# Validate your changes with python3 ./closure-graph.py --test 4 5 6# Using a simple algorithm, convert the references to a path in to a 7# sorted list of dependent paths based on how often they're referenced 8# and how deep in the tree they live. Equally-"popular" paths are then 9# sorted by name. 10# 11# The existing writeClosure prints the paths in a simple ascii-based 12# sorting of the paths. 13# 14# Sorting the paths by graph improves the chances that the difference 15# between two builds appear near the end of the list, instead of near 16# the beginning. This makes a difference for Nix builds which export a 17# closure for another program to consume, if that program implements its 18# own level of binary diffing. 19# 20# For an example, Docker Images. If each store path is a separate layer 21# then Docker Images can be very efficiently transfered between systems, 22# and we get very good cache reuse between images built with the same 23# version of Nixpkgs. However, since Docker only reliably supports a 24# small number of layers (42) it is important to pick the individual 25# layers carefully. By storing very popular store paths in the first 40 26# layers, we improve the chances that the next Docker image will share 27# many of those layers.* 28# 29# Given the dependency tree: 30# 31# A - B - C - D -\ 32# \ \ \ \ 33# \ \ \ \ 34# \ \ - E ---- F 35# \- G 36# 37# Nodes which have multiple references are duplicated: 38# 39# A - B - C - D - F 40# \ \ \ 41# \ \ \- E - F 42# \ \ 43# \ \- E - F 44# \ 45# \- G 46# 47# Each leaf node is now replaced by a counter defaulted to 1: 48# 49# A - B - C - D - (F:1) 50# \ \ \ 51# \ \ \- E - (F:1) 52# \ \ 53# \ \- E - (F:1) 54# \ 55# \- (G:1) 56# 57# Then each leaf counter is merged with its parent node, replacing the 58# parent node with a counter of 1, and each existing counter being 59# incremented by 1. That is to say `- D - (F:1)` becomes `- (D:1, F:2)`: 60# 61# A - B - C - (D:1, F:2) 62# \ \ \ 63# \ \ \- (E:1, F:2) 64# \ \ 65# \ \- (E:1, F:2) 66# \ 67# \- (G:1) 68# 69# Then each leaf counter is merged with its parent node again, merging 70# any counters, then incrementing each: 71# 72# A - B - (C:1, D:2, E:2, F:5) 73# \ \ 74# \ \- (E:1, F:2) 75# \ 76# \- (G:1) 77# 78# And again: 79# 80# A - (B:1, C:2, D:3, E:4, F:8) 81# \ 82# \- (G:1) 83# 84# And again: 85# 86# (A:1, B:2, C:3, D:4, E:5, F:9, G:2) 87# 88# and then paths have the following "popularity": 89# 90# A 1 91# B 2 92# C 3 93# D 4 94# E 5 95# F 9 96# G 2 97# 98# and the popularity contest would result in the paths being printed as: 99# 100# F 101# E 102# D 103# C 104# B 105# G 106# A 107# 108# * Note: People who have used a Dockerfile before assume Docker's 109# Layers are inherently ordered. However, this is not true -- Docker 110# layers are content-addressable and are not explicitly layered until 111# they are composed in to an Image. 112 113import sys 114import json 115import unittest 116 117from pprint import pprint 118from collections import defaultdict 119 120 121def debug(msg, *args, **kwargs): 122 if False: 123 print( 124 "DEBUG: {}".format( 125 msg.format(*args, **kwargs) 126 ), 127 file=sys.stderr 128 ) 129 130 131# Find paths in the original dataset which are never referenced by 132# any other paths 133def find_roots(closures): 134 roots = []; 135 136 for closure in closures: 137 path = closure['path'] 138 if not any_refer_to(path, closures): 139 roots.append(path) 140 141 return roots 142 143class TestFindRoots(unittest.TestCase): 144 def test_find_roots(self): 145 self.assertCountEqual( 146 find_roots([ 147 { 148 "path": "/nix/store/foo", 149 "references": [ 150 "/nix/store/foo", 151 "/nix/store/bar" 152 ] 153 }, 154 { 155 "path": "/nix/store/bar", 156 "references": [ 157 "/nix/store/bar", 158 "/nix/store/tux" 159 ] 160 }, 161 { 162 "path": "/nix/store/hello", 163 "references": [ 164 ] 165 } 166 ]), 167 ["/nix/store/foo", "/nix/store/hello"] 168 ) 169 170 171def any_refer_to(path, closures): 172 for closure in closures: 173 if path != closure['path']: 174 if path in closure['references']: 175 return True 176 return False 177 178class TestAnyReferTo(unittest.TestCase): 179 def test_has_references(self): 180 self.assertTrue( 181 any_refer_to( 182 "/nix/store/bar", 183 [ 184 { 185 "path": "/nix/store/foo", 186 "references": [ 187 "/nix/store/bar" 188 ] 189 }, 190 ] 191 ), 192 ) 193 def test_no_references(self): 194 self.assertFalse( 195 any_refer_to( 196 "/nix/store/foo", 197 [ 198 { 199 "path": "/nix/store/foo", 200 "references": [ 201 "/nix/store/foo", 202 "/nix/store/bar" 203 ] 204 }, 205 ] 206 ), 207 ) 208 209def all_paths(closures): 210 paths = [] 211 for closure in closures: 212 paths.append(closure['path']) 213 paths.extend(closure['references']) 214 paths.sort() 215 return list(set(paths)) 216 217 218class TestAllPaths(unittest.TestCase): 219 def test_returns_all_paths(self): 220 self.assertCountEqual( 221 all_paths([ 222 { 223 "path": "/nix/store/foo", 224 "references": [ 225 "/nix/store/foo", 226 "/nix/store/bar" 227 ] 228 }, 229 { 230 "path": "/nix/store/bar", 231 "references": [ 232 "/nix/store/bar", 233 "/nix/store/tux" 234 ] 235 }, 236 { 237 "path": "/nix/store/hello", 238 "references": [ 239 ] 240 } 241 ]), 242 ["/nix/store/foo", "/nix/store/bar", "/nix/store/hello", "/nix/store/tux",] 243 ) 244 def test_no_references(self): 245 self.assertFalse( 246 any_refer_to( 247 "/nix/store/foo", 248 [ 249 { 250 "path": "/nix/store/foo", 251 "references": [ 252 "/nix/store/foo", 253 "/nix/store/bar" 254 ] 255 }, 256 ] 257 ), 258 ) 259 260# Convert: 261# 262# [ 263# { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] }, 264# { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] }, 265# { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] }, 266# { path: /nix/store/tux, references: [ /nix/store/tux ] } 267# ] 268# 269# To: 270# { 271# /nix/store/foo: [ /nix/store/bar, /nix/store/baz ], 272# /nix/store/bar: [ /nix/store/baz ], 273# /nix/store/baz: [ /nix/store/tux ] }, 274# /nix/store/tux: [ ] 275# } 276# 277# Note that it drops self-references to avoid loops. 278def make_lookup(closures): 279 lookup = {} 280 281 for closure in closures: 282 # paths often self-refer 283 nonreferential_paths = [ref for ref in closure['references'] if ref != closure['path']] 284 lookup[closure['path']] = nonreferential_paths 285 286 return lookup 287 288class TestMakeLookup(unittest.TestCase): 289 def test_returns_lookp(self): 290 self.assertDictEqual( 291 make_lookup([ 292 { 293 "path": "/nix/store/foo", 294 "references": [ 295 "/nix/store/foo", 296 "/nix/store/bar" 297 ] 298 }, 299 { 300 "path": "/nix/store/bar", 301 "references": [ 302 "/nix/store/bar", 303 "/nix/store/tux" 304 ] 305 }, 306 { 307 "path": "/nix/store/hello", 308 "references": [ 309 ] 310 } 311 ]), 312 { 313 "/nix/store/foo": [ "/nix/store/bar" ], 314 "/nix/store/bar": [ "/nix/store/tux" ], 315 "/nix/store/hello": [ ], 316 } 317 ) 318 319# Convert: 320# 321# /nix/store/foo with 322# { 323# /nix/store/foo: [ /nix/store/bar, /nix/store/baz ], 324# /nix/store/bar: [ /nix/store/baz ], 325# /nix/store/baz: [ /nix/store/tux ] }, 326# /nix/store/tux: [ ] 327# } 328# 329# To: 330# 331# { 332# /nix/store/bar: { 333# /nix/store/baz: { 334# /nix/store/tux: {} 335# } 336# }, 337# /nix/store/baz: { 338# /nix/store/tux: {} 339# } 340# } 341subgraphs_cache = {} 342def make_graph_segment_from_root(root, lookup): 343 global subgraphs_cache 344 children = {} 345 for ref in lookup[root]: 346 # make_graph_segment_from_root is a pure function, and will 347 # always return the same result based on a given input. Thus, 348 # cache computation. 349 # 350 # Python's assignment will use a pointer, preventing memory 351 # bloat for large graphs. 352 if ref not in subgraphs_cache: 353 debug("Subgraph Cache miss on {}".format(ref)) 354 subgraphs_cache[ref] = make_graph_segment_from_root(ref, lookup) 355 else: 356 debug("Subgraph Cache hit on {}".format(ref)) 357 children[ref] = subgraphs_cache[ref] 358 return children 359 360class TestMakeGraphSegmentFromRoot(unittest.TestCase): 361 def test_returns_graph(self): 362 self.assertDictEqual( 363 make_graph_segment_from_root("/nix/store/foo", { 364 "/nix/store/foo": [ "/nix/store/bar" ], 365 "/nix/store/bar": [ "/nix/store/tux" ], 366 "/nix/store/tux": [ ], 367 "/nix/store/hello": [ ], 368 }), 369 { 370 "/nix/store/bar": { 371 "/nix/store/tux": {} 372 } 373 } 374 ) 375 def test_returns_graph_tiny(self): 376 self.assertDictEqual( 377 make_graph_segment_from_root("/nix/store/tux", { 378 "/nix/store/foo": [ "/nix/store/bar" ], 379 "/nix/store/bar": [ "/nix/store/tux" ], 380 "/nix/store/tux": [ ], 381 }), 382 {} 383 ) 384 385# Convert a graph segment in to a popularity-counted dictionary: 386# 387# From: 388# { 389# /nix/store/foo: { 390# /nix/store/bar: { 391# /nix/store/baz: { 392# /nix/store/tux: {} 393# } 394# } 395# /nix/store/baz: { 396# /nix/store/tux: {} 397# } 398# } 399# } 400# 401# to: 402# [ 403# /nix/store/foo: 1 404# /nix/store/bar: 2 405# /nix/store/baz: 4 406# /nix/store/tux: 6 407# ] 408popularity_cache = {} 409def graph_popularity_contest(full_graph): 410 global popularity_cache 411 popularity = defaultdict(int) 412 for path, subgraph in full_graph.items(): 413 popularity[path] += 1 414 # graph_popularity_contest is a pure function, and will 415 # always return the same result based on a given input. Thus, 416 # cache computation. 417 # 418 # Python's assignment will use a pointer, preventing memory 419 # bloat for large graphs. 420 if path not in popularity_cache: 421 debug("Popularity Cache miss on {}", path) 422 popularity_cache[path] = graph_popularity_contest(subgraph) 423 else: 424 debug("Popularity Cache hit on {}", path) 425 426 subcontest = popularity_cache[path] 427 for subpath, subpopularity in subcontest.items(): 428 debug("Calculating popularity for {}", subpath) 429 popularity[subpath] += subpopularity + 1 430 431 return popularity 432 433class TestGraphPopularityContest(unittest.TestCase): 434 def test_counts_popularity(self): 435 self.assertDictEqual( 436 graph_popularity_contest({ 437 "/nix/store/foo": { 438 "/nix/store/bar": { 439 "/nix/store/baz": { 440 "/nix/store/tux": {} 441 } 442 }, 443 "/nix/store/baz": { 444 "/nix/store/tux": {} 445 } 446 } 447 }), 448 { 449 "/nix/store/foo": 1, 450 "/nix/store/bar": 2, 451 "/nix/store/baz": 4, 452 "/nix/store/tux": 6, 453 } 454 ) 455 456# Emit a list of packages by popularity, most first: 457# 458# From: 459# [ 460# /nix/store/foo: 1 461# /nix/store/bar: 1 462# /nix/store/baz: 2 463# /nix/store/tux: 2 464# ] 465# 466# To: 467# [ /nix/store/baz /nix/store/tux /nix/store/bar /nix/store/foo ] 468def order_by_popularity(paths): 469 paths_by_popularity = defaultdict(list) 470 popularities = [] 471 for path, popularity in paths.items(): 472 popularities.append(popularity) 473 paths_by_popularity[popularity].append(path) 474 475 popularities = list(set(popularities)) 476 popularities.sort() 477 478 flat_ordered = [] 479 for popularity in popularities: 480 paths = paths_by_popularity[popularity] 481 paths.sort(key=package_name) 482 483 flat_ordered.extend(reversed(paths)) 484 return list(reversed(flat_ordered)) 485 486 487class TestOrderByPopularity(unittest.TestCase): 488 def test_returns_in_order(self): 489 self.assertEqual( 490 order_by_popularity({ 491 "/nix/store/foo": 1, 492 "/nix/store/bar": 1, 493 "/nix/store/baz": 2, 494 "/nix/store/tux": 2, 495 }), 496 [ 497 "/nix/store/baz", 498 "/nix/store/tux", 499 "/nix/store/bar", 500 "/nix/store/foo" 501 ] 502 ) 503 504def package_name(path): 505 parts = path.split('-') 506 start = parts.pop(0) 507 # don't throw away any data, so the order is always the same. 508 # even in cases where only the hash at the start has changed. 509 parts.append(start) 510 return '-'.join(parts) 511 512def main(): 513 filename = sys.argv[1] 514 key = sys.argv[2] 515 516 debug("Loading from {}", filename) 517 with open(filename) as f: 518 data = json.load(f) 519 520 # Data comes in as: 521 # [ 522 # { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] }, 523 # { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] }, 524 # { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] }, 525 # { path: /nix/store/tux, references: [ /nix/store/tux ] } 526 # ] 527 # 528 # and we want to get out a list of paths ordered by how universally, 529 # important they are, ie: tux is referenced by every path, transitively 530 # so it should be #1 531 # 532 # [ 533 # /nix/store/tux, 534 # /nix/store/baz, 535 # /nix/store/bar, 536 # /nix/store/foo, 537 # ] 538 graph = data[key] 539 540 debug("Finding roots from {}", key) 541 roots = find_roots(graph); 542 debug("Making lookup for {}", key) 543 lookup = make_lookup(graph) 544 545 full_graph = {} 546 for root in roots: 547 debug("Making full graph for {}", root) 548 full_graph[root] = make_graph_segment_from_root(root, lookup) 549 550 debug("Running contest") 551 contest = graph_popularity_contest(full_graph) 552 debug("Ordering by popularity") 553 ordered = order_by_popularity(contest) 554 debug("Checking for missing paths") 555 missing = [] 556 for path in all_paths(graph): 557 if path not in ordered: 558 missing.append(path) 559 560 ordered.extend(missing) 561 print("\n".join(ordered)) 562 563if "--test" in sys.argv: 564 # Don't pass --test otherwise unittest gets mad 565 unittest.main(argv = [f for f in sys.argv if f != "--test" ]) 566else: 567 main()