lol

nixos/test-driver: provide a global timeout

Since its debut, the test-driver has never had a race timer running
alongside the test execution to ensure that tests don't run beyond
a certain amount of time.

This is particularly important when you run into hanging tests
which cannot be detected by the current facilities (detecting them would require
more pvpanic wiring, QMP API work, etc.).

Two easy examples:

- Some QEMU tests may get stuck in some situation and run for more than 24 hours → we default to 1 hour max.
- Some QEMU tests may panic in the wrong place, e.g. UEFI firmware or worse → end users can set a "reasonable" amount of time

After that, we should let the retry logic rerun such tests until they succeed,
and adjust their global timeouts accordingly.

Of course, this does not help with the fact that the timeout may need to be
a function of the actual busyness of the machine running the tests.
This is only one step towards increased reliability.

+48
+9
nixos/lib/test-driver/test_driver/__init__.py
··· 77 77 help="vlans to span by the driver", 78 78 ) 79 79 arg_parser.add_argument( 80 + "--global-timeout", 81 + type=int, 82 + metavar="GLOBAL_TIMEOUT", 83 + action=EnvDefault, 84 + envvar="globalTimeout", 85 + help="Timeout in seconds for the whole test", 86 + ) 87 + arg_parser.add_argument( 80 88 "-o", 81 89 "--output_directory", 82 90 help="""The path to the directory where outputs copied from the VM will be placed. ··· 103 111 args.testscript.read_text(), 104 112 args.output_directory.resolve(), 105 113 args.keep_vm_state, 114 + args.global_timeout, 106 115 ) as driver: 107 116 if args.interactive: 108 117 history_dir = os.getcwd()
+25
nixos/lib/test-driver/test_driver/driver.py
··· 1 1 import os 2 2 import re 3 + import signal 3 4 import tempfile 5 + import threading 4 6 from contextlib import contextmanager 5 7 from pathlib import Path 6 8 from typing import Any, Callable, ContextManager, Dict, Iterator, List, Optional, Union ··· 41 43 vlans: List[VLan] 42 44 machines: List[Machine] 43 45 polling_conditions: List[PollingCondition] 46 + global_timeout: int 47 + race_timer: threading.Timer 44 48 45 49 def __init__( 46 50 self, ··· 49 53 tests: str, 50 54 out_dir: Path, 51 55 keep_vm_state: bool = False, 56 + global_timeout: int = 24 * 60 * 60 * 7, 52 57 ): 53 58 self.tests = tests 54 59 self.out_dir = out_dir 60 + self.global_timeout = global_timeout 61 + self.race_timer = threading.Timer(global_timeout, self.terminate_test) 55 62 56 63 tmp_dir = get_tmp_dir() 57 64 ··· 82 89 83 90 def __exit__(self, *_: Any) -> None: 84 91 with rootlog.nested("cleanup"): 92 + self.race_timer.cancel() 85 93 for machine in self.machines: 86 94 machine.release() 87 95 ··· 144 152 145 153 def run_tests(self) -> None: 146 154 """Run the test script (for non-interactive test runs)""" 155 + rootlog.info( 156 + f"Test will time out and terminate in {self.global_timeout} seconds" 157 + ) 158 + self.race_timer.start() 147 159 self.test_script() 148 160 # TODO: Collect coverage data 149 161 for machine in self.machines: ··· 161 173 with rootlog.nested("wait for all VMs to finish"): 162 174 for machine in self.machines: 163 175 machine.wait_for_shutdown() 176 + self.race_timer.cancel() 177 + 178 + def terminate_test(self) -> None: 179 + # This will be usually running in another thread than 180 + # the thread actually executing the test script. 181 + with rootlog.nested("timeout reached; test terminating..."): 182 + for machine in self.machines: 183 + machine.release() 184 + # As we cannot `sys.exit` from another thread 185 + # We can at least force the main thread to get SIGTERM'ed. 
186 + # This will prevent any user who caught all the exceptions 187 + # to swallow them and prevent itself from terminating. 188 + os.kill(os.getpid(), signal.SIGTERM) 164 189 165 190 def create_machine(self, args: Dict[str, Any]) -> Machine: 166 191 tmp_dir = get_tmp_dir()
+1
nixos/lib/testing-python.nix
··· 42 42 , nodes ? {} 43 43 , testScript 44 44 , enableOCR ? false 45 + , globalTimeout ? (60 * 60) 45 46 , name ? "unnamed" 46 47 , skipTypeCheck ? false 47 48 # Skip linting (mainly intended for faster dev cycles)
+13
nixos/lib/testing/driver.nix
··· 94 94 wrapProgram $out/bin/nixos-test-driver \ 95 95 --set startScripts "''${vmStartScripts[*]}" \ 96 96 --set testScript "$out/test-script" \ 97 + --set globalTimeout "${toString config.globalTimeout}" \ 97 98 --set vlans '${toString vlans}' \ 98 99 ${lib.escapeShellArgs (lib.concatMap (arg: ["--add-flags" arg]) config.extraDriverArgs)} 99 100 ''; ··· 121 122 type = types.package; 122 123 default = hostPkgs.qemu_test; 123 124 defaultText = "hostPkgs.qemu_test"; 125 + }; 126 + 127 + globalTimeout = mkOption { 128 + description = mdDoc '' 129 + A global timeout for the complete test, expressed in seconds. 130 + Beyond that timeout, every resource will be killed and released and the test will fail. 131 + 132 + By default, we use a 1 hour timeout. 133 + ''; 134 + type = types.int; 135 + default = 60 * 60; 136 + example = 10 * 60; 124 137 }; 125 138 126 139 enableOCR = mkOption {