this repo has no description
at trunk 336 lines 11 kB view raw
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
"""
Utility script that provides default arguments for executing a command
with various performance measurement tools.
"""
import logging
import os
import re
import subprocess
import tempfile
from abc import ABC, abstractmethod
from multiprocessing.pool import ThreadPool


log = logging.getLogger(__name__)
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))


def run(cmd, **kwargs):
    """Run *cmd* as a subprocess with a deterministic environment.

    PYTHONHASHSEED is pinned to 0 so hash-based ordering inside the
    child interpreter is stable across measurement runs.

    Raises:
        subprocess.CalledProcessError: on a non-zero exit (check=True).
    """
    env = dict(os.environ)
    env["PYTHONHASHSEED"] = "0"
    log.info(f">>> {' '.join(cmd)}")
    return subprocess.run(cmd, encoding="UTF-8", env=env, check=True, **kwargs)


def create_taskset_command(isolated_cpus):
    """Return a ``taskset`` command prefix for the given isolated-CPU spec.

    *isolated_cpus* is the raw text of /sys/devices/system/cpu/isolated
    (e.g. "3-5,7"); an empty value yields an empty prefix.
    """
    if not isolated_cpus:
        return []
    # If it ever matters in the future, this only pulls out the first integer
    # encountered in the list of isolated cpus
    first_cpu = re.findall(r"\d+", isolated_cpus)[0]
    return ["taskset", "--cpu-list", first_cpu]


def pin_to_cpus():
    """Return a command prefix pinning execution to an isolated CPU.

    Reads the kernel's isolated-CPU list directly (no subprocess needed);
    returns [] when the host has no isolated CPUs configured.
    """
    isolated_path = "/sys/devices/system/cpu/isolated"
    if not os.path.exists(isolated_path):
        return []
    with open(isolated_path) as fd:
        isolated_cpus = fd.read().strip()
    return create_taskset_command(isolated_cpus)


class PerformanceTool(ABC):
    """Base interface for all measurement tools."""

    # Read any optional command line arguments to set the internal defaults
    # Input: A dictionary with command line arguments
    def __init__(self, args):
        pass

    # Specify the name of the tool along with a description
    @staticmethod
    @abstractmethod
    def add_tool():
        return ""

    # Add any optional command line arguments to tune the tool
    @staticmethod
    def add_optional_arguments(parser):
        return parser


class SequentialPerformanceTool(PerformanceTool):
    """A tool that measures one (interpreter, benchmark) pair at a time."""

    # The main function to execute the specified performance tool.
    # Input: run.Interpreter, run.Benchmark
    # Output: A dictionary with the values to be reported
    @abstractmethod
    def execute(self, interpreter, benchmark):
        pass


class ParallelPerformanceTool(PerformanceTool):
    """A tool that measures many (interpreter, benchmark) pairs concurrently."""

    # The main function to execute the specified performance tool in parallel.
    # Input: list<run.Interpreter>, list<run.Benchmark>
    # Output: A list of dictionaries with the values to be reported. Each
    # dictionary must have both 'benchmark' and 'interpreter' reported
    @abstractmethod
    def execute_parallel(self, interpreters, benchmarks):
        pass


class TimeTool(SequentialPerformanceTool):
    """Measure wall-clock execution time via the _time_tool.py helper."""

    NAME = "time"

    def execute(self, interpreter, benchmark):
        """Run the benchmark under _time_tool.py and parse its report.

        The helper prints "key , value" lines; time_sec / time_sec_mean /
        time_sec_stdev are converted to floats when present.
        """
        command = pin_to_cpus()
        command.extend(
            [
                *interpreter.interpreter_cmd,
                f"{SCRIPT_DIR}/_time_tool.py",
                # The time tool imports the module, which will use the bytecode
                # cache. Pass the source file instead of the bytecode file.
                benchmark.filepath(),
                *interpreter.benchmark_args,
            ]
        )
        completed_process = run(command, stdout=subprocess.PIPE)
        time_output = completed_process.stdout.strip()
        events = [event.split(" , ") for event in time_output.split("\n")]
        result = {event[0]: event[1] for event in events}
        if "time_sec" in result:
            result["time_sec"] = float(result["time_sec"])
        if "time_sec_mean" in result:
            result["time_sec_mean"] = float(result["time_sec_mean"])
            result["time_sec_stdev"] = float(result["time_sec_stdev"])
        return result

    @staticmethod
    def add_tool():
        return f"""
'{TimeTool.NAME}': Use the 'time' command to measure execution time
"""


class PerfStat(SequentialPerformanceTool):
    """Measure hardware/software counters with `perf stat`."""

    NAME = "perfstat"
    DEFAULT_EVENTS = ["task-clock", "instructions"]

    def __init__(self, args):
        # Copy so later mutation can never alter the class-level default.
        self.events = list(args["events"] or PerfStat.DEFAULT_EVENTS)

    def parse_perfstat(self, output):
        """Parse `perf stat --field-separator ;` output into {name: value}.

        Lines without a ';' are skipped; uncountable events
        ('<not counted>', '<not supported>', empty) are dropped. Values
        containing '.' become floats, everything else ints.
        """
        if ";" not in output:
            log.error(f"perf stat returned an error: {output}")
            return {}
        results = {}
        for line in output.split("\n"):
            if ";" not in line:
                continue
            fields = line.split(";")
            value = fields[0]
            name = fields[2]
            if value in ("<not counted>", "<not supported>", ""):
                continue
            results[name] = float(value) if "." in value else int(value)
        return results

    def execute(self, interpreter, benchmark):
        """Run the benchmark under `perf stat` and collect event counts."""
        command = pin_to_cpus()
        command += ["perf", "stat"]
        command += ["--field-separator", ";"]
        command += ["--repeat", "5"]

        # To avoid event multiplexing, we only run two events at a time
        results = {}
        events = list(self.events)
        bytecode_path = compile_bytecode(interpreter, benchmark)
        while events:
            full_command = command + ["--event", events.pop(0)]
            if events:
                full_command += ["--event", events.pop(0)]
            full_command += [
                *interpreter.interpreter_cmd,
                bytecode_path,
                *interpreter.benchmark_args,
            ]
            # perf stat reports on stderr.
            completed_process = run(full_command, stderr=subprocess.PIPE)
            perfstat_output = completed_process.stderr.strip()
            results.update(self.parse_perfstat(perfstat_output))
        return results

    @staticmethod
    def add_tool():
        # NOTE: keep this in sync with the `--repeat` argument in execute().
        return f"""
'{PerfStat.NAME}': Use `perf stat` to measure the execution time of
a benchmark. This repeats the run 5 times to find a significant result
"""

    # Add any optional command line arguments to tune the tool
    @staticmethod
    def add_optional_arguments(parser):
        perfstat_event_help = f"""
Specify the perf stat event to run. Please note, only two are run at the
same time to avoid event multiplexing. For a full list of perf stat events,
run: `perf list`.

Examples: 'instructions', 'branch-misses', 'L1-icache-load-misses'

Default: {PerfStat.DEFAULT_EVENTS}
"""
        parser.add_argument(
            "--event",
            metavar="EVENT",
            dest="events",
            type=str,
            action="append",
            default=[],
            help=perfstat_event_help,
        )
        return parser


class Callgrind(ParallelPerformanceTool):
    """Count executed instructions with valgrind/callgrind."""

    NAME = "callgrind"

    def __init__(self, args):
        self.callgrind_out_dir = args.get("callgrind_out_dir")

    def _worker(self, interpreter, benchmark):
        """Run one (interpreter, benchmark) pair under callgrind.

        The callgrind output file is kept only when --callgrind-out-dir
        was given; otherwise it is a throwaway temp file.
        """
        delete = True
        callgrind_out_dir = self.callgrind_out_dir
        if callgrind_out_dir is not None:
            callgrind_out_dir = os.path.abspath(callgrind_out_dir)
            os.makedirs(callgrind_out_dir, exist_ok=True)
            delete = False
        with tempfile.NamedTemporaryFile(
            dir=callgrind_out_dir,
            prefix=f"{benchmark.name}_",
            suffix=".cg",
            delete=delete,
        ) as temp_file:
            bytecode_path = compile_bytecode(interpreter, benchmark)
            run(
                [
                    "valgrind",
                    "--quiet",
                    "--tool=callgrind",
                    "--trace-children=yes",
                    f"--callgrind-out-file={temp_file.name}",
                    *interpreter.interpreter_cmd,
                    bytecode_path,
                    *interpreter.benchmark_args,
                ]
            )

            # Pull the total from the "summary:" line; the last match wins.
            instructions = 1
            with open(temp_file.name) as fd:
                summary_re = re.compile(r"summary:\s*(.*)")
                for line in fd:
                    m = summary_re.match(line)
                    if m:
                        instructions = int(m.group(1))
            return {
                "benchmark": benchmark.name,
                "interpreter": interpreter.name,
                "cg_instructions": instructions,
            }

    def execute_parallel(self, interpreters, benchmarks):
        """Fan every (interpreter, benchmark) pair out to a thread pool."""
        # `with` guarantees the pool is torn down (the original leaked it).
        with ThreadPool() as pool:
            async_results = [
                pool.apply_async(self._worker, (interpreter, benchmark))
                for interpreter in interpreters
                for benchmark in benchmarks
            ]
            return [ar.get() for ar in async_results]

    @classmethod
    def add_tool(cls):
        return f"""
'{cls.NAME}': Measure executed instructions with `valgrind`/`callgrind`.
"""

    @staticmethod
    def add_optional_arguments(parser):
        parser.add_argument("--callgrind-out-dir", metavar="DIRECTORY")
        return parser


class Size(SequentialPerformanceTool):
    """Report the size of the interpreter binary's text segment."""

    NAME = "size"

    def __init__(self, args):
        pass

    def execute(self, interpreter, benchmark):
        """Sum the .text (ELF) / __text (Mach-O) sections via `size -A`."""
        command = ["size", "--format=sysv", interpreter.binary]
        completed_process = run(command, stdout=subprocess.PIPE)
        size_output = completed_process.stdout.strip()
        size = 0
        # sysv format: "<section>  <size>  <addr>"
        section_re = re.compile(r"([a-zA-Z0-9_.]+)\s+([0-9]+)\s+[0-9a-fA-F]+$")
        for line in size_output.splitlines():
            m = section_re.match(line)
            if not m:
                continue
            section_name = m.group(1)
            section_size = m.group(2)
            if section_name == ".text" or section_name == "__text":
                size += int(section_size)
        if size == 0:
            log.error(f"Could not determine text segment size of {interpreter.binary}")
            return {}
        return {"size_text": size}

    @classmethod
    def add_tool(cls):
        return f"""
'{cls.NAME}': Use `size` to measure the size of the interpreters text segment.
"""


def add_tools_arguments(parser):
    """Register the --tool option plus each tool's optional arguments."""
    measure_tools_help = "The measurement tool to use. Available Tools: \n"
    for tool in TOOLS:
        measure_tools_help += tool.add_tool()

    available_tools = [tool.NAME for tool in TOOLS]
    parser.add_argument(
        "--tool",
        "-t",
        metavar="TOOL",
        dest="tools",
        type=str,
        action="append",
        default=[],
        choices=available_tools,
        help=measure_tools_help,
    )

    for tool in TOOLS:
        parser = tool.add_optional_arguments(parser)

    return parser


def compile_bytecode(interpreter, benchmark):
    """Pre-compile the benchmark and return the path to its bytecode file."""
    log.info(f"Compiling benchmark for {interpreter.name}: {benchmark.name}")
    command = [
        *interpreter.interpreter_cmd,
        f"{SCRIPT_DIR}/_compile_tool.py",
        benchmark.filepath(),
        *interpreter.benchmark_args,
    ]
    result = run(command, stdout=subprocess.PIPE)
    return result.stdout.strip()  # remove '\n'


# Use this to register any new tools
SEQUENTIAL_TOOLS = [TimeTool, PerfStat, Size]
PARALLEL_TOOLS = [Callgrind]
TOOLS = SEQUENTIAL_TOOLS + PARALLEL_TOOLS