this repo has no description
at trunk 336 lines 11 kB view raw
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
"""
Utility script that provides default arguments for executing a command
with various performance measurement tools.
"""
import logging
import os
import re
import subprocess
import tempfile
from abc import ABC, abstractmethod
from multiprocessing.pool import ThreadPool


log = logging.getLogger(__name__)
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))


def run(cmd, **kwargs):
    """Run *cmd* as a subprocess with a deterministic environment.

    PYTHONHASHSEED is pinned to 0 so hash-based ordering inside the
    child interpreter is stable across measurement runs.

    Raises:
        subprocess.CalledProcessError: on a non-zero exit (check=True).
    """
    env = dict(os.environ)
    env["PYTHONHASHSEED"] = "0"
    log.info(f">>> {' '.join(cmd)}")
    return subprocess.run(cmd, encoding="UTF-8", env=env, check=True, **kwargs)


def create_taskset_command(isolated_cpus):
    """Return a ``taskset`` command prefix for the given isolated-CPU spec.

    *isolated_cpus* is the raw text of /sys/devices/system/cpu/isolated
    (e.g. "3-5,7"); an empty value yields an empty prefix.
    """
    if not isolated_cpus:
        return []
    # If it ever matters in the future, this only pulls out the first integer
    # encountered in the list of isolated cpus
    first_cpu = re.findall(r"\d+", isolated_cpus)[0]
    return ["taskset", "--cpu-list", first_cpu]


def pin_to_cpus():
    """Return a command prefix pinning execution to an isolated CPU.

    Reads the kernel's isolated-CPU list directly (no subprocess needed);
    returns [] when the host has no isolated CPUs configured.
    """
    isolated_path = "/sys/devices/system/cpu/isolated"
    if not os.path.exists(isolated_path):
        return []
    with open(isolated_path) as fd:
        isolated_cpus = fd.read().strip()
    return create_taskset_command(isolated_cpus)


class PerformanceTool(ABC):
    """Base interface for all measurement tools."""

    # Read any optional command line arguments to set the internal defaults
    # Input: A dictionary with command line arguments
    def __init__(self, args):
        pass

    # Specify the name of the tool along with a description
    @staticmethod
    @abstractmethod
    def add_tool():
        return ""

    # Add any optional command line arguments to tune the tool
    @staticmethod
    def add_optional_arguments(parser):
        return parser


class SequentialPerformanceTool(PerformanceTool):
    """A tool that measures one (interpreter, benchmark) pair at a time."""

    # The main function to execute the specified performance tool.
    # Input: run.Interpreter, run.Benchmark
    # Output: A dictionary with the values to be reported
    @abstractmethod
    def execute(self, interpreter, benchmark):
        pass


class ParallelPerformanceTool(PerformanceTool):
    """A tool that measures many (interpreter, benchmark) pairs concurrently."""

    # The main function to execute the specified performance tool in parallel.
    # Input: list<run.Interpreter>, list<run.Benchmark>
    # Output: A list of dictionaries with the values to be reported. Each
    # dictionary must have both 'benchmark' and 'interpreter' reported
    @abstractmethod
    def execute_parallel(self, interpreters, benchmarks):
        pass


class TimeTool(SequentialPerformanceTool):
    """Measure wall-clock execution time via the _time_tool.py helper."""

    NAME = "time"

    def execute(self, interpreter, benchmark):
        """Run the benchmark under _time_tool.py and parse its report.

        The helper prints "key , value" lines; time_sec / time_sec_mean /
        time_sec_stdev are converted to floats when present.
        """
        command = pin_to_cpus()
        command.extend(
            [
                *interpreter.interpreter_cmd,
                f"{SCRIPT_DIR}/_time_tool.py",
                # The time tool imports the module, which will use the bytecode
                # cache. Pass the source file instead of the bytecode file.
                benchmark.filepath(),
                *interpreter.benchmark_args,
            ]
        )
        completed_process = run(command, stdout=subprocess.PIPE)
        time_output = completed_process.stdout.strip()
        events = [event.split(" , ") for event in time_output.split("\n")]
        result = {event[0]: event[1] for event in events}
        if "time_sec" in result:
            result["time_sec"] = float(result["time_sec"])
        if "time_sec_mean" in result:
            result["time_sec_mean"] = float(result["time_sec_mean"])
            result["time_sec_stdev"] = float(result["time_sec_stdev"])
        return result

    @staticmethod
    def add_tool():
        return f"""
'{TimeTool.NAME}': Use the 'time' command to measure execution time
"""


class PerfStat(SequentialPerformanceTool):
    """Measure hardware/software counters with `perf stat`."""

    NAME = "perfstat"
    DEFAULT_EVENTS = ["task-clock", "instructions"]

    def __init__(self, args):
        # Copy so later mutation can never alter the class-level default.
        self.events = list(args["events"] or PerfStat.DEFAULT_EVENTS)

    def parse_perfstat(self, output):
        """Parse `perf stat --field-separator ;` output into {name: value}.

        Lines without a ';' are skipped; uncountable events
        ('<not counted>', '<not supported>', empty) are dropped. Values
        containing '.' become floats, everything else ints.
        """
        if ";" not in output:
            log.error(f"perf stat returned an error: {output}")
            return {}
        results = {}
        for line in output.split("\n"):
            if ";" not in line:
                continue
            fields = line.split(";")
            value = fields[0]
            name = fields[2]
            if value in ("<not counted>", "<not supported>", ""):
                continue
            results[name] = float(value) if "." in value else int(value)
        return results

    def execute(self, interpreter, benchmark):
        """Run the benchmark under `perf stat` and collect event counts."""
        command = pin_to_cpus()
        command += ["perf", "stat"]
        command += ["--field-separator", ";"]
        command += ["--repeat", "5"]

        # To avoid event multiplexing, we only run two events at a time
        results = {}
        events = list(self.events)
        bytecode_path = compile_bytecode(interpreter, benchmark)
        while events:
            full_command = command + ["--event", events.pop(0)]
            if events:
                full_command += ["--event", events.pop(0)]
            full_command += [
                *interpreter.interpreter_cmd,
                bytecode_path,
                *interpreter.benchmark_args,
            ]
            # perf stat reports on stderr.
            completed_process = run(full_command, stderr=subprocess.PIPE)
            perfstat_output = completed_process.stderr.strip()
            results.update(self.parse_perfstat(perfstat_output))
        return results

    @staticmethod
    def add_tool():
        # NOTE: keep this in sync with the `--repeat` argument in execute().
        return f"""
'{PerfStat.NAME}': Use `perf stat` to measure the execution time of
a benchmark. This repeats the run 5 times to find a significant result
"""

    # Add any optional command line arguments to tune the tool
    @staticmethod
    def add_optional_arguments(parser):
        perfstat_event_help = f"""
Specify the perf stat event to run. Please note, only two are run at the
same time to avoid event multiplexing. For a full list of perf stat events,
run: `perf list`.

Examples: 'instructions', 'branch-misses', 'L1-icache-load-misses'

Default: {PerfStat.DEFAULT_EVENTS}
"""
        parser.add_argument(
            "--event",
            metavar="EVENT",
            dest="events",
            type=str,
            action="append",
            default=[],
            help=perfstat_event_help,
        )
        return parser


class Callgrind(ParallelPerformanceTool):
    """Count executed instructions with valgrind/callgrind."""

    NAME = "callgrind"

    def __init__(self, args):
        self.callgrind_out_dir = args.get("callgrind_out_dir")

    def _worker(self, interpreter, benchmark):
        """Run one (interpreter, benchmark) pair under callgrind.

        The callgrind output file is kept only when --callgrind-out-dir
        was given; otherwise it is a throwaway temp file.
        """
        delete = True
        callgrind_out_dir = self.callgrind_out_dir
        if callgrind_out_dir is not None:
            callgrind_out_dir = os.path.abspath(callgrind_out_dir)
            os.makedirs(callgrind_out_dir, exist_ok=True)
            delete = False
        with tempfile.NamedTemporaryFile(
            dir=callgrind_out_dir,
            prefix=f"{benchmark.name}_",
            suffix=".cg",
            delete=delete,
        ) as temp_file:
            bytecode_path = compile_bytecode(interpreter, benchmark)
            run(
                [
                    "valgrind",
                    "--quiet",
                    "--tool=callgrind",
                    "--trace-children=yes",
                    f"--callgrind-out-file={temp_file.name}",
                    *interpreter.interpreter_cmd,
                    bytecode_path,
                    *interpreter.benchmark_args,
                ]
            )

            # Pull the total from the "summary:" line; the last match wins.
            instructions = 1
            with open(temp_file.name) as fd:
                summary_re = re.compile(r"summary:\s*(.*)")
                for line in fd:
                    m = summary_re.match(line)
                    if m:
                        instructions = int(m.group(1))
            return {
                "benchmark": benchmark.name,
                "interpreter": interpreter.name,
                "cg_instructions": instructions,
            }

    def execute_parallel(self, interpreters, benchmarks):
        """Fan every (interpreter, benchmark) pair out to a thread pool."""
        # `with` guarantees the pool is torn down (the original leaked it).
        with ThreadPool() as pool:
            async_results = [
                pool.apply_async(self._worker, (interpreter, benchmark))
                for interpreter in interpreters
                for benchmark in benchmarks
            ]
            return [ar.get() for ar in async_results]

    @classmethod
    def add_tool(cls):
        return f"""
'{cls.NAME}': Measure executed instructions with `valgrind`/`callgrind`.
"""

    @staticmethod
    def add_optional_arguments(parser):
        parser.add_argument("--callgrind-out-dir", metavar="DIRECTORY")
        return parser


class Size(SequentialPerformanceTool):
    """Report the size of the interpreter binary's text segment."""

    NAME = "size"

    def __init__(self, args):
        pass

    def execute(self, interpreter, benchmark):
        """Sum the .text (ELF) / __text (Mach-O) sections via `size -A`."""
        command = ["size", "--format=sysv", interpreter.binary]
        completed_process = run(command, stdout=subprocess.PIPE)
        size_output = completed_process.stdout.strip()
        size = 0
        # sysv format: "<section>  <size>  <addr>"
        section_re = re.compile(r"([a-zA-Z0-9_.]+)\s+([0-9]+)\s+[0-9a-fA-F]+$")
        for line in size_output.splitlines():
            m = section_re.match(line)
            if not m:
                continue
            section_name = m.group(1)
            section_size = m.group(2)
            if section_name == ".text" or section_name == "__text":
                size += int(section_size)
        if size == 0:
            log.error(f"Could not determine text segment size of {interpreter.binary}")
            return {}
        return {"size_text": size}

    @classmethod
    def add_tool(cls):
        return f"""
'{cls.NAME}': Use `size` to measure the size of the interpreters text segment.
"""


def add_tools_arguments(parser):
    """Register the --tool option plus each tool's optional arguments."""
    measure_tools_help = "The measurement tool to use. Available Tools: \n"
    for tool in TOOLS:
        measure_tools_help += tool.add_tool()

    available_tools = [tool.NAME for tool in TOOLS]
    parser.add_argument(
        "--tool",
        "-t",
        metavar="TOOL",
        dest="tools",
        type=str,
        action="append",
        default=[],
        choices=available_tools,
        help=measure_tools_help,
    )

    for tool in TOOLS:
        parser = tool.add_optional_arguments(parser)

    return parser


def compile_bytecode(interpreter, benchmark):
    """Pre-compile the benchmark and return the path to its bytecode file."""
    log.info(f"Compiling benchmark for {interpreter.name}: {benchmark.name}")
    command = [
        *interpreter.interpreter_cmd,
        f"{SCRIPT_DIR}/_compile_tool.py",
        benchmark.filepath(),
        *interpreter.benchmark_args,
    ]
    result = run(command, stdout=subprocess.PIPE)
    return result.stdout.strip()  # remove '\n'


# Use this to register any new tools
SEQUENTIAL_TOOLS = [TimeTool, PerfStat, Size]
PARALLEL_TOOLS = [Callgrind]
TOOLS = SEQUENTIAL_TOOLS + PARALLEL_TOOLS