// ------------------------------------------------------------------------------------------------- // zmath - benchmarks // ------------------------------------------------------------------------------------------------- // 'zig build benchmark -Doptimize=ReleaseFast' will build and benchmakrs with all optimisations. // // ------------------------------------------------------------------------------------------------- // 'AMD Ryzen 9 3950X 16-Core Processor', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f, ReleaseFast // ------------------------------------------------------------------------------------------------- // matrix mul benchmark (AOS) - scalar version: 1.5880s, zmath version: 1.0642s // cross3, scale, bias benchmark (AOS) - scalar version: 0.9318s, zmath version: 0.6888s // cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.2258s, zmath version: 1.1095s // quaternion mul benchmark (AOS) - scalar version: 1.4123s, zmath version: 0.6958s // wave benchmark (SOA) - scalar version: 4.8165s, zmath version: 0.7338s // // ------------------------------------------------------------------------------------------------- // 'AMD Ryzen 7 5800X 8-Core Processer', Linux 5.17.14, Zig 0.10.0-dev.2624+d506275a0, ReleaseFast // ------------------------------------------------------------------------------------------------- // matrix mul benchmark (AOS) - scalar version: 1.3672s, zmath version: 0.8617s // cross3, scale, bias benchmark (AOS) - scalar version: 0.6586s, zmath version: 0.4803s // cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.0620s, zmath version: 0.8942s // quaternion mul benchmark (AOS) - scalar version: 1.1324s, zmath version: 0.6064s // wave benchmark (SOA) - scalar version: 3.6598s, zmath version: 0.4231s // // ------------------------------------------------------------------------------------------------- // 'Apple M1 Max', macOS Version 12.4, Zig 0.10.0-dev.2657+74442f350, ReleaseFast // ------------------------------------------------------------------------------------------------- // matrix mul benchmark (AOS) - scalar version: 1.0297s, zmath version: 1.0538s // cross3, scale, bias benchmark (AOS) - scalar version: 0.6294s, zmath version: 0.6532s // cross3, dot3, scale, bias benchmark (AOS) - scalar version: 0.9807s, zmath version: 1.0988s // quaternion mul benchmark (AOS) - scalar version: 1.5413s, zmath version: 0.7800s // wave benchmark (SOA) - scalar version: 3.4220s, zmath version: 1.0255s // // ------------------------------------------------------------------------------------------------- // '11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f, ReleaseFast // ------------------------------------------------------------------------------------------------- // matrix mul benchmark (AOS) - scalar version: 2.2308s, zmath version: 0.9376s // cross3, scale, bias benchmark (AOS) - scalar version: 1.0821s, zmath version: 0.5110s // cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.6580s, zmath version: 0.9167s // quaternion mul benchmark (AOS) - scalar version: 2.0139s, zmath version: 0.5856s // wave benchmark (SOA) - scalar version: 3.7832s, zmath version: 0.3642s // // ------------------------------------------------------------------------------------------------- pub fn main() !void { var gpa = std.heap.GeneralPurposeAllocator(.{}){}; defer _ = gpa.deinit(); const allocator = gpa.allocator(); // m = mul(ma, mb); data set fits in L1 cache; AOS data layout. try mat4MulBenchmark(allocator, 100_000); // v = 0.01 * cross3(va, vb) + vec3(1.0); data set fits in L1 cache; AOS data layout. try cross3ScaleBiasBenchmark(allocator, 10_000); // v = dot3(va, vb) * (0.1 * cross3(va, vb) + vec3(1.0)); data set fits in L1 cache; AOS data layout. try cross3Dot3ScaleBiasBenchmark(allocator, 10_000); // q = qmul(qa, qb); data set fits in L1 cache; AOS data layout. try quatBenchmark(allocator, 10_000); // d = sqrt(x * x + z * z); y = sin(d - t); SOA layout. try waveBenchmark(allocator, 1_000); } const std = @import("std"); const time = std.time; const Timer = time.Timer; const zm = @import("zmath"); var prng = std.Random.DefaultPrng.init(0); const random = prng.random(); noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { std.debug.print("\n", .{}); std.debug.print("{s:>42} - ", .{"matrix mul benchmark (AOS)"}); var data0 = std.ArrayList([16]f32).init(allocator); defer data0.deinit(); var data1 = std.ArrayList([16]f32).init(allocator); defer data1.deinit(); var i: usize = 0; while (i < 64) : (i += 1) { try data0.append([16]f32{ random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), }); try data1.append([16]f32{ random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), random.float(f32), }); } // Warmup, fills L1 cache. i = 0; while (i < 100) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { const ma = zm.loadMat(a[0..]); const mb = zm.loadMat(b[0..]); const r = zm.mul(ma, mb); std.mem.doNotOptimizeAway(&r); } } } { i = 0; var timer = try Timer.start(); const start = timer.lap(); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { const r = [16]f32{ a[0] * b[0] + a[1] * b[4] + a[2] * b[8] + a[3] * b[12], a[0] * b[1] + a[1] * b[5] + a[2] * b[9] + a[3] * b[13], a[0] * b[2] + a[1] * b[6] + a[2] * b[10] + a[3] * b[14], a[0] * b[3] + a[1] * b[7] + a[2] * b[11] + a[3] * b[15], a[4] * b[0] + a[5] * b[4] + a[6] * b[8] + a[7] * b[12], a[4] * b[1] + a[5] * b[5] + a[6] * b[9] + a[7] * b[13], a[4] * b[2] + a[5] * b[6] + a[6] * b[10] + a[7] * b[14], a[4] * b[3] + a[5] * b[7] + a[6] * b[11] + a[7] * b[15], a[8] * b[0] + a[9] * b[4] + a[10] * b[8] + a[11] * b[12], a[8] * b[1] + a[9] * b[5] + a[10] * b[9] + a[11] * b[13], a[8] * b[2] + a[9] * b[6] + a[10] * b[10] + a[11] * b[14], a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11] * b[15], a[12] * b[0] + a[13] * b[4] + a[14] * b[8] + a[15] * b[12], a[12] * b[1] + a[13] * b[5] + a[14] * b[9] + a[15] * b[13], a[12] * b[2] + a[13] * b[6] + a[14] * b[10] + a[15] * b[14], a[12] * b[3] + a[13] * b[7] + a[14] * b[11] + a[15] * b[15], }; std.mem.doNotOptimizeAway(&r); } } } const end = timer.read(); const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); } { i = 0; var timer = try Timer.start(); const start = timer.lap(); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { const ma = zm.loadMat(a[0..]); const mb = zm.loadMat(b[0..]); const r = zm.mul(ma, mb); std.mem.doNotOptimizeAway(&r); } } } const end = timer.read(); const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); } } noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { std.debug.print("{s:>42} - ", .{"cross3, scale, bias benchmark (AOS)"}); var data0 = std.ArrayList([3]f32).init(allocator); defer data0.deinit(); var data1 = std.ArrayList([3]f32).init(allocator); defer data1.deinit(); var i: usize = 0; while (i < 256) : (i += 1) { try data0.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) }); try data1.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) }); } // Warmup, fills L1 cache. i = 0; while (i < 100) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { const va = zm.loadArr3(a); const vb = zm.loadArr3(b); const cp = zm.f32x4s(0.01) * zm.cross3(va, vb) + zm.f32x4s(1.0); std.mem.doNotOptimizeAway(&cp); } } } { i = 0; var timer = try Timer.start(); const start = timer.lap(); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { const r = [3]f32{ 0.01 * (a[1] * b[2] - a[2] * b[1]) + 1.0, 0.01 * (a[2] * b[0] - a[0] * b[2]) + 1.0, 0.01 * (a[0] * b[1] - a[1] * b[0]) + 1.0, }; std.mem.doNotOptimizeAway(&r); } } } const end = timer.read(); const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); } { i = 0; var timer = try Timer.start(); const start = timer.lap(); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { const va = zm.loadArr3(a); const vb = zm.loadArr3(b); const cp = zm.f32x4s(0.01) * zm.cross3(va, vb) + zm.f32x4s(1.0); std.mem.doNotOptimizeAway(&cp); } } } const end = timer.read(); const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); } } noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { std.debug.print("{s:>42} - ", .{"cross3, dot3, scale, bias benchmark (AOS)"}); var data0 = std.ArrayList([3]f32).init(allocator); defer data0.deinit(); var data1 = std.ArrayList([3]f32).init(allocator); defer data1.deinit(); var i: usize = 0; while (i < 256) : (i += 1) { try data0.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) }); try data1.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) }); } // Warmup, fills L1 cache. i = 0; while (i < 100) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { const va = zm.loadArr3(a); const vb = zm.loadArr3(b); const r = (zm.dot3(va, vb) * (zm.f32x4s(0.1) * zm.cross3(va, vb) + zm.f32x4s(1.0)))[0]; std.mem.doNotOptimizeAway(&r); } } } { i = 0; var timer = try Timer.start(); const start = timer.lap(); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { const d = a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; const r = [3]f32{ d * (0.1 * (a[1] * b[2] - a[2] * b[1]) + 1.0), d * (0.1 * (a[2] * b[0] - a[0] * b[2]) + 1.0), d * (0.1 * (a[0] * b[1] - a[1] * b[0]) + 1.0), }; std.mem.doNotOptimizeAway(&r); } } } const end = timer.read(); const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); } { i = 0; var timer = try Timer.start(); const start = timer.lap(); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { const va = zm.loadArr3(a); const vb = zm.loadArr3(b); const r = zm.dot3(va, vb) * (zm.f32x4s(0.1) * zm.cross3(va, vb) + zm.f32x4s(1.0)); std.mem.doNotOptimizeAway(&r); } } } const end = timer.read(); const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); } } noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { std.debug.print("{s:>42} - ", .{"quaternion mul benchmark (AOS)"}); var data0 = std.ArrayList([4]f32).init(allocator); defer data0.deinit(); var data1 = std.ArrayList([4]f32).init(allocator); defer data1.deinit(); var i: usize = 0; while (i < 256) : (i += 1) { try data0.append([4]f32{ random.float(f32), random.float(f32), random.float(f32), random.float(f32) }); try data1.append([4]f32{ random.float(f32), random.float(f32), random.float(f32), random.float(f32) }); } // Warmup, fills L1 cache. i = 0; while (i < 100) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { const va = zm.loadArr4(a); const vb = zm.loadArr4(b); const r = zm.qmul(va, vb); std.mem.doNotOptimizeAway(&r); } } } { i = 0; var timer = try Timer.start(); const start = timer.lap(); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { const r = [4]f32{ (b[3] * a[0]) + (b[0] * a[3]) + (b[1] * a[2]) - (b[2] * a[1]), (b[3] * a[1]) - (b[0] * a[2]) + (b[1] * a[3]) + (b[2] * a[0]), (b[3] * a[2]) + (b[0] * a[1]) - (b[1] * a[0]) + (b[2] * a[3]), (b[3] * a[3]) - (b[0] * a[0]) - (b[1] * a[1]) - (b[2] * a[2]), }; std.mem.doNotOptimizeAway(&r); } } } const end = timer.read(); const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); } { i = 0; var timer = try Timer.start(); const start = timer.lap(); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { const va = zm.loadArr4(a); const vb = zm.loadArr4(b); const r = zm.qmul(va, vb); std.mem.doNotOptimizeAway(&r); } } } const end = timer.read(); const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); } } noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { _ = allocator; std.debug.print("{s:>42} - ", .{"wave benchmark (SOA)"}); const grid_size = 1024; { var t: f32 = 0.0; const scale: f32 = 0.05; var timer = try Timer.start(); const start = timer.lap(); var iter: usize = 0; while (iter < count) : (iter += 1) { var z_index: i32 = 0; while (z_index < grid_size) : (z_index += 1) { const z = scale * @as(f32, @floatFromInt(z_index - grid_size / 2)); var x_index: i32 = 0; while (x_index < grid_size) : (x_index += 4) { const x0 = scale * @as(f32, @floatFromInt(x_index + 0 - grid_size / 2)); const x1 = scale * @as(f32, @floatFromInt(x_index + 1 - grid_size / 2)); const x2 = scale * @as(f32, @floatFromInt(x_index + 2 - grid_size / 2)); const x3 = scale * @as(f32, @floatFromInt(x_index + 3 - grid_size / 2)); const d0 = zm.sqrt(x0 * x0 + z * z); const d1 = zm.sqrt(x1 * x1 + z * z); const d2 = zm.sqrt(x2 * x2 + z * z); const d3 = zm.sqrt(x3 * x3 + z * z); const y0 = zm.sin(d0 - t); const y1 = zm.sin(d1 - t); const y2 = zm.sin(d2 - t); const y3 = zm.sin(d3 - t); std.mem.doNotOptimizeAway(&y0); std.mem.doNotOptimizeAway(&y1); std.mem.doNotOptimizeAway(&y2); std.mem.doNotOptimizeAway(&y3); } } t += 0.001; } const end = timer.read(); const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); } { const T = zm.F32x16; const static = struct { const offsets = [16]f32{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; }; const voffset = zm.load(static.offsets[0..], T, 0); var vt = zm.splat(T, 0.0); const scale: f32 = 0.05; var timer = try Timer.start(); const start = timer.lap(); var iter: usize = 0; while (iter < count) : (iter += 1) { var z_index: i32 = 0; while (z_index < grid_size) : (z_index += 1) { const z = scale * @as(f32, @floatFromInt(z_index - grid_size / 2)); const vz = zm.splat(T, z); var x_index: i32 = 0; while (x_index < grid_size) : (x_index += zm.veclen(T)) { const x = scale * @as(f32, @floatFromInt(x_index - grid_size / 2)); const vx = zm.splat(T, x) + voffset * zm.splat(T, scale); const d = zm.sqrt(vx * vx + vz * vz); const vy = zm.sin(d - vt); std.mem.doNotOptimizeAway(&vy); } } vt += zm.splat(T, 0.001); } const end = timer.read(); const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); } }