this repo has no description
at main 19 kB view raw
// -------------------------------------------------------------------------------------------------
// zmath - benchmarks
// -------------------------------------------------------------------------------------------------
// 'zig build benchmark -Doptimize=ReleaseFast' will build and run the benchmarks with all optimisations.
//
// -------------------------------------------------------------------------------------------------
// 'AMD Ryzen 9 3950X 16-Core Processor', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f, ReleaseFast
// -------------------------------------------------------------------------------------------------
// matrix mul benchmark (AOS) - scalar version: 1.5880s, zmath version: 1.0642s
// cross3, scale, bias benchmark (AOS) - scalar version: 0.9318s, zmath version: 0.6888s
// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.2258s, zmath version: 1.1095s
// quaternion mul benchmark (AOS) - scalar version: 1.4123s, zmath version: 0.6958s
// wave benchmark (SOA) - scalar version: 4.8165s, zmath version: 0.7338s
//
// -------------------------------------------------------------------------------------------------
// 'AMD Ryzen 7 5800X 8-Core Processor', Linux 5.17.14, Zig 0.10.0-dev.2624+d506275a0, ReleaseFast
// -------------------------------------------------------------------------------------------------
// matrix mul benchmark (AOS) - scalar version: 1.3672s, zmath version: 0.8617s
// cross3, scale, bias benchmark (AOS) - scalar version: 0.6586s, zmath version: 0.4803s
// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.0620s, zmath version: 0.8942s
// quaternion mul benchmark (AOS) - scalar version: 1.1324s, zmath version: 0.6064s
// wave benchmark (SOA) - scalar version: 3.6598s, zmath version: 0.4231s
//
// -------------------------------------------------------------------------------------------------
// 'Apple M1 Max', macOS Version 12.4, Zig 0.10.0-dev.2657+74442f350, ReleaseFast
// -------------------------------------------------------------------------------------------------
// matrix mul benchmark (AOS) - scalar version: 1.0297s, zmath version: 1.0538s
// cross3, scale, bias benchmark (AOS) - scalar version: 0.6294s, zmath version: 0.6532s
// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 0.9807s, zmath version: 1.0988s
// quaternion mul benchmark (AOS) - scalar version: 1.5413s, zmath version: 0.7800s
// wave benchmark (SOA) - scalar version: 3.4220s, zmath version: 1.0255s
//
// -------------------------------------------------------------------------------------------------
// '11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f, ReleaseFast
// -------------------------------------------------------------------------------------------------
// matrix mul benchmark (AOS) - scalar version: 2.2308s, zmath version: 0.9376s
// cross3, scale, bias benchmark (AOS) - scalar version: 1.0821s, zmath version: 0.5110s
// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.6580s, zmath version: 0.9167s
// quaternion mul benchmark (AOS) - scalar version: 2.0139s, zmath version: 0.5856s
// wave benchmark (SOA) - scalar version: 3.7832s, zmath version: 0.3642s
//
// -------------------------------------------------------------------------------------------------

/// Runs every benchmark in sequence. Each benchmark prints one line with the elapsed time of
/// its hand-written scalar version followed by the elapsed time of its zmath version.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // m = mul(ma, mb); data set fits in L1 cache; AOS data layout.
    try mat4MulBenchmark(allocator, 100_000);

    // v = 0.01 * cross3(va, vb) + vec3(1.0); data set fits in L1 cache; AOS data layout.
    try cross3ScaleBiasBenchmark(allocator, 10_000);

    // v = dot3(va, vb) * (0.1 * cross3(va, vb) + vec3(1.0)); data set fits in L1 cache; AOS data layout.
    try cross3Dot3ScaleBiasBenchmark(allocator, 10_000);

    // q = qmul(qa, qb); data set fits in L1 cache; AOS data layout.
    try quatBenchmark(allocator, 10_000);

    // d = sqrt(x * x + z * z); y = sin(d - t); SOA layout.
    try waveBenchmark(allocator, 1_000);
}

const std = @import("std");
const time = std.time;
const Timer = time.Timer;
const zm = @import("zmath");

// Fixed seed, so every run generates the same input data.
var prng = std.Random.DefaultPrng.init(0);
const random = prng.random();

/// Returns an array of `len` values drawn in index order from the file-level `random` generator.
fn randomArray(comptime len: usize) [len]f32 {
    var values: [len]f32 = undefined;
    for (&values) |*value| value.* = random.float(f32);
    return values;
}

/// Returns the time elapsed since `timer` was started (or last reset), in seconds.
///
/// `Timer.read()` already yields the elapsed duration, so no start value has to be subtracted.
fn elapsedSeconds(timer: *Timer) f64 {
    const elapsed_ns: f64 = @floatFromInt(timer.read());
    return elapsed_ns / time.ns_per_s;
}

/// Benchmarks 4x4 matrix multiplication over every pair drawn from two lists of 64 random
/// matrices stored as `[16]f32`.
///
/// After a warm-up, `count` passes are timed with explicit scalar arithmetic and then `count`
/// passes with `zm.mul`; both timings are printed on one line.
/// Returns an error if a list append fails or the timer cannot be started.
noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    std.debug.print("\n", .{});
    std.debug.print("{s:>42} - ", .{"matrix mul benchmark (AOS)"});

    var data0 = std.ArrayList([16]f32).init(allocator);
    defer data0.deinit();
    var data1 = std.ArrayList([16]f32).init(allocator);
    defer data1.deinit();

    for (0..64) |_| {
        try data0.append(randomArray(16));
        try data1.append(randomArray(16));
    }

    // Warmup, fills L1 cache.
    for (0..100) |_| {
        for (data1.items) |b| {
            for (data0.items) |a| {
                const ma = zm.loadMat(a[0..]);
                const mb = zm.loadMat(b[0..]);
                const r = zm.mul(ma, mb);
                std.mem.doNotOptimizeAway(&r);
            }
        }
    }

    // Scalar version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const r = [16]f32{
                        a[0] * b[0] + a[1] * b[4] + a[2] * b[8] + a[3] * b[12],
                        a[0] * b[1] + a[1] * b[5] + a[2] * b[9] + a[3] * b[13],
                        a[0] * b[2] + a[1] * b[6] + a[2] * b[10] + a[3] * b[14],
                        a[0] * b[3] + a[1] * b[7] + a[2] * b[11] + a[3] * b[15],
                        a[4] * b[0] + a[5] * b[4] + a[6] * b[8] + a[7] * b[12],
                        a[4] * b[1] + a[5] * b[5] + a[6] * b[9] + a[7] * b[13],
                        a[4] * b[2] + a[5] * b[6] + a[6] * b[10] + a[7] * b[14],
                        a[4] * b[3] + a[5] * b[7] + a[6] * b[11] + a[7] * b[15],
                        a[8] * b[0] + a[9] * b[4] + a[10] * b[8] + a[11] * b[12],
                        a[8] * b[1] + a[9] * b[5] + a[10] * b[9] + a[11] * b[13],
                        a[8] * b[2] + a[9] * b[6] + a[10] * b[10] + a[11] * b[14],
                        a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11] * b[15],
                        a[12] * b[0] + a[13] * b[4] + a[14] * b[8] + a[15] * b[12],
                        a[12] * b[1] + a[13] * b[5] + a[14] * b[9] + a[15] * b[13],
                        a[12] * b[2] + a[13] * b[6] + a[14] * b[10] + a[15] * b[14],
                        a[12] * b[3] + a[13] * b[7] + a[14] * b[11] + a[15] * b[15],
                    };
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_s = elapsedSeconds(&timer);

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const ma = zm.loadMat(a[0..]);
                    const mb = zm.loadMat(b[0..]);
                    const r = zm.mul(ma, mb);
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_s = elapsedSeconds(&timer);

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}

/// Benchmarks `0.01 * cross3(a, b) + 1.0` over every pair drawn from two lists of 256 random
/// `[3]f32` vectors.
///
/// After a warm-up, `count` passes are timed with explicit scalar arithmetic and then `count`
/// passes with zmath vector operations; both timings are printed on one line.
/// Returns an error if a list append fails or the timer cannot be started.
noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    std.debug.print("{s:>42} - ", .{"cross3, scale, bias benchmark (AOS)"});

    var data0 = std.ArrayList([3]f32).init(allocator);
    defer data0.deinit();
    var data1 = std.ArrayList([3]f32).init(allocator);
    defer data1.deinit();

    for (0..256) |_| {
        try data0.append(randomArray(3));
        try data1.append(randomArray(3));
    }

    // Warmup, fills L1 cache.
    for (0..100) |_| {
        for (data1.items) |b| {
            for (data0.items) |a| {
                const va = zm.loadArr3(a);
                const vb = zm.loadArr3(b);
                const cp = zm.f32x4s(0.01) * zm.cross3(va, vb) + zm.f32x4s(1.0);
                std.mem.doNotOptimizeAway(&cp);
            }
        }
    }

    // Scalar version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const r = [3]f32{
                        0.01 * (a[1] * b[2] - a[2] * b[1]) + 1.0,
                        0.01 * (a[2] * b[0] - a[0] * b[2]) + 1.0,
                        0.01 * (a[0] * b[1] - a[1] * b[0]) + 1.0,
                    };
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_s = elapsedSeconds(&timer);

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const va = zm.loadArr3(a);
                    const vb = zm.loadArr3(b);
                    const cp = zm.f32x4s(0.01) * zm.cross3(va, vb) + zm.f32x4s(1.0);
                    std.mem.doNotOptimizeAway(&cp);
                }
            }
        }
        const elapsed_s = elapsedSeconds(&timer);

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}

/// Benchmarks `dot3(a, b) * (0.1 * cross3(a, b) + 1.0)` over every pair drawn from two lists of
/// 256 random `[3]f32` vectors.
///
/// After a warm-up, `count` passes are timed with explicit scalar arithmetic and then `count`
/// passes with zmath vector operations; both timings are printed on one line.
/// Returns an error if a list append fails or the timer cannot be started.
noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    std.debug.print("{s:>42} - ", .{"cross3, dot3, scale, bias benchmark (AOS)"});

    var data0 = std.ArrayList([3]f32).init(allocator);
    defer data0.deinit();
    var data1 = std.ArrayList([3]f32).init(allocator);
    defer data1.deinit();

    for (0..256) |_| {
        try data0.append(randomArray(3));
        try data1.append(randomArray(3));
    }

    // Warmup, fills L1 cache. Uses the same expression as the timed zmath loop below.
    for (0..100) |_| {
        for (data1.items) |b| {
            for (data0.items) |a| {
                const va = zm.loadArr3(a);
                const vb = zm.loadArr3(b);
                const r = zm.dot3(va, vb) * (zm.f32x4s(0.1) * zm.cross3(va, vb) + zm.f32x4s(1.0));
                std.mem.doNotOptimizeAway(&r);
            }
        }
    }

    // Scalar version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const d = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
                    const r = [3]f32{
                        d * (0.1 * (a[1] * b[2] - a[2] * b[1]) + 1.0),
                        d * (0.1 * (a[2] * b[0] - a[0] * b[2]) + 1.0),
                        d * (0.1 * (a[0] * b[1] - a[1] * b[0]) + 1.0),
                    };
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_s = elapsedSeconds(&timer);

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const va = zm.loadArr3(a);
                    const vb = zm.loadArr3(b);
                    const r = zm.dot3(va, vb) * (zm.f32x4s(0.1) * zm.cross3(va, vb) + zm.f32x4s(1.0));
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_s = elapsedSeconds(&timer);

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}

/// Benchmarks quaternion multiplication over every pair drawn from two lists of 256 random
/// `[4]f32` values.
///
/// After a warm-up, `count` passes are timed with explicit scalar arithmetic and then `count`
/// passes with `zm.qmul`; both timings are printed on one line.
/// Returns an error if a list append fails or the timer cannot be started.
noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    std.debug.print("{s:>42} - ", .{"quaternion mul benchmark (AOS)"});

    var data0 = std.ArrayList([4]f32).init(allocator);
    defer data0.deinit();
    var data1 = std.ArrayList([4]f32).init(allocator);
    defer data1.deinit();

    for (0..256) |_| {
        try data0.append(randomArray(4));
        try data1.append(randomArray(4));
    }

    // Warmup, fills L1 cache.
    for (0..100) |_| {
        for (data1.items) |b| {
            for (data0.items) |a| {
                const va = zm.loadArr4(a);
                const vb = zm.loadArr4(b);
                const r = zm.qmul(va, vb);
                std.mem.doNotOptimizeAway(&r);
            }
        }
    }

    // Scalar version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const r = [4]f32{
                        (b[3] * a[0]) + (b[0] * a[3]) + (b[1] * a[2]) - (b[2] * a[1]),
                        (b[3] * a[1]) - (b[0] * a[2]) + (b[1] * a[3]) + (b[2] * a[0]),
                        (b[3] * a[2]) + (b[0] * a[1]) - (b[1] * a[0]) + (b[2] * a[3]),
                        (b[3] * a[3]) - (b[0] * a[0]) - (b[1] * a[1]) - (b[2] * a[2]),
                    };
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_s = elapsedSeconds(&timer);

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const va = zm.loadArr4(a);
                    const vb = zm.loadArr4(b);
                    const r = zm.qmul(va, vb);
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_s = elapsedSeconds(&timer);

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}

/// Benchmarks `y = sin(sqrt(x * x + z * z) - t)` evaluated over a 1024 x 1024 grid, `count`
/// times, advancing `t` by 0.001 after each pass.
///
/// The scalar version evaluates four x positions per inner iteration; the zmath version
/// evaluates `zm.veclen(zm.F32x16)` positions per inner iteration. Both timings are printed on
/// one line. Returns an error if the timer cannot be started.
noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    // No allocation happens here; the parameter keeps the signature uniform with the other benchmarks.
    _ = allocator;
    std.debug.print("{s:>42} - ", .{"wave benchmark (SOA)"});

    const grid_size = 1024;

    // Scalar version.
    {
        var t: f32 = 0.0;

        const scale: f32 = 0.05;

        var timer = try Timer.start();

        for (0..count) |_| {
            var z_index: i32 = 0;
            while (z_index < grid_size) : (z_index += 1) {
                const z = scale * @as(f32, @floatFromInt(z_index - grid_size / 2));

                var x_index: i32 = 0;
                while (x_index < grid_size) : (x_index += 4) {
                    const x0 = scale * @as(f32, @floatFromInt(x_index + 0 - grid_size / 2));
                    const x1 = scale * @as(f32, @floatFromInt(x_index + 1 - grid_size / 2));
                    const x2 = scale * @as(f32, @floatFromInt(x_index + 2 - grid_size / 2));
                    const x3 = scale * @as(f32, @floatFromInt(x_index + 3 - grid_size / 2));

                    const d0 = zm.sqrt(x0 * x0 + z * z);
                    const d1 = zm.sqrt(x1 * x1 + z * z);
                    const d2 = zm.sqrt(x2 * x2 + z * z);
                    const d3 = zm.sqrt(x3 * x3 + z * z);

                    const y0 = zm.sin(d0 - t);
                    const y1 = zm.sin(d1 - t);
                    const y2 = zm.sin(d2 - t);
                    const y3 = zm.sin(d3 - t);

                    std.mem.doNotOptimizeAway(&y0);
                    std.mem.doNotOptimizeAway(&y1);
                    std.mem.doNotOptimizeAway(&y2);
                    std.mem.doNotOptimizeAway(&y3);
                }
            }
            t += 0.001;
        }
        const elapsed_s = elapsedSeconds(&timer);

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath version.
    {
        const T = zm.F32x16;

        const static = struct {
            const offsets = [16]f32{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
        };
        // Per-lane x index offsets, added (scaled) to the splatted base x below.
        const voffset = zm.load(static.offsets[0..], T, 0);
        var vt = zm.splat(T, 0.0);

        const scale: f32 = 0.05;

        var timer = try Timer.start();

        for (0..count) |_| {
            var z_index: i32 = 0;
            while (z_index < grid_size) : (z_index += 1) {
                const z = scale * @as(f32, @floatFromInt(z_index - grid_size / 2));
                const vz = zm.splat(T, z);

                var x_index: i32 = 0;
                while (x_index < grid_size) : (x_index += zm.veclen(T)) {
                    const x = scale * @as(f32, @floatFromInt(x_index - grid_size / 2));
                    const vx = zm.splat(T, x) + voffset * zm.splat(T, scale);

                    const d = zm.sqrt(vx * vx + vz * vz);

                    const vy = zm.sin(d - vt);

                    std.mem.doNotOptimizeAway(&vy);
                }
            }
            vt += zm.splat(T, 0.001);
        }
        const elapsed_s = elapsedSeconds(&timer);

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}