this repo has no description
1// -------------------------------------------------------------------------------------------------
2// zmath - benchmarks
3// -------------------------------------------------------------------------------------------------
// 'zig build benchmark -Doptimize=ReleaseFast' will build and run benchmarks with all optimisations.
5//
6// -------------------------------------------------------------------------------------------------
7// 'AMD Ryzen 9 3950X 16-Core Processor', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f, ReleaseFast
8// -------------------------------------------------------------------------------------------------
9// matrix mul benchmark (AOS) - scalar version: 1.5880s, zmath version: 1.0642s
10// cross3, scale, bias benchmark (AOS) - scalar version: 0.9318s, zmath version: 0.6888s
11// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.2258s, zmath version: 1.1095s
12// quaternion mul benchmark (AOS) - scalar version: 1.4123s, zmath version: 0.6958s
13// wave benchmark (SOA) - scalar version: 4.8165s, zmath version: 0.7338s
14//
15// -------------------------------------------------------------------------------------------------
// 'AMD Ryzen 7 5800X 8-Core Processor', Linux 5.17.14, Zig 0.10.0-dev.2624+d506275a0, ReleaseFast
17// -------------------------------------------------------------------------------------------------
18// matrix mul benchmark (AOS) - scalar version: 1.3672s, zmath version: 0.8617s
19// cross3, scale, bias benchmark (AOS) - scalar version: 0.6586s, zmath version: 0.4803s
20// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.0620s, zmath version: 0.8942s
21// quaternion mul benchmark (AOS) - scalar version: 1.1324s, zmath version: 0.6064s
22// wave benchmark (SOA) - scalar version: 3.6598s, zmath version: 0.4231s
23//
24// -------------------------------------------------------------------------------------------------
25// 'Apple M1 Max', macOS Version 12.4, Zig 0.10.0-dev.2657+74442f350, ReleaseFast
26// -------------------------------------------------------------------------------------------------
27// matrix mul benchmark (AOS) - scalar version: 1.0297s, zmath version: 1.0538s
28// cross3, scale, bias benchmark (AOS) - scalar version: 0.6294s, zmath version: 0.6532s
29// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 0.9807s, zmath version: 1.0988s
30// quaternion mul benchmark (AOS) - scalar version: 1.5413s, zmath version: 0.7800s
31// wave benchmark (SOA) - scalar version: 3.4220s, zmath version: 1.0255s
32//
33// -------------------------------------------------------------------------------------------------
34// '11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f, ReleaseFast
35// -------------------------------------------------------------------------------------------------
36// matrix mul benchmark (AOS) - scalar version: 2.2308s, zmath version: 0.9376s
37// cross3, scale, bias benchmark (AOS) - scalar version: 1.0821s, zmath version: 0.5110s
38// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.6580s, zmath version: 0.9167s
39// quaternion mul benchmark (AOS) - scalar version: 2.0139s, zmath version: 0.5856s
40// wave benchmark (SOA) - scalar version: 3.7832s, zmath version: 0.3642s
41//
42// -------------------------------------------------------------------------------------------------
43
/// Program entry point: runs every benchmark once, in a fixed order.
///
/// A single `GeneralPurposeAllocator` backs all benchmarks and is torn down when
/// `main` returns. Any error returned by a benchmark is propagated to the caller.
pub fn main() !void {
    var backing = std.heap.GeneralPurposeAllocator(.{}){};
    defer {
        _ = backing.deinit();
    }
    const gpa = backing.allocator();

    // AOS data layout, data set fits in L1 cache: m = mul(ma, mb).
    try mat4MulBenchmark(gpa, 100_000);

    // AOS data layout, data set fits in L1 cache: v = 0.01 * cross3(va, vb) + vec3(1.0).
    try cross3ScaleBiasBenchmark(gpa, 10_000);

    // AOS data layout, data set fits in L1 cache:
    // v = dot3(va, vb) * (0.1 * cross3(va, vb) + vec3(1.0)).
    try cross3Dot3ScaleBiasBenchmark(gpa, 10_000);

    // AOS data layout, data set fits in L1 cache: q = qmul(qa, qb).
    try quatBenchmark(gpa, 10_000);

    // SOA layout: d = sqrt(x * x + z * z); y = sin(d - t).
    try waveBenchmark(gpa, 1_000);
}
64
65const std = @import("std");
66const time = std.time;
67const Timer = time.Timer;
68const zm = @import("zmath");
69
// Pseudo-random generator shared by all benchmarks; it is always seeded with the constant 0.
var prng = std.Random.DefaultPrng.init(0);
// `std.Random` interface over `prng`; the benchmarks use it to fill their input data sets.
const random = prng.random();
72
/// Benchmarks 4x4 matrix multiplication on an AOS data set: a hand-written scalar
/// product versus `zm.mul` on matrices loaded with `zm.loadMat`.
///
/// Two lists of 64 `[16]f32` matrices (4 KiB per list) are filled from the file-level
/// `random` generator. Every one of the `count` passes multiplies each matrix of the
/// first list with each matrix of the second one (64 * 64 products per pass).
///
/// Prints a blank line followed by one result line (via `std.debug.print`) holding the
/// elapsed seconds of both versions.
///
/// Errors: propagates allocation failures from building the data set and the error
/// returned by `Timer.start` when no timer is available.
noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    std.debug.print("\n", .{});
    std.debug.print("{s:>42} - ", .{"matrix mul benchmark (AOS)"});

    const matrix_count = 64;
    const warmup_passes = 100;

    var data0 = std.ArrayList([16]f32).init(allocator);
    defer data0.deinit();
    var data1 = std.ArrayList([16]f32).init(allocator);
    defer data1.deinit();

    // Same draw order as writing the elements out by hand: 16 values for `data0`,
    // then 16 values for `data1`, repeated for every matrix pair.
    for (0..matrix_count) |_| {
        var m0: [16]f32 = undefined;
        for (&m0) |*e| e.* = random.float(f32);
        try data0.append(m0);

        var m1: [16]f32 = undefined;
        for (&m1) |*e| e.* = random.float(f32);
        try data1.append(m1);
    }

    // Warmup, fills L1 cache.
    for (0..warmup_passes) |_| {
        for (data1.items) |b| {
            for (data0.items) |a| {
                const ma = zm.loadMat(a[0..]);
                const mb = zm.loadMat(b[0..]);
                const r = zm.mul(ma, mb);
                std.mem.doNotOptimizeAway(&r);
            }
        }
    }

    // Scalar version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const r = [16]f32{
                        a[0] * b[0] + a[1] * b[4] + a[2] * b[8] + a[3] * b[12],
                        a[0] * b[1] + a[1] * b[5] + a[2] * b[9] + a[3] * b[13],
                        a[0] * b[2] + a[1] * b[6] + a[2] * b[10] + a[3] * b[14],
                        a[0] * b[3] + a[1] * b[7] + a[2] * b[11] + a[3] * b[15],
                        a[4] * b[0] + a[5] * b[4] + a[6] * b[8] + a[7] * b[12],
                        a[4] * b[1] + a[5] * b[5] + a[6] * b[9] + a[7] * b[13],
                        a[4] * b[2] + a[5] * b[6] + a[6] * b[10] + a[7] * b[14],
                        a[4] * b[3] + a[5] * b[7] + a[6] * b[11] + a[7] * b[15],
                        a[8] * b[0] + a[9] * b[4] + a[10] * b[8] + a[11] * b[12],
                        a[8] * b[1] + a[9] * b[5] + a[10] * b[9] + a[11] * b[13],
                        a[8] * b[2] + a[9] * b[6] + a[10] * b[10] + a[11] * b[14],
                        a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11] * b[15],
                        a[12] * b[0] + a[13] * b[4] + a[14] * b[8] + a[15] * b[12],
                        a[12] * b[1] + a[13] * b[5] + a[14] * b[9] + a[15] * b[13],
                        a[12] * b[2] + a[13] * b[6] + a[14] * b[10] + a[15] * b[14],
                        a[12] * b[3] + a[13] * b[7] + a[14] * b[11] + a[15] * b[15],
                    };
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        // `read` reports the nanoseconds elapsed since `Timer.start`. No `lap` value is
        // subtracted: `lap` returns the time since start (and resets the timer), so it is
        // not a start timestamp and subtracting it could underflow on a very short run.
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const ma = zm.loadMat(a[0..]);
                    const mb = zm.loadMat(b[0..]);
                    const r = zm.mul(ma, mb);
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}
166
/// Benchmarks `0.01 * cross3(a, b) + 1.0` on an AOS data set: a hand-written scalar
/// version versus `zm.cross3` on vectors loaded with `zm.loadArr3`.
///
/// Two lists of 256 `[3]f32` vectors (3 KiB per list) are filled from the file-level
/// `random` generator. Every one of the `count` passes evaluates the expression for each
/// pair of vectors taken from the two lists (256 * 256 evaluations per pass).
///
/// Prints one result line (via `std.debug.print`) holding the elapsed seconds of both
/// versions.
///
/// Errors: propagates allocation failures from building the data set and the error
/// returned by `Timer.start` when no timer is available.
noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    std.debug.print("{s:>42} - ", .{"cross3, scale, bias benchmark (AOS)"});

    const vector_count = 256;
    const warmup_passes = 100;

    var data0 = std.ArrayList([3]f32).init(allocator);
    defer data0.deinit();
    var data1 = std.ArrayList([3]f32).init(allocator);
    defer data1.deinit();

    for (0..vector_count) |_| {
        try data0.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) });
        try data1.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) });
    }

    // Warmup, fills L1 cache.
    for (0..warmup_passes) |_| {
        for (data1.items) |b| {
            for (data0.items) |a| {
                const va = zm.loadArr3(a);
                const vb = zm.loadArr3(b);
                const cp = zm.f32x4s(0.01) * zm.cross3(va, vb) + zm.f32x4s(1.0);
                std.mem.doNotOptimizeAway(&cp);
            }
        }
    }

    // Scalar version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const r = [3]f32{
                        0.01 * (a[1] * b[2] - a[2] * b[1]) + 1.0,
                        0.01 * (a[2] * b[0] - a[0] * b[2]) + 1.0,
                        0.01 * (a[0] * b[1] - a[1] * b[0]) + 1.0,
                    };
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        // `read` reports the nanoseconds elapsed since `Timer.start`. No `lap` value is
        // subtracted: `lap` returns the time since start (and resets the timer), so it is
        // not a start timestamp and subtracting it could underflow on a very short run.
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const va = zm.loadArr3(a);
                    const vb = zm.loadArr3(b);
                    const cp = zm.f32x4s(0.01) * zm.cross3(va, vb) + zm.f32x4s(1.0);
                    std.mem.doNotOptimizeAway(&cp);
                }
            }
        }
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}
236
/// Benchmarks `dot3(a, b) * (0.1 * cross3(a, b) + 1.0)` on an AOS data set: a
/// hand-written scalar version versus `zm.dot3` / `zm.cross3` on vectors loaded with
/// `zm.loadArr3`.
///
/// Two lists of 256 `[3]f32` vectors (3 KiB per list) are filled from the file-level
/// `random` generator. Every one of the `count` passes evaluates the expression for each
/// pair of vectors taken from the two lists (256 * 256 evaluations per pass).
///
/// Prints one result line (via `std.debug.print`) holding the elapsed seconds of both
/// versions.
///
/// Errors: propagates allocation failures from building the data set and the error
/// returned by `Timer.start` when no timer is available.
noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    std.debug.print("{s:>42} - ", .{"cross3, dot3, scale, bias benchmark (AOS)"});

    const vector_count = 256;
    const warmup_passes = 100;

    var data0 = std.ArrayList([3]f32).init(allocator);
    defer data0.deinit();
    var data1 = std.ArrayList([3]f32).init(allocator);
    defer data1.deinit();

    for (0..vector_count) |_| {
        try data0.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) });
        try data1.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) });
    }

    // Warmup, fills L1 cache.
    for (0..warmup_passes) |_| {
        for (data1.items) |b| {
            for (data0.items) |a| {
                const va = zm.loadArr3(a);
                const vb = zm.loadArr3(b);
                const r = (zm.dot3(va, vb) * (zm.f32x4s(0.1) * zm.cross3(va, vb) + zm.f32x4s(1.0)))[0];
                std.mem.doNotOptimizeAway(&r);
            }
        }
    }

    // Scalar version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const d = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
                    const r = [3]f32{
                        d * (0.1 * (a[1] * b[2] - a[2] * b[1]) + 1.0),
                        d * (0.1 * (a[2] * b[0] - a[0] * b[2]) + 1.0),
                        d * (0.1 * (a[0] * b[1] - a[1] * b[0]) + 1.0),
                    };
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        // `read` reports the nanoseconds elapsed since `Timer.start`. No `lap` value is
        // subtracted: `lap` returns the time since start (and resets the timer), so it is
        // not a start timestamp and subtracting it could underflow on a very short run.
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const va = zm.loadArr3(a);
                    const vb = zm.loadArr3(b);
                    const r = zm.dot3(va, vb) * (zm.f32x4s(0.1) * zm.cross3(va, vb) + zm.f32x4s(1.0));
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}
307
/// Benchmarks quaternion multiplication on an AOS data set: a hand-written scalar
/// product versus `zm.qmul` on quaternions loaded with `zm.loadArr4`.
///
/// Two lists of 256 `[4]f32` quaternions (4 KiB per list) are filled from the file-level
/// `random` generator. Every one of the `count` passes multiplies each quaternion of the
/// first list with each quaternion of the second one (256 * 256 products per pass).
///
/// Prints one result line (via `std.debug.print`) holding the elapsed seconds of both
/// versions.
///
/// Errors: propagates allocation failures from building the data set and the error
/// returned by `Timer.start` when no timer is available.
noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    std.debug.print("{s:>42} - ", .{"quaternion mul benchmark (AOS)"});

    const quat_count = 256;
    const warmup_passes = 100;

    var data0 = std.ArrayList([4]f32).init(allocator);
    defer data0.deinit();
    var data1 = std.ArrayList([4]f32).init(allocator);
    defer data1.deinit();

    for (0..quat_count) |_| {
        try data0.append([4]f32{ random.float(f32), random.float(f32), random.float(f32), random.float(f32) });
        try data1.append([4]f32{ random.float(f32), random.float(f32), random.float(f32), random.float(f32) });
    }

    // Warmup, fills L1 cache.
    for (0..warmup_passes) |_| {
        for (data1.items) |b| {
            for (data0.items) |a| {
                const va = zm.loadArr4(a);
                const vb = zm.loadArr4(b);
                const r = zm.qmul(va, vb);
                std.mem.doNotOptimizeAway(&r);
            }
        }
    }

    // Scalar version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const r = [4]f32{
                        (b[3] * a[0]) + (b[0] * a[3]) + (b[1] * a[2]) - (b[2] * a[1]),
                        (b[3] * a[1]) - (b[0] * a[2]) + (b[1] * a[3]) + (b[2] * a[0]),
                        (b[3] * a[2]) + (b[0] * a[1]) - (b[1] * a[0]) + (b[2] * a[3]),
                        (b[3] * a[3]) - (b[0] * a[0]) - (b[1] * a[1]) - (b[2] * a[2]),
                    };
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        // `read` reports the nanoseconds elapsed since `Timer.start`. No `lap` value is
        // subtracted: `lap` returns the time since start (and resets the timer), so it is
        // not a start timestamp and subtracting it could underflow on a very short run.
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath version.
    {
        var timer = try Timer.start();
        for (0..count) |_| {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const va = zm.loadArr4(a);
                    const vb = zm.loadArr4(b);
                    const r = zm.qmul(va, vb);
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}
378
/// Benchmarks `d = sqrt(x * x + z * z); y = sin(d - t)` evaluated over a 1024 x 1024
/// grid: a scalar version that handles four `x` samples per inner iteration versus a
/// `zm.F32x16` version that handles `zm.veclen(zm.F32x16)` samples per inner iteration.
///
/// The grid is swept `count` times; after every sweep `t` advances by 0.001. Grid
/// coordinates are the cell index minus half the grid size, multiplied by 0.05.
///
/// `allocator` is unused; it is accepted so the signature matches the other benchmarks.
///
/// Prints one result line (via `std.debug.print`) holding the elapsed seconds of both
/// versions.
///
/// Errors: propagates the error returned by `Timer.start` when no timer is available.
noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    _ = allocator;
    std.debug.print("{s:>42} - ", .{"wave benchmark (SOA)"});

    const grid_size = 1024;
    // Subtracted from every index so the coordinates are centred on zero.
    const half_grid = grid_size / 2;

    // Scalar version.
    {
        var t: f32 = 0.0;

        const scale: f32 = 0.05;

        var timer = try Timer.start();

        for (0..count) |_| {
            var z_index: i32 = 0;
            while (z_index < grid_size) : (z_index += 1) {
                const z = scale * @as(f32, @floatFromInt(z_index - half_grid));

                var x_index: i32 = 0;
                while (x_index < grid_size) : (x_index += 4) {
                    const x0 = scale * @as(f32, @floatFromInt(x_index + 0 - half_grid));
                    const x1 = scale * @as(f32, @floatFromInt(x_index + 1 - half_grid));
                    const x2 = scale * @as(f32, @floatFromInt(x_index + 2 - half_grid));
                    const x3 = scale * @as(f32, @floatFromInt(x_index + 3 - half_grid));

                    const d0 = zm.sqrt(x0 * x0 + z * z);
                    const d1 = zm.sqrt(x1 * x1 + z * z);
                    const d2 = zm.sqrt(x2 * x2 + z * z);
                    const d3 = zm.sqrt(x3 * x3 + z * z);

                    const y0 = zm.sin(d0 - t);
                    const y1 = zm.sin(d1 - t);
                    const y2 = zm.sin(d2 - t);
                    const y3 = zm.sin(d3 - t);

                    std.mem.doNotOptimizeAway(&y0);
                    std.mem.doNotOptimizeAway(&y1);
                    std.mem.doNotOptimizeAway(&y2);
                    std.mem.doNotOptimizeAway(&y3);
                }
            }
            t += 0.001;
        }
        // `read` reports the nanoseconds elapsed since `Timer.start`. No `lap` value is
        // subtracted: `lap` returns the time since start (and resets the timer), so it is
        // not a start timestamp and subtracting it could underflow on a very short run.
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath version.
    {
        const T = zm.F32x16;

        const static = struct {
            const offsets = [16]f32{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
        };
        // Per-lane index offsets added to the `x` coordinate of the first lane.
        const voffset = zm.load(static.offsets[0..], T, 0);
        var vt = zm.splat(T, 0.0);

        const scale: f32 = 0.05;

        var timer = try Timer.start();

        for (0..count) |_| {
            var z_index: i32 = 0;
            while (z_index < grid_size) : (z_index += 1) {
                const z = scale * @as(f32, @floatFromInt(z_index - half_grid));
                const vz = zm.splat(T, z);

                var x_index: i32 = 0;
                while (x_index < grid_size) : (x_index += zm.veclen(T)) {
                    const x = scale * @as(f32, @floatFromInt(x_index - half_grid));
                    const vx = zm.splat(T, x) + voffset * zm.splat(T, scale);

                    const d = zm.sqrt(vx * vx + vz * vz);

                    const vy = zm.sin(d - vt);

                    std.mem.doNotOptimizeAway(&vy);
                }
            }
            vt += zm.splat(T, 0.001);
        }
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}
469}