libs/zmath/src/benchmark.zig at main · altagos.dev/rayray

altagos.dev / rayray
this repo has no description
rayray / libs / zmath / src / benchmark.zig
at main 19 kB view raw
  1// -------------------------------------------------------------------------------------------------
  2// zmath - benchmarks
  3// -------------------------------------------------------------------------------------------------
  4// 'zig build benchmark -Doptimize=ReleaseFast' will build and run the benchmarks with all optimisations.
  5//
  6// -------------------------------------------------------------------------------------------------
  7// 'AMD Ryzen 9 3950X 16-Core Processor', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f, ReleaseFast
  8// -------------------------------------------------------------------------------------------------
  9//                matrix mul benchmark (AOS) - scalar version: 1.5880s, zmath version: 1.0642s
 10//       cross3, scale, bias benchmark (AOS) - scalar version: 0.9318s, zmath version: 0.6888s
 11// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.2258s, zmath version: 1.1095s
 12//            quaternion mul benchmark (AOS) - scalar version: 1.4123s, zmath version: 0.6958s
 13//                      wave benchmark (SOA) - scalar version: 4.8165s, zmath version: 0.7338s
 14//
 15// -------------------------------------------------------------------------------------------------
 16// 'AMD Ryzen 7 5800X 8-Core Processor', Linux 5.17.14, Zig 0.10.0-dev.2624+d506275a0, ReleaseFast
 17// -------------------------------------------------------------------------------------------------
 18//                matrix mul benchmark (AOS) - scalar version: 1.3672s, zmath version: 0.8617s
 19//       cross3, scale, bias benchmark (AOS) - scalar version: 0.6586s, zmath version: 0.4803s
 20// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.0620s, zmath version: 0.8942s
 21//            quaternion mul benchmark (AOS) - scalar version: 1.1324s, zmath version: 0.6064s
 22//                      wave benchmark (SOA) - scalar version: 3.6598s, zmath version: 0.4231s
 23//
 24// -------------------------------------------------------------------------------------------------
 25// 'Apple M1 Max', macOS Version 12.4, Zig 0.10.0-dev.2657+74442f350, ReleaseFast
 26// -------------------------------------------------------------------------------------------------
 27//                matrix mul benchmark (AOS) - scalar version: 1.0297s, zmath version: 1.0538s
 28//       cross3, scale, bias benchmark (AOS) - scalar version: 0.6294s, zmath version: 0.6532s
 29// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 0.9807s, zmath version: 1.0988s
 30//            quaternion mul benchmark (AOS) - scalar version: 1.5413s, zmath version: 0.7800s
 31//                      wave benchmark (SOA) - scalar version: 3.4220s, zmath version: 1.0255s
 32//
 33// -------------------------------------------------------------------------------------------------
 34// '11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f, ReleaseFast
 35// -------------------------------------------------------------------------------------------------
 36//                matrix mul benchmark (AOS) - scalar version: 2.2308s, zmath version: 0.9376s
 37//       cross3, scale, bias benchmark (AOS) - scalar version: 1.0821s, zmath version: 0.5110s
 38// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.6580s, zmath version: 0.9167s
 39//            quaternion mul benchmark (AOS) - scalar version: 2.0139s, zmath version: 0.5856s
 40//                      wave benchmark (SOA) - scalar version: 3.7832s, zmath version: 0.3642s
 41//
 42// -------------------------------------------------------------------------------------------------
 43
 44pub fn main() !void {
 45    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
 46    defer _ = gpa.deinit();
 47    const allocator = gpa.allocator();
 48
 49    // m = mul(ma, mb); data set fits in L1 cache; AOS data layout.
 50    try mat4MulBenchmark(allocator, 100_000);
 51
 52    // v = 0.01 * cross3(va, vb) + vec3(1.0); data set fits in L1 cache; AOS data layout.
 53    try cross3ScaleBiasBenchmark(allocator, 10_000);
 54
 55    // v = dot3(va, vb) * (0.1 * cross3(va, vb) + vec3(1.0)); data set fits in L1 cache; AOS data layout.
 56    try cross3Dot3ScaleBiasBenchmark(allocator, 10_000);
 57
 58    // q = qmul(qa, qb); data set fits in L1 cache; AOS data layout.
 59    try quatBenchmark(allocator, 10_000);
 60
 61    // d = sqrt(x * x + z * z); y = sin(d - t); SOA layout.
 62    try waveBenchmark(allocator, 1_000);
 63}
 64
 65const std = @import("std");
 66const time = std.time;
 67const Timer = time.Timer;
 68const zm = @import("zmath");
 69
 70var prng = std.Random.DefaultPrng.init(0);
 71const random = prng.random();
 72
 73noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
 74    std.debug.print("\n", .{});
 75    std.debug.print("{s:>42} - ", .{"matrix mul benchmark (AOS)"});
 76
 77    var data0 = std.ArrayList([16]f32).init(allocator);
 78    defer data0.deinit();
 79    var data1 = std.ArrayList([16]f32).init(allocator);
 80    defer data1.deinit();
 81
 82    var i: usize = 0;
 83    while (i < 64) : (i += 1) {
 84        try data0.append([16]f32{
 85            random.float(f32), random.float(f32), random.float(f32), random.float(f32),
 86            random.float(f32), random.float(f32), random.float(f32), random.float(f32),
 87            random.float(f32), random.float(f32), random.float(f32), random.float(f32),
 88            random.float(f32), random.float(f32), random.float(f32), random.float(f32),
 89        });
 90        try data1.append([16]f32{
 91            random.float(f32), random.float(f32), random.float(f32), random.float(f32),
 92            random.float(f32), random.float(f32), random.float(f32), random.float(f32),
 93            random.float(f32), random.float(f32), random.float(f32), random.float(f32),
 94            random.float(f32), random.float(f32), random.float(f32), random.float(f32),
 95        });
 96    }
 97
 98    // Warmup, fills L1 cache.
 99    i = 0;
100    while (i < 100) : (i += 1) {
101        for (data1.items) |b| {
102            for (data0.items) |a| {
103                const ma = zm.loadMat(a[0..]);
104                const mb = zm.loadMat(b[0..]);
105                const r = zm.mul(ma, mb);
106                std.mem.doNotOptimizeAway(&r);
107            }
108        }
109    }
110
111    {
112        i = 0;
113        var timer = try Timer.start();
114        const start = timer.lap();
115        while (i < count) : (i += 1) {
116            for (data1.items) |b| {
117                for (data0.items) |a| {
118                    const r = [16]f32{
119                        a[0] * b[0] + a[1] * b[4] + a[2] * b[8] + a[3] * b[12],
120                        a[0] * b[1] + a[1] * b[5] + a[2] * b[9] + a[3] * b[13],
121                        a[0] * b[2] + a[1] * b[6] + a[2] * b[10] + a[3] * b[14],
122                        a[0] * b[3] + a[1] * b[7] + a[2] * b[11] + a[3] * b[15],
123                        a[4] * b[0] + a[5] * b[4] + a[6] * b[8] + a[7] * b[12],
124                        a[4] * b[1] + a[5] * b[5] + a[6] * b[9] + a[7] * b[13],
125                        a[4] * b[2] + a[5] * b[6] + a[6] * b[10] + a[7] * b[14],
126                        a[4] * b[3] + a[5] * b[7] + a[6] * b[11] + a[7] * b[15],
127                        a[8] * b[0] + a[9] * b[4] + a[10] * b[8] + a[11] * b[12],
128                        a[8] * b[1] + a[9] * b[5] + a[10] * b[9] + a[11] * b[13],
129                        a[8] * b[2] + a[9] * b[6] + a[10] * b[10] + a[11] * b[14],
130                        a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11] * b[15],
131                        a[12] * b[0] + a[13] * b[4] + a[14] * b[8] + a[15] * b[12],
132                        a[12] * b[1] + a[13] * b[5] + a[14] * b[9] + a[15] * b[13],
133                        a[12] * b[2] + a[13] * b[6] + a[14] * b[10] + a[15] * b[14],
134                        a[12] * b[3] + a[13] * b[7] + a[14] * b[11] + a[15] * b[15],
135                    };
136                    std.mem.doNotOptimizeAway(&r);
137                }
138            }
139        }
140        const end = timer.read();
141        const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
142
143        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
144    }
145
146    {
147        i = 0;
148        var timer = try Timer.start();
149        const start = timer.lap();
150        while (i < count) : (i += 1) {
151            for (data1.items) |b| {
152                for (data0.items) |a| {
153                    const ma = zm.loadMat(a[0..]);
154                    const mb = zm.loadMat(b[0..]);
155                    const r = zm.mul(ma, mb);
156                    std.mem.doNotOptimizeAway(&r);
157                }
158            }
159        }
160        const end = timer.read();
161        const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
162
163        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
164    }
165}
166
/// Benchmarks `0.01 * cross3(a, b) + 1.0` on `[3]f32` vectors (AOS layout).
///
/// Two lists of 256 random vectors are generated and the expression is evaluated for every
/// (a, b) pair, first with scalar code and then with `zm.loadArr3` + `zm.cross3`. Both elapsed
/// times are printed on a single line with `std.debug.print`.
///
/// `allocator` backs the two input lists. `count` is the number of timed passes over all
/// pairs, applied to each of the two variants.
/// Fails if an append cannot allocate or if `Timer.start()` fails.
noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    std.debug.print("{s:>42} - ", .{"cross3, scale, bias benchmark (AOS)"});

    var data0 = std.ArrayList([3]f32).init(allocator);
    defer data0.deinit();
    var data1 = std.ArrayList([3]f32).init(allocator);
    defer data1.deinit();

    for (0..256) |_| {
        try data0.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) });
        try data1.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) });
    }

    // Warmup, fills L1 cache.
    for (0..100) |_| {
        for (data1.items) |b| {
            for (data0.items) |a| {
                const va = zm.loadArr3(a);
                const vb = zm.loadArr3(b);
                const cp = zm.f32x4s(0.01) * zm.cross3(va, vb) + zm.f32x4s(1.0);
                std.mem.doNotOptimizeAway(&cp);
            }
        }
    }

    // Scalar reference implementation.
    {
        var pass: usize = 0;
        // `Timer.start()` records the reference instant, so `read()` alone is the elapsed time.
        // (`lap()` returns a duration, not a timestamp, and must not be subtracted from it.)
        var timer = try Timer.start();
        while (pass < count) : (pass += 1) {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const r = [3]f32{
                        0.01 * (a[1] * b[2] - a[2] * b[1]) + 1.0,
                        0.01 * (a[2] * b[0] - a[0] * b[2]) + 1.0,
                        0.01 * (a[0] * b[1] - a[1] * b[0]) + 1.0,
                    };
                    // Keep the result observable so the loop body is not eliminated.
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath implementation of the same expression.
    {
        var pass: usize = 0;
        var timer = try Timer.start();
        while (pass < count) : (pass += 1) {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const va = zm.loadArr3(a);
                    const vb = zm.loadArr3(b);
                    const cp = zm.f32x4s(0.01) * zm.cross3(va, vb) + zm.f32x4s(1.0);
                    std.mem.doNotOptimizeAway(&cp);
                }
            }
        }
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}
236
/// Benchmarks `dot3(a, b) * (0.1 * cross3(a, b) + 1.0)` on `[3]f32` vectors (AOS layout).
///
/// Two lists of 256 random vectors are generated and the expression is evaluated for every
/// (a, b) pair, first with scalar code and then with `zm.dot3` / `zm.cross3`. Both elapsed
/// times are printed on a single line with `std.debug.print`.
///
/// `allocator` backs the two input lists. `count` is the number of timed passes over all
/// pairs, applied to each of the two variants.
/// Fails if an append cannot allocate or if `Timer.start()` fails.
noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    std.debug.print("{s:>42} - ", .{"cross3, dot3, scale, bias benchmark (AOS)"});

    var data0 = std.ArrayList([3]f32).init(allocator);
    defer data0.deinit();
    var data1 = std.ArrayList([3]f32).init(allocator);
    defer data1.deinit();

    for (0..256) |_| {
        try data0.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) });
        try data1.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) });
    }

    // Warmup, fills L1 cache. Runs the same expression as the timed zmath loop below.
    for (0..100) |_| {
        for (data1.items) |b| {
            for (data0.items) |a| {
                const va = zm.loadArr3(a);
                const vb = zm.loadArr3(b);
                const r = zm.dot3(va, vb) * (zm.f32x4s(0.1) * zm.cross3(va, vb) + zm.f32x4s(1.0));
                std.mem.doNotOptimizeAway(&r);
            }
        }
    }

    // Scalar reference implementation.
    {
        var pass: usize = 0;
        // `Timer.start()` records the reference instant, so `read()` alone is the elapsed time.
        // (`lap()` returns a duration, not a timestamp, and must not be subtracted from it.)
        var timer = try Timer.start();
        while (pass < count) : (pass += 1) {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const d = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
                    const r = [3]f32{
                        d * (0.1 * (a[1] * b[2] - a[2] * b[1]) + 1.0),
                        d * (0.1 * (a[2] * b[0] - a[0] * b[2]) + 1.0),
                        d * (0.1 * (a[0] * b[1] - a[1] * b[0]) + 1.0),
                    };
                    // Keep the result observable so the loop body is not eliminated.
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath implementation of the same expression.
    {
        var pass: usize = 0;
        var timer = try Timer.start();
        while (pass < count) : (pass += 1) {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const va = zm.loadArr3(a);
                    const vb = zm.loadArr3(b);
                    const r = zm.dot3(va, vb) * (zm.f32x4s(0.1) * zm.cross3(va, vb) + zm.f32x4s(1.0));
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}
307
/// Benchmarks quaternion multiplication on `[4]f32` quaternions (AOS layout).
///
/// Two lists of 256 random quaternions are generated and every (a, b) pair is multiplied,
/// first with scalar code and then with `zm.loadArr4` + `zm.qmul`. Both elapsed times are
/// printed on a single line with `std.debug.print`.
///
/// `allocator` backs the two input lists. `count` is the number of timed passes over all
/// pairs, applied to each of the two variants.
/// Fails if an append cannot allocate or if `Timer.start()` fails.
noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    std.debug.print("{s:>42} - ", .{"quaternion mul benchmark (AOS)"});

    var data0 = std.ArrayList([4]f32).init(allocator);
    defer data0.deinit();
    var data1 = std.ArrayList([4]f32).init(allocator);
    defer data1.deinit();

    for (0..256) |_| {
        try data0.append([4]f32{ random.float(f32), random.float(f32), random.float(f32), random.float(f32) });
        try data1.append([4]f32{ random.float(f32), random.float(f32), random.float(f32), random.float(f32) });
    }

    // Warmup, fills L1 cache.
    for (0..100) |_| {
        for (data1.items) |b| {
            for (data0.items) |a| {
                const va = zm.loadArr4(a);
                const vb = zm.loadArr4(b);
                const r = zm.qmul(va, vb);
                std.mem.doNotOptimizeAway(&r);
            }
        }
    }

    // Scalar reference implementation.
    {
        var pass: usize = 0;
        // `Timer.start()` records the reference instant, so `read()` alone is the elapsed time.
        // (`lap()` returns a duration, not a timestamp, and must not be subtracted from it.)
        var timer = try Timer.start();
        while (pass < count) : (pass += 1) {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const r = [4]f32{
                        (b[3] * a[0]) + (b[0] * a[3]) + (b[1] * a[2]) - (b[2] * a[1]),
                        (b[3] * a[1]) - (b[0] * a[2]) + (b[1] * a[3]) + (b[2] * a[0]),
                        (b[3] * a[2]) + (b[0] * a[1]) - (b[1] * a[0]) + (b[2] * a[3]),
                        (b[3] * a[3]) - (b[0] * a[0]) - (b[1] * a[1]) - (b[2] * a[2]),
                    };
                    // Keep the result observable so the loop body is not eliminated.
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath implementation of the same product.
    {
        var pass: usize = 0;
        var timer = try Timer.start();
        while (pass < count) : (pass += 1) {
            for (data1.items) |b| {
                for (data0.items) |a| {
                    const va = zm.loadArr4(a);
                    const vb = zm.loadArr4(b);
                    const r = zm.qmul(va, vb);
                    std.mem.doNotOptimizeAway(&r);
                }
            }
        }
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}
378
/// Benchmarks `d = sqrt(x * x + z * z); y = sin(d - t)` over a 1024 x 1024 grid (SOA layout).
///
/// The scalar variant evaluates four consecutive x positions per inner iteration using `f32`
/// values; the zmath variant evaluates one `zm.F32x16` vector per inner iteration. `t` is
/// advanced by 0.001 after each full sweep of the grid. Both elapsed times are printed on a
/// single line with `std.debug.print`.
///
/// `allocator` is unused and kept only so the signature matches the other benchmarks.
/// `count` is the number of timed grid sweeps, applied to each of the two variants.
/// Fails if `Timer.start()` fails.
noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
    _ = allocator;
    std.debug.print("{s:>42} - ", .{"wave benchmark (SOA)"});

    const grid_size = 1024;

    // Scalar variant: four x positions per inner iteration.
    {
        var t: f32 = 0.0;

        const scale: f32 = 0.05;

        // `Timer.start()` records the reference instant, so `read()` alone is the elapsed time.
        // (`lap()` returns a duration, not a timestamp, and must not be subtracted from it.)
        var timer = try Timer.start();

        var iter: usize = 0;
        while (iter < count) : (iter += 1) {
            var z_index: i32 = 0;
            while (z_index < grid_size) : (z_index += 1) {
                // Grid coordinates are centred on zero: index - grid_size / 2, then scaled.
                const z = scale * @as(f32, @floatFromInt(z_index - grid_size / 2));

                var x_index: i32 = 0;
                while (x_index < grid_size) : (x_index += 4) {
                    const x0 = scale * @as(f32, @floatFromInt(x_index + 0 - grid_size / 2));
                    const x1 = scale * @as(f32, @floatFromInt(x_index + 1 - grid_size / 2));
                    const x2 = scale * @as(f32, @floatFromInt(x_index + 2 - grid_size / 2));
                    const x3 = scale * @as(f32, @floatFromInt(x_index + 3 - grid_size / 2));

                    const d0 = zm.sqrt(x0 * x0 + z * z);
                    const d1 = zm.sqrt(x1 * x1 + z * z);
                    const d2 = zm.sqrt(x2 * x2 + z * z);
                    const d3 = zm.sqrt(x3 * x3 + z * z);

                    const y0 = zm.sin(d0 - t);
                    const y1 = zm.sin(d1 - t);
                    const y2 = zm.sin(d2 - t);
                    const y3 = zm.sin(d3 - t);

                    // Keep the results observable so the loop body is not eliminated.
                    std.mem.doNotOptimizeAway(&y0);
                    std.mem.doNotOptimizeAway(&y1);
                    std.mem.doNotOptimizeAway(&y2);
                    std.mem.doNotOptimizeAway(&y3);
                }
            }
            t += 0.001;
        }
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
    }

    // zmath variant: one `T` vector of x positions per inner iteration.
    {
        const T = zm.F32x16;

        const static = struct {
            const offsets = [16]f32{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
        };
        // Per-lane x index offsets, scaled below to give each lane its own x coordinate.
        const voffset = zm.load(static.offsets[0..], T, 0);
        var vt = zm.splat(T, 0.0);

        const scale: f32 = 0.05;

        var timer = try Timer.start();

        var iter: usize = 0;
        while (iter < count) : (iter += 1) {
            var z_index: i32 = 0;
            while (z_index < grid_size) : (z_index += 1) {
                const z = scale * @as(f32, @floatFromInt(z_index - grid_size / 2));
                const vz = zm.splat(T, z);

                var x_index: i32 = 0;
                while (x_index < grid_size) : (x_index += zm.veclen(T)) {
                    const x = scale * @as(f32, @floatFromInt(x_index - grid_size / 2));
                    const vx = zm.splat(T, x) + voffset * zm.splat(T, scale);

                    const d = zm.sqrt(vx * vx + vz * vz);

                    const vy = zm.sin(d - vt);

                    std.mem.doNotOptimizeAway(&vy);
                }
            }
            vt += zm.splat(T, 0.001);
        }
        const elapsed_ns = timer.read();
        const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;

        std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
    }
}