libs/zmath/src/zmath.zig at main · altagos.dev/rayray

altagos.dev / rayray
this repo has no description
rayray / libs / zmath / src / zmath.zig
at main 174 kB view raw
   1// ==============================================================================
   2//
   3// SIMD math library for game developers
   4// https://github.com/michal-z/zig-gamedev/tree/main/libs/zmath
   5//
   6// Should work on all OSes supported by Zig. Works on x86_64 and ARM.
   7// Provides ~140 optimized routines and ~70 extensive tests.
   8// Can be used with any graphics API.
   9//
  10// zmath uses row-major matrices, row vectors (each row vector is stored in a SIMD register).
  11// Handedness is determined by which function version is used (Rh vs. Lh),
  12// otherwise the function works with either left-handed or right-handed view coordinates.
  13//
  14// const va = f32x4(1.0, 2.0, 3.0, 1.0);
  15// const vb = f32x4(-1.0, 1.0, -1.0, 1.0);
  16// const v0 = va + vb - f32x4(0.0, 1.0, 0.0, 1.0) * f32x4s(3.0);
  17// const v1 = cross3(va, vb) + f32x4(1.0, 1.0, 1.0, 1.0);
  18// const v2 = va + dot3(va, vb) / v1; // dotN() returns scalar replicated on all vector components
  19//
  20// const m = rotationX(math.pi * 0.25);
  21// const v = f32x4(...);
  22// const v0 = mul(v, m); // 'v' treated as a row vector
  23// const v1 = mul(m, v); // 'v' treated as a column vector
  24// const f = m[row][column];
  25//
  26// const b = va < vb;
  27// if (all(b, 0)) { ... } // '0' means check all vector components; if all are 'true'
  28// if (all(b, 3)) { ... } // '3' means check first three vector components; if all first three are 'true'
  29// if (any(b, 0)) { ... } // '0' means check all vector components; if any is 'true'
  30// if (any(b, 3)) { ... } // '3' means check first three vector components; if any from first three is 'true'
  31//
  32// var v4 = load(mem[0..], F32x4, 0);
  33// var v8 = load(mem[100..], F32x8, 0);
  34// var v16 = load(mem[200..], F32x16, 0);
  35//
  36// var camera_position = [3]f32{ 1.0, 2.0, 3.0 };
  37// var cam_pos = loadArr3(camera_position);
  38// ...
  39// storeArr3(&camera_position, cam_pos);
  40//
  41// v4 = sin(v4); // SIMDx4
  42// v8 = cos(v8); // .x86_64 -> 2 x SIMDx4, .x86_64+avx+fma -> SIMDx8
  43// v16 = atan(v16); // .x86_64 -> 4 x SIMDx4, .x86_64+avx+fma -> 2 x SIMDx8, .x86_64+avx512f -> SIMDx16
  44//
  45// store(mem[0..], v4, 0);
  46// store(mem[100..], v8, 0);
  47// store(mem[200..], v16, 0);
  48//
  49// ------------------------------------------------------------------------------
  50// 1. Initialization functions
  51// ------------------------------------------------------------------------------
  52//
  53// f32x4(e0: f32, e1: f32, e2: f32, e3: f32) F32x4
  54// f32x8(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32) F32x8
  55// f32x16(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32,
  56//        e8: f32, e9: f32, ea: f32, eb: f32, ec: f32, ed: f32, ee: f32, ef: f32) F32x16
  57//
  58// f32x4s(e0: f32) F32x4
  59// f32x8s(e0: f32) F32x8
  60// f32x16s(e0: f32) F32x16
  61//
  62// boolx4(e0: bool, e1: bool, e2: bool, e3: bool) Boolx4
  63// boolx8(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool) Boolx8
  64// boolx16(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool,
  65//         e8: bool, e9: bool, ea: bool, eb: bool, ec: bool, ed: bool, ee: bool, ef: bool) Boolx16
  66//
  67// load(mem: []const f32, comptime T: type, comptime len: u32) T
  68// store(mem: []f32, v: anytype, comptime len: u32) void
  69//
  70// loadArr2(arr: [2]f32) F32x4
  71// loadArr2zw(arr: [2]f32, z: f32, w: f32) F32x4
  72// loadArr3(arr: [3]f32) F32x4
  73// loadArr3w(arr: [3]f32, w: f32) F32x4
  74// loadArr4(arr: [4]f32) F32x4
  75//
  76// storeArr2(arr: *[2]f32, v: F32x4) void
  77// storeArr3(arr: *[3]f32, v: F32x4) void
  78// storeArr4(arr: *[4]f32, v: F32x4) void
  79//
  80// arr3Ptr(ptr: anytype) *const [3]f32
  81// arrNPtr(ptr: anytype) [*]const f32
  82//
  83// splat(comptime T: type, value: f32) T
  84// splatInt(comptime T: type, value: u32) T
  85//
  86// ------------------------------------------------------------------------------
  87// 2. Functions that work on all vector components (F32xN = F32x4 or F32x8 or F32x16)
  88// ------------------------------------------------------------------------------
  89//
  90// all(vb: anytype, comptime len: u32) bool
  91// any(vb: anytype, comptime len: u32) bool
  92//
  93// isNearEqual(v0: F32xN, v1: F32xN, epsilon: F32xN) BoolxN
  94// isNan(v: F32xN) BoolxN
  95// isInf(v: F32xN) BoolxN
  96// isInBounds(v: F32xN, bounds: F32xN) BoolxN
  97//
  98// andInt(v0: F32xN, v1: F32xN) F32xN
  99// andNotInt(v0: F32xN, v1: F32xN) F32xN
 100// orInt(v0: F32xN, v1: F32xN) F32xN
 101// norInt(v0: F32xN, v1: F32xN) F32xN
 102// xorInt(v0: F32xN, v1: F32xN) F32xN
 103//
 104// minFast(v0: F32xN, v1: F32xN) F32xN
 105// maxFast(v0: F32xN, v1: F32xN) F32xN
 106// min(v0: F32xN, v1: F32xN) F32xN
 107// max(v0: F32xN, v1: F32xN) F32xN
 108// round(v: F32xN) F32xN
 109// floor(v: F32xN) F32xN
 110// trunc(v: F32xN) F32xN
 111// ceil(v: F32xN) F32xN
 112// clamp(v0: F32xN, v1: F32xN) F32xN
 113// clampFast(v0: F32xN, v1: F32xN) F32xN
 114// saturate(v: F32xN) F32xN
 115// saturateFast(v: F32xN) F32xN
 116// lerp(v0: F32xN, v1: F32xN, t: f32) F32xN
 117// lerpV(v0: F32xN, v1: F32xN, t: F32xN) F32xN
 118// lerpInverse(v0: F32xN, v1: F32xN, t: f32) F32xN
 119// lerpInverseV(v0: F32xN, v1: F32xN, t: F32xN) F32xN
 120// mapLinear(v: F32xN, min1: f32, max1: f32, min2: f32, max2: f32) F32xN
 121// mapLinearV(v: F32xN, min1: F32xN, max1: F32xN, min2: F32xN, max2: F32xN) F32xN
 122// sqrt(v: F32xN) F32xN
 123// abs(v: F32xN) F32xN
 124// mod(v0: F32xN, v1: F32xN) F32xN
 125// modAngle(v: F32xN) F32xN
 126// mulAdd(v0: F32xN, v1: F32xN, v2: F32xN) F32xN
 127// select(mask: BoolxN, v0: F32xN, v1: F32xN)
 128// sin(v: F32xN) F32xN
 129// cos(v: F32xN) F32xN
 130// sincos(v: F32xN) [2]F32xN
 131// asin(v: F32xN) F32xN
 132// acos(v: F32xN) F32xN
 133// atan(v: F32xN) F32xN
 134// atan2(vy: F32xN, vx: F32xN) F32xN
 135// cmulSoa(re0: F32xN, im0: F32xN, re1: F32xN, im1: F32xN) [2]F32xN
 136//
 137// ------------------------------------------------------------------------------
 138// 3. 2D, 3D, 4D vector functions
 139// ------------------------------------------------------------------------------
 140//
 141// swizzle(v: Vec, c, c, c, c) Vec (comptime c = .x | .y | .z | .w)
 142// dot2(v0: Vec, v1: Vec) F32x4
 143// dot3(v0: Vec, v1: Vec) F32x4
 144// dot4(v0: Vec, v1: Vec) F32x4
 145// cross3(v0: Vec, v1: Vec) Vec
 146// lengthSq2(v: Vec) F32x4
 147// lengthSq3(v: Vec) F32x4
 148// lengthSq4(v: Vec) F32x4
 149// length2(v: Vec) F32x4
 150// length3(v: Vec) F32x4
 151// length4(v: Vec) F32x4
 152// normalize2(v: Vec) Vec
 153// normalize3(v: Vec) Vec
 154// normalize4(v: Vec) Vec
 155//
 156// vecToArr2(v: Vec) [2]f32
 157// vecToArr3(v: Vec) [3]f32
 158// vecToArr4(v: Vec) [4]f32
 159//
 160// ------------------------------------------------------------------------------
 161// 4. Matrix functions
 162// ------------------------------------------------------------------------------
 163//
 164// identity() Mat
 165// mul(m0: Mat, m1: Mat) Mat
 166// mul(s: f32, m: Mat) Mat
 167// mul(m: Mat, s: f32) Mat
 168// mul(v: Vec, m: Mat) Vec
 169// mul(m: Mat, v: Vec) Vec
 170// transpose(m: Mat) Mat
 171// rotationX(angle: f32) Mat
 172// rotationY(angle: f32) Mat
 173// rotationZ(angle: f32) Mat
 174// translation(x: f32, y: f32, z: f32) Mat
 175// translationV(v: Vec) Mat
 176// scaling(x: f32, y: f32, z: f32) Mat
 177// scalingV(v: Vec) Mat
 178// lookToLh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat
 179// lookAtLh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat
 180// lookToRh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat
 181// lookAtRh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat
 182// perspectiveFovLh(fovy: f32, aspect: f32, near: f32, far: f32) Mat
 183// perspectiveFovRh(fovy: f32, aspect: f32, near: f32, far: f32) Mat
 184// perspectiveFovLhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat
 185// perspectiveFovRhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat
 186// orthographicLh(w: f32, h: f32, near: f32, far: f32) Mat
 187// orthographicRh(w: f32, h: f32, near: f32, far: f32) Mat
 188// orthographicLhGl(w: f32, h: f32, near: f32, far: f32) Mat
 189// orthographicRhGl(w: f32, h: f32, near: f32, far: f32) Mat
 190// orthographicOffCenterLh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat
 191// orthographicOffCenterRh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat
 192// orthographicOffCenterLhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat
 193// orthographicOffCenterRhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat
 194// determinant(m: Mat) F32x4
 195// inverse(m: Mat) Mat
 196// inverseDet(m: Mat, det: ?*F32x4) Mat
 197// matToQuat(m: Mat) Quat
 198// matFromAxisAngle(axis: Vec, angle: f32) Mat
 199// matFromNormAxisAngle(axis: Vec, angle: f32) Mat
 200// matFromQuat(quat: Quat) Mat
 201// matFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Mat
 202// matFromRollPitchYawV(angles: Vec) Mat
 203// matFromArr(arr: [16]f32) Mat
 204//
 205// loadMat(mem: []const f32) Mat
 206// loadMat43(mem: []const f32) Mat
 207// loadMat34(mem: []const f32) Mat
 208// storeMat(mem: []f32, m: Mat) void
 209// storeMat43(mem: []f32, m: Mat) void
 210// storeMat34(mem: []f32, m: Mat) void
 211//
 212// matToArr(m: Mat) [16]f32
 213// matToArr43(m: Mat) [12]f32
 214// matToArr34(m: Mat) [12]f32
 215//
 216// ------------------------------------------------------------------------------
 217// 5. Quaternion functions
 218// ------------------------------------------------------------------------------
 219//
 220// qmul(q0: Quat, q1: Quat) Quat
 221// qidentity() Quat
 222// conjugate(quat: Quat) Quat
 223// inverse(q: Quat) Quat
 224// rotate(q: Quat, v: Vec) Vec
 225// slerp(q0: Quat, q1: Quat, t: f32) Quat
 226// slerpV(q0: Quat, q1: Quat, t: F32x4) Quat
 227// quatToMat(quat: Quat) Mat
 228// quatToAxisAngle(quat: Quat, axis: *Vec, angle: *f32) void
 229// quatFromMat(m: Mat) Quat
 230// quatFromAxisAngle(axis: Vec, angle: f32) Quat
 231// quatFromNormAxisAngle(axis: Vec, angle: f32) Quat
 232// quatFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Quat
 233// quatFromRollPitchYawV(angles: Vec) Quat
 234//
 235// ------------------------------------------------------------------------------
 236// 6. Color functions
 237// ------------------------------------------------------------------------------
 238//
 239// adjustSaturation(color: F32x4, saturation: f32) F32x4
 240// adjustContrast(color: F32x4, contrast: f32) F32x4
 241// rgbToHsl(rgb: F32x4) F32x4
 242// hslToRgb(hsl: F32x4) F32x4
 243// rgbToHsv(rgb: F32x4) F32x4
 244// hsvToRgb(hsv: F32x4) F32x4
 245// rgbToSrgb(rgb: F32x4) F32x4
 246// srgbToRgb(srgb: F32x4) F32x4
 247//
 248// ------------------------------------------------------------------------------
 249// X. Misc functions
 250// ------------------------------------------------------------------------------
 251//
 252// linePointDistance(linept0: Vec, linept1: Vec, pt: Vec) F32x4
 253// sin(v: f32) f32
 254// cos(v: f32) f32
 255// sincos(v: f32) [2]f32
 256// asin(v: f32) f32
 257// acos(v: f32) f32
 258//
 259// fftInitUnityTable(unitytable: []F32x4) void
 260// fft(re: []F32x4, im: []F32x4, unitytable: []const F32x4) void
 261// ifft(re: []F32x4, im: []const F32x4, unitytable: []const F32x4) void
 262//
 263// ==============================================================================
 264
 265// Fundamental types
 266pub const F32x4 = @Vector(4, f32);
 267pub const F32x8 = @Vector(8, f32);
 268pub const F32x16 = @Vector(16, f32);
 269pub const Boolx4 = @Vector(4, bool);
 270pub const Boolx8 = @Vector(8, bool);
 271pub const Boolx16 = @Vector(16, bool);
 272
 273// "Higher-level" aliases
 274pub const Vec = F32x4;
 275pub const Mat = [4]F32x4;
 276pub const Quat = F32x4;
 277
 278const builtin = @import("builtin");
 279const std = @import("std");
 280const math = std.math;
 281const assert = std.debug.assert;
 282const expect = std.testing.expect;
 283
 284const cpu_arch = builtin.cpu.arch;
 285const has_avx = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .avx) else false;
 286const has_avx512f = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f) else false;
 287const has_fma = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .fma) else false;
 288// ------------------------------------------------------------------------------
 289//
 290// 1. Initialization functions
 291//
 292// ------------------------------------------------------------------------------
 293pub inline fn f32x4(e0: f32, e1: f32, e2: f32, e3: f32) F32x4 {
 294    return .{ e0, e1, e2, e3 };
 295}
 296pub inline fn f32x8(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32) F32x8 {
 297    return .{ e0, e1, e2, e3, e4, e5, e6, e7 };
 298}
 299// zig fmt: off
 300pub inline fn f32x16(
 301    e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32,
 302    e8: f32, e9: f32, ea: f32, eb: f32, ec: f32, ed: f32, ee: f32, ef: f32) F32x16 {
 303    return .{ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, ea, eb, ec, ed, ee, ef };
 304}
 305// zig fmt: on
 306
 307pub inline fn f32x4s(e0: f32) F32x4 {
 308    return splat(F32x4, e0);
 309}
 310pub inline fn f32x8s(e0: f32) F32x8 {
 311    return splat(F32x8, e0);
 312}
 313pub inline fn f32x16s(e0: f32) F32x16 {
 314    return splat(F32x16, e0);
 315}
 316
 317pub inline fn boolx4(e0: bool, e1: bool, e2: bool, e3: bool) Boolx4 {
 318    return .{ e0, e1, e2, e3 };
 319}
 320pub inline fn boolx8(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool) Boolx8 {
 321    return .{ e0, e1, e2, e3, e4, e5, e6, e7 };
 322}
 323// zig fmt: off
 324pub inline fn boolx16(
 325    e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool,
 326    e8: bool, e9: bool, ea: bool, eb: bool, ec: bool, ed: bool, ee: bool, ef: bool) Boolx16 {
 327    return .{ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, ea, eb, ec, ed, ee, ef };
 328}
 329// zig fmt: on
 330
 331pub inline fn veclen(comptime T: type) comptime_int {
 332    return @typeInfo(T).vector.len;
 333}
 334
 335pub inline fn splat(comptime T: type, value: f32) T {
 336    return @splat(value);
 337}
 338pub inline fn splatInt(comptime T: type, value: u32) T {
 339    return @splat(@bitCast(value));
 340}
 341
 342pub fn load(mem: []const f32, comptime T: type, comptime len: u32) T {
 343    var v = splat(T, 0.0);
 344    const loop_len = if (len == 0) veclen(T) else len;
 345    comptime var i: u32 = 0;
 346    inline while (i < loop_len) : (i += 1) {
 347        v[i] = mem[i];
 348    }
 349    return v;
 350}
 351test "zmath.load" {
 352    const a = [7]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 };
 353    var ptr = &a;
 354    var i: u32 = 0;
 355    const v0 = load(a[i..], F32x4, 2);
 356    try expectVecEqual(v0, F32x4{ 1.0, 2.0, 0.0, 0.0 });
 357    i += 2;
 358    const v1 = load(a[i .. i + 2], F32x4, 2);
 359    try expectVecEqual(v1, F32x4{ 3.0, 4.0, 0.0, 0.0 });
 360    const v2 = load(a[5..7], F32x4, 2);
 361    try expectVecEqual(v2, F32x4{ 6.0, 7.0, 0.0, 0.0 });
 362    const v3 = load(ptr[1..], F32x4, 2);
 363    try expectVecEqual(v3, F32x4{ 2.0, 3.0, 0.0, 0.0 });
 364    i += 1;
 365    const v4 = load(ptr[i .. i + 2], F32x4, 2);
 366    try expectVecEqual(v4, F32x4{ 4.0, 5.0, 0.0, 0.0 });
 367}
 368
 369pub fn store(mem: []f32, v: anytype, comptime len: u32) void {
 370    const T = @TypeOf(v);
 371    const loop_len = if (len == 0) veclen(T) else len;
 372    comptime var i: u32 = 0;
 373    inline while (i < loop_len) : (i += 1) {
 374        mem[i] = v[i];
 375    }
 376}
 377test "zmath.store" {
 378    var a = [7]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 };
 379    const v = load(a[1..], F32x4, 3);
 380    store(a[2..], v, 4);
 381    try expect(a[0] == 1.0);
 382    try expect(a[1] == 2.0);
 383    try expect(a[2] == 2.0);
 384    try expect(a[3] == 3.0);
 385    try expect(a[4] == 4.0);
 386    try expect(a[5] == 0.0);
 387}
 388
 389pub inline fn loadArr2(arr: [2]f32) F32x4 {
 390    return f32x4(arr[0], arr[1], 0.0, 0.0);
 391}
 392pub inline fn loadArr2zw(arr: [2]f32, z: f32, w: f32) F32x4 {
 393    return f32x4(arr[0], arr[1], z, w);
 394}
 395pub inline fn loadArr3(arr: [3]f32) F32x4 {
 396    return f32x4(arr[0], arr[1], arr[2], 0.0);
 397}
 398pub inline fn loadArr3w(arr: [3]f32, w: f32) F32x4 {
 399    return f32x4(arr[0], arr[1], arr[2], w);
 400}
 401pub inline fn loadArr4(arr: [4]f32) F32x4 {
 402    return f32x4(arr[0], arr[1], arr[2], arr[3]);
 403}
 404
 405pub inline fn storeArr2(arr: *[2]f32, v: F32x4) void {
 406    arr.* = .{ v[0], v[1] };
 407}
 408pub inline fn storeArr3(arr: *[3]f32, v: F32x4) void {
 409    arr.* = .{ v[0], v[1], v[2] };
 410}
 411pub inline fn storeArr4(arr: *[4]f32, v: F32x4) void {
 412    arr.* = .{ v[0], v[1], v[2], v[3] };
 413}
 414
 415pub inline fn arr3Ptr(ptr: anytype) *const [3]f32 {
 416    comptime assert(@typeInfo(@TypeOf(ptr)) == .pointer);
 417    const T = std.meta.Child(@TypeOf(ptr));
 418    comptime assert(T == F32x4);
 419    return @as(*const [3]f32, @ptrCast(ptr));
 420}
 421
 422pub inline fn arrNPtr(ptr: anytype) [*]const f32 {
 423    comptime assert(@typeInfo(@TypeOf(ptr)) == .pointer);
 424    const T = std.meta.Child(@TypeOf(ptr));
 425    comptime assert(T == Mat or T == F32x4 or T == F32x8 or T == F32x16);
 426    return @as([*]const f32, @ptrCast(ptr));
 427}
 428test "zmath.arrNPtr" {
 429    {
 430        const mat = identity();
 431        const f32ptr = arrNPtr(&mat);
 432        try expect(f32ptr[0] == 1.0);
 433        try expect(f32ptr[5] == 1.0);
 434        try expect(f32ptr[10] == 1.0);
 435        try expect(f32ptr[15] == 1.0);
 436    }
 437    {
 438        const v8 = f32x8s(1.0);
 439        const f32ptr = arrNPtr(&v8);
 440        try expect(f32ptr[1] == 1.0);
 441        try expect(f32ptr[7] == 1.0);
 442    }
 443}
 444
 445test "zmath.loadArr" {
 446    {
 447        const camera_position = [3]f32{ 1.0, 2.0, 3.0 };
 448        const simd_reg = loadArr3(camera_position);
 449        try expectVecEqual(simd_reg, f32x4(1.0, 2.0, 3.0, 0.0));
 450    }
 451    {
 452        const camera_position = [3]f32{ 1.0, 2.0, 3.0 };
 453        const simd_reg = loadArr3w(camera_position, 1.0);
 454        try expectVecEqual(simd_reg, f32x4(1.0, 2.0, 3.0, 1.0));
 455    }
 456}
 457
 458pub inline fn vecToArr2(v: Vec) [2]f32 {
 459    return .{ v[0], v[1] };
 460}
 461pub inline fn vecToArr3(v: Vec) [3]f32 {
 462    return .{ v[0], v[1], v[2] };
 463}
 464pub inline fn vecToArr4(v: Vec) [4]f32 {
 465    return .{ v[0], v[1], v[2], v[3] };
 466}
 467// ------------------------------------------------------------------------------
 468//
 469// 2. Functions that work on all vector components (F32xN = F32x4 or F32x8 or F32x16)
 470//
 471// ------------------------------------------------------------------------------
 472pub fn all(vb: anytype, comptime len: u32) bool {
 473    const T = @TypeOf(vb);
 474    if (len > veclen(T)) {
 475        @compileError("zmath.all(): 'len' is greater than vector len of type " ++ @typeName(T));
 476    }
 477    const loop_len = if (len == 0) veclen(T) else len;
 478    const ab: [veclen(T)]bool = vb;
 479    comptime var i: u32 = 0;
 480    var result = true;
 481    inline while (i < loop_len) : (i += 1) {
 482        result = result and ab[i];
 483    }
 484    return result;
 485}
 486test "zmath.all" {
 487    try expect(all(boolx8(true, true, true, true, true, false, true, false), 5) == true);
 488    try expect(all(boolx8(true, true, true, true, true, false, true, false), 6) == false);
 489    try expect(all(boolx8(true, true, true, true, false, false, false, false), 4) == true);
 490    try expect(all(boolx4(true, true, true, false), 3) == true);
 491    try expect(all(boolx4(true, true, true, false), 1) == true);
 492    try expect(all(boolx4(true, false, false, false), 1) == true);
 493    try expect(all(boolx4(false, true, false, false), 1) == false);
 494    try expect(all(boolx8(true, true, true, true, true, false, true, false), 0) == false);
 495    try expect(all(boolx4(false, true, false, false), 0) == false);
 496    try expect(all(boolx4(true, true, true, true), 0) == true);
 497}
 498
 499pub fn any(vb: anytype, comptime len: u32) bool {
 500    const T = @TypeOf(vb);
 501    if (len > veclen(T)) {
 502        @compileError("zmath.any(): 'len' is greater than vector len of type " ++ @typeName(T));
 503    }
 504    const loop_len = if (len == 0) veclen(T) else len;
 505    const ab: [veclen(T)]bool = vb;
 506    comptime var i: u32 = 0;
 507    var result = false;
 508    inline while (i < loop_len) : (i += 1) {
 509        result = result or ab[i];
 510    }
 511    return result;
 512}
 513test "zmath.any" {
 514    try expect(any(boolx8(true, true, true, true, true, false, true, false), 0) == true);
 515    try expect(any(boolx8(false, false, false, true, true, false, true, false), 3) == false);
 516    try expect(any(boolx8(false, false, false, false, false, true, false, false), 4) == false);
 517}
 518
 519pub inline fn isNearEqual(
 520    v0: anytype,
 521    v1: anytype,
 522    epsilon: anytype,
 523) @Vector(veclen(@TypeOf(v0)), bool) {
 524    const T = @TypeOf(v0, v1, epsilon);
 525    const delta = v0 - v1;
 526    const temp = maxFast(delta, splat(T, 0.0) - delta);
 527    return temp <= epsilon;
 528}
 529test "zmath.isNearEqual" {
 530    {
 531        const v0 = f32x4(1.0, 2.0, -3.0, 4.001);
 532        const v1 = f32x4(1.0, 2.1, 3.0, 4.0);
 533        const b = isNearEqual(v0, v1, splat(F32x4, 0.01));
 534        try expect(@reduce(.And, b == boolx4(true, false, false, true)));
 535    }
 536    {
 537        const v0 = f32x8(1.0, 2.0, -3.0, 4.001, 1.001, 2.3, -0.0, 0.0);
 538        const v1 = f32x8(1.0, 2.1, 3.0, 4.0, -1.001, 2.1, 0.0, 0.0);
 539        const b = isNearEqual(v0, v1, splat(F32x8, 0.01));
 540        try expect(@reduce(.And, b == boolx8(true, false, false, true, false, false, true, true)));
 541    }
 542    try expect(all(isNearEqual(
 543        splat(F32x4, math.inf(f32)),
 544        splat(F32x4, math.inf(f32)),
 545        splat(F32x4, 0.0001),
 546    ), 0) == false);
 547    try expect(all(isNearEqual(
 548        splat(F32x4, -math.inf(f32)),
 549        splat(F32x4, math.inf(f32)),
 550        splat(F32x4, 0.0001),
 551    ), 0) == false);
 552    try expect(all(isNearEqual(
 553        splat(F32x4, -math.inf(f32)),
 554        splat(F32x4, -math.inf(f32)),
 555        splat(F32x4, 0.0001),
 556    ), 0) == false);
 557    try expect(all(isNearEqual(
 558        splat(F32x4, -math.nan(f32)),
 559        splat(F32x4, math.inf(f32)),
 560        splat(F32x4, 0.0001),
 561    ), 0) == false);
 562}
 563
 564pub inline fn isNan(
 565    v: anytype,
 566) @Vector(veclen(@TypeOf(v)), bool) {
 567    return v != v;
 568}
 569test "zmath.isNan" {
 570    {
 571        const v0 = f32x4(math.inf(f32), math.nan(f32), math.nan(f32), 7.0);
 572        const b = isNan(v0);
 573        try expect(@reduce(.And, b == boolx4(false, true, true, false)));
 574    }
 575    {
 576        const v0 = f32x8(0, math.nan(f32), 0, 0, math.inf(f32), math.nan(f32), math.snan(f32), 7.0);
 577        const b = isNan(v0);
 578        try expect(@reduce(.And, b == boolx8(false, true, false, false, false, true, true, false)));
 579    }
 580}
 581
 582pub inline fn isInf(
 583    v: anytype,
 584) @Vector(veclen(@TypeOf(v)), bool) {
 585    const T = @TypeOf(v);
 586    return abs(v) == splat(T, math.inf(f32));
 587}
 588test "zmath.isInf" {
 589    {
 590        const v0 = f32x4(math.inf(f32), math.nan(f32), math.snan(f32), 7.0);
 591        const b = isInf(v0);
 592        try expect(@reduce(.And, b == boolx4(true, false, false, false)));
 593    }
 594    {
 595        const v0 = f32x8(0, math.inf(f32), 0, 0, math.inf(f32), math.nan(f32), math.snan(f32), 7.0);
 596        const b = isInf(v0);
 597        try expect(@reduce(.And, b == boolx8(false, true, false, false, true, false, false, false)));
 598    }
 599}
 600
 601pub inline fn isInBounds(
 602    v: anytype,
 603    bounds: anytype,
 604) @Vector(veclen(@TypeOf(v)), bool) {
 605    const T = @TypeOf(v, bounds);
 606    const Tu = @Vector(veclen(T), u1);
 607    const Tr = @Vector(veclen(T), bool);
 608
 609    // 2 x cmpleps, xorps, load, andps
 610    const b0 = v <= bounds;
 611    const b1 = (bounds * splat(T, -1.0)) <= v;
 612    const b0u = @as(Tu, @bitCast(b0));
 613    const b1u = @as(Tu, @bitCast(b1));
 614    return @as(Tr, @bitCast(b0u & b1u));
 615}
 616test "zmath.isInBounds" {
 617    {
 618        const v0 = f32x4(0.5, -2.0, -1.0, 1.9);
 619        const v1 = f32x4(-1.6, -2.001, -1.0, 1.9);
 620        const bounds = f32x4(1.0, 2.0, 1.0, 2.0);
 621        const b0 = isInBounds(v0, bounds);
 622        const b1 = isInBounds(v1, bounds);
 623        try expect(@reduce(.And, b0 == boolx4(true, true, true, true)));
 624        try expect(@reduce(.And, b1 == boolx4(false, false, true, true)));
 625    }
 626    {
 627        const v0 = f32x8(2.0, 1.0, 2.0, 1.0, 0.5, -2.0, -1.0, 1.9);
 628        const bounds = f32x8(1.0, 1.0, 1.0, math.inf(f32), 1.0, math.nan(f32), 1.0, 2.0);
 629        const b0 = isInBounds(v0, bounds);
 630        try expect(@reduce(.And, b0 == boolx8(false, true, false, true, true, false, true, true)));
 631    }
 632}
 633
 634pub inline fn andInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) {
 635    const T = @TypeOf(v0, v1);
 636    const Tu = @Vector(veclen(T), u32);
 637    const v0u = @as(Tu, @bitCast(v0));
 638    const v1u = @as(Tu, @bitCast(v1));
 639    return @as(T, @bitCast(v0u & v1u)); // andps
 640}
 641test "zmath.andInt" {
 642    {
 643        const v0 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0))));
 644        const v1 = f32x4(1.0, 2.0, 3.0, math.inf(f32));
 645        const v = andInt(v0, v1);
 646        try expect(v[3] == math.inf(f32));
 647        try expectVecEqual(v, f32x4(0.0, 2.0, 0.0, math.inf(f32)));
 648    }
 649    {
 650        const v0 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0))));
 651        const v1 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, math.inf(f32));
 652        const v = andInt(v0, v1);
 653        try expect(v[7] == math.inf(f32));
 654        try expectVecEqual(v, f32x8(0, 0, 0, 0, 0.0, 2.0, 0.0, math.inf(f32)));
 655    }
 656}
 657
 658pub inline fn andNotInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) {
 659    const T = @TypeOf(v0, v1);
 660    const Tu = @Vector(veclen(T), u32);
 661    const v0u = @as(Tu, @bitCast(v0));
 662    const v1u = @as(Tu, @bitCast(v1));
 663    return @as(T, @bitCast(~v0u & v1u)); // andnps
 664}
 665test "zmath.andNotInt" {
 666    {
 667        const v0 = f32x4(1.0, 2.0, 3.0, 4.0);
 668        const v1 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0))));
 669        const v = andNotInt(v1, v0);
 670        try expectVecEqual(v, f32x4(1.0, 0.0, 3.0, 0.0));
 671    }
 672    {
 673        const v0 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0);
 674        const v1 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0))));
 675        const v = andNotInt(v1, v0);
 676        try expectVecEqual(v, f32x8(0, 0, 0, 0, 1.0, 0.0, 3.0, 0.0));
 677    }
 678}
 679
 680pub inline fn orInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) {
 681    const T = @TypeOf(v0, v1);
 682    const Tu = @Vector(veclen(T), u32);
 683    const v0u = @as(Tu, @bitCast(v0));
 684    const v1u = @as(Tu, @bitCast(v1));
 685    return @as(T, @bitCast(v0u | v1u)); // orps
 686}
 687test "zmath.orInt" {
 688    {
 689        const v0 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0);
 690        const v1 = f32x4(1.0, 2.0, 3.0, 4.0);
 691        const v = orInt(v0, v1);
 692        try expect(v[0] == 1.0);
 693        try expect(@as(u32, @bitCast(v[1])) == ~@as(u32, 0));
 694        try expect(v[2] == 3.0);
 695        try expect(v[3] == 4.0);
 696    }
 697    {
 698        const v0 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0);
 699        const v1 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0);
 700        const v = orInt(v0, v1);
 701        try expect(v[4] == 1.0);
 702        try expect(@as(u32, @bitCast(v[5])) == ~@as(u32, 0));
 703        try expect(v[6] == 3.0);
 704        try expect(v[7] == 4.0);
 705    }
 706}
 707
 708pub inline fn norInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) {
 709    const T = @TypeOf(v0, v1);
 710    const Tu = @Vector(veclen(T), u32);
 711    const v0u = @as(Tu, @bitCast(v0));
 712    const v1u = @as(Tu, @bitCast(v1));
 713    return @as(T, @bitCast(~(v0u | v1u))); // por, pcmpeqd, pxor
 714}
 715
 716pub inline fn xorInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) {
 717    const T = @TypeOf(v0, v1);
 718    const Tu = @Vector(veclen(T), u32);
 719    const v0u = @as(Tu, @bitCast(v0));
 720    const v1u = @as(Tu, @bitCast(v1));
 721    return @as(T, @bitCast(v0u ^ v1u)); // xorps
 722}
 723test "zmath.xorInt" {
 724    {
 725        const v0 = f32x4(1.0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0);
 726        const v1 = f32x4(1.0, 0, 0, 0);
 727        const v = xorInt(v0, v1);
 728        try expect(v[0] == 0.0);
 729        try expect(@as(u32, @bitCast(v[1])) == ~@as(u32, 0));
 730        try expect(v[2] == 0.0);
 731        try expect(v[3] == 0.0);
 732    }
 733    {
 734        const v0 = f32x8(0, 0, 0, 0, 1.0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0);
 735        const v1 = f32x8(0, 0, 0, 0, 1.0, 0, 0, 0);
 736        const v = xorInt(v0, v1);
 737        try expect(v[4] == 0.0);
 738        try expect(@as(u32, @bitCast(v[5])) == ~@as(u32, 0));
 739        try expect(v[6] == 0.0);
 740        try expect(v[7] == 0.0);
 741    }
 742}
 743
 744pub inline fn minFast(v0: anytype, v1: anytype) @TypeOf(v0, v1) {
 745    return select(v0 < v1, v0, v1); // minps
 746}
 747test "zmath.minFast" {
 748    {
 749        const v0 = f32x4(1.0, 3.0, 2.0, 7.0);
 750        const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32));
 751        const v = minFast(v0, v1);
 752        try expectVecEqual(v, f32x4(1.0, 1.0, 2.0, 7.0));
 753    }
 754    {
 755        const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32));
 756        const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32));
 757        const v = minFast(v0, v1);
 758        try expect(v[0] == 1.0);
 759        try expect(v[1] == 1.0);
 760        try expect(!math.isNan(v[1]));
 761        try expect(v[2] == 4.0);
 762        try expect(v[3] == math.inf(f32));
 763        try expect(!math.isNan(v[3]));
 764    }
 765}
 766
 767pub inline fn maxFast(v0: anytype, v1: anytype) @TypeOf(v0, v1) {
 768    return select(v0 > v1, v0, v1); // maxps
 769}
 770test "zmath.maxFast" {
 771    {
 772        const v0 = f32x4(1.0, 3.0, 2.0, 7.0);
 773        const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32));
 774        const v = maxFast(v0, v1);
 775        try expectVecEqual(v, f32x4(2.0, 3.0, 4.0, math.inf(f32)));
 776    }
 777    {
 778        const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32));
 779        const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32));
 780        const v = maxFast(v0, v1);
 781        try expect(v[0] == 2.0);
 782        try expect(v[1] == 1.0);
 783        try expect(v[2] == 5.0);
 784        try expect(v[3] == math.inf(f32));
 785        try expect(!math.isNan(v[3]));
 786    }
 787}
 788
 789pub inline fn min(v0: anytype, v1: anytype) @TypeOf(v0, v1) {
 790    // This will handle inf & nan
 791    return @min(v0, v1); // minps, cmpunordps, andps, andnps, orps
 792}
 793test "zmath.min" {
 794    // Calling math.inf causes test to fail!
 795    if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest;
 796    {
 797        const v0 = f32x4(1.0, 3.0, 2.0, 7.0);
 798        const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32));
 799        const v = min(v0, v1);
 800        try expectVecEqual(v, f32x4(1.0, 1.0, 2.0, 7.0));
 801    }
 802    {
 803        const v0 = f32x8(0, 0, -2.0, 0, 1.0, 3.0, 2.0, 7.0);
 804        const v1 = f32x8(0, 1.0, 0, 0, 2.0, 1.0, 4.0, math.inf(f32));
 805        const v = min(v0, v1);
 806        try expectVecEqual(v, f32x8(0.0, 0.0, -2.0, 0.0, 1.0, 1.0, 2.0, 7.0));
 807    }
 808    {
 809        const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32));
 810        const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32));
 811        const v = min(v0, v1);
 812        try expect(v[0] == 1.0);
 813        try expect(v[1] == 1.0);
 814        try expect(!math.isNan(v[1]));
 815        try expect(v[2] == 4.0);
 816        try expect(v[3] == math.inf(f32));
 817        try expect(!math.isNan(v[3]));
 818    }
 819
 820    {
 821        const v0 = f32x4(-math.inf(f32), math.inf(f32), math.inf(f32), math.snan(f32));
 822        const v1 = f32x4(math.snan(f32), -math.inf(f32), math.snan(f32), math.nan(f32));
 823        const v = min(v0, v1);
 824        try expect(v[0] == -math.inf(f32));
 825        try expect(v[1] == -math.inf(f32));
 826        try expect(v[2] == math.inf(f32));
 827        try expect(!math.isNan(v[2]));
 828        try expect(math.isNan(v[3]));
 829        try expect(!math.isInf(v[3]));
 830    }
 831}
 832
 833pub inline fn max(v0: anytype, v1: anytype) @TypeOf(v0, v1) {
 834    // This will handle inf & nan
 835    return @max(v0, v1); // maxps, cmpunordps, andps, andnps, orps
 836}
 837test "zmath.max" {
 838    // Calling math.inf causes test to fail!
 839    if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest;
 840    {
 841        const v0 = f32x4(1.0, 3.0, 2.0, 7.0);
 842        const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32));
 843        const v = max(v0, v1);
 844        try expectVecEqual(v, f32x4(2.0, 3.0, 4.0, math.inf(f32)));
 845    }
 846    {
 847        const v0 = f32x8(0, 0, -2.0, 0, 1.0, 3.0, 2.0, 7.0);
 848        const v1 = f32x8(0, 1.0, 0, 0, 2.0, 1.0, 4.0, math.inf(f32));
 849        const v = max(v0, v1);
 850        try expectVecEqual(v, f32x8(0.0, 1.0, 0.0, 0.0, 2.0, 3.0, 4.0, math.inf(f32)));
 851    }
 852    {
 853        const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32));
 854        const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32));
 855        const v = max(v0, v1);
 856        try expect(v[0] == 2.0);
 857        try expect(v[1] == 1.0);
 858        try expect(v[2] == 5.0);
 859        try expect(v[3] == math.inf(f32));
 860        try expect(!math.isNan(v[3]));
 861    }
 862    {
 863        const v0 = f32x4(-math.inf(f32), math.inf(f32), math.inf(f32), math.snan(f32));
 864        const v1 = f32x4(math.snan(f32), -math.inf(f32), math.snan(f32), math.nan(f32));
 865        const v = max(v0, v1);
 866        try expect(v[0] == -math.inf(f32));
 867        try expect(v[1] == math.inf(f32));
 868        try expect(v[2] == math.inf(f32));
 869        try expect(!math.isNan(v[2]));
 870        try expect(math.isNan(v[3]));
 871        try expect(!math.isInf(v[3]));
 872    }
 873}
 874
 875pub fn round(v: anytype) @TypeOf(v) {
 876    const T = @TypeOf(v);
 877    if (cpu_arch == .x86_64 and has_avx) {
 878        if (T == F32x4) {
 879            return asm ("vroundps $0, %%xmm0, %%xmm0"
 880                : [ret] "={xmm0}" (-> T),
 881                : [v] "{xmm0}" (v),
 882            );
 883        } else if (T == F32x8) {
 884            return asm ("vroundps $0, %%ymm0, %%ymm0"
 885                : [ret] "={ymm0}" (-> T),
 886                : [v] "{ymm0}" (v),
 887            );
 888        } else if (T == F32x16 and has_avx512f) {
 889            return asm ("vrndscaleps $0, %%zmm0, %%zmm0"
 890                : [ret] "={zmm0}" (-> T),
 891                : [v] "{zmm0}" (v),
 892            );
 893        } else if (T == F32x16 and !has_avx512f) {
 894            const arr: [16]f32 = v;
 895            var ymm0 = @as(F32x8, arr[0..8].*);
 896            var ymm1 = @as(F32x8, arr[8..16].*);
 897            ymm0 = asm ("vroundps $0, %%ymm0, %%ymm0"
 898                : [ret] "={ymm0}" (-> F32x8),
 899                : [v] "{ymm0}" (ymm0),
 900            );
 901            ymm1 = asm ("vroundps $0, %%ymm1, %%ymm1"
 902                : [ret] "={ymm1}" (-> F32x8),
 903                : [v] "{ymm1}" (ymm1),
 904            );
 905            return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 });
 906        }
 907    } else {
 908        const sign = andInt(v, splatNegativeZero(T));
 909        const magic = orInt(splatNoFraction(T), sign);
 910        var r1 = v + magic;
 911        r1 = r1 - magic;
 912        const r2 = abs(v);
 913        const mask = r2 <= splatNoFraction(T);
 914        return select(mask, r1, v);
 915    }
 916}
 917test "zmath.round" {
 918    {
 919        try expect(all(round(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0));
 920        try expect(all(round(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0));
 921        try expect(all(isNan(round(splat(F32x4, math.nan(f32)))), 0));
 922        try expect(all(isNan(round(splat(F32x4, -math.nan(f32)))), 0));
 923        try expect(all(isNan(round(splat(F32x4, math.snan(f32)))), 0));
 924        try expect(all(isNan(round(splat(F32x4, -math.snan(f32)))), 0));
 925    }
 926    {
 927        const v = round(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1));
 928        try expectVecApproxEqAbs(
 929            v,
 930            f32x16(1.0, -1.0, -2.0, 2.0, 2.0, 3.0, 3.0, 4.0, 6.0, 6.0, 8.0, 9.0, 10.0, 11.0, 13.0, 13.0),
 931            0.0,
 932        );
 933    }
 934    var v = round(f32x4(1.1, -1.1, -1.5, 1.5));
 935    try expectVecEqual(v, f32x4(1.0, -1.0, -2.0, 2.0));
 936
 937    const v1 = f32x4(-10_000_000.1, -math.inf(f32), 10_000_001.5, math.inf(f32));
 938    v = round(v1);
 939    try expect(v[3] == math.inf(f32));
 940    try expectVecEqual(v, f32x4(-10_000_000.1, -math.inf(f32), 10_000_001.5, math.inf(f32)));
 941
 942    const v2 = f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32));
 943    v = round(v2);
 944    try expect(math.isNan(v2[0]));
 945    try expect(math.isNan(v2[1]));
 946    try expect(math.isNan(v2[2]));
 947    try expect(v2[3] == -math.inf(f32));
 948
 949    const v3 = f32x4(1001.5, -201.499, -10000.99, -101.5);
 950    v = round(v3);
 951    try expectVecEqual(v, f32x4(1002.0, -201.0, -10001.0, -102.0));
 952
 953    const v4 = f32x4(-1_388_609.9, 1_388_609.5, 1_388_109.01, 2_388_609.5);
 954    v = round(v4);
 955    try expectVecEqual(v, f32x4(-1_388_610.0, 1_388_610.0, 1_388_109.0, 2_388_610.0));
 956
 957    var f: f32 = -100.0;
 958    var i: u32 = 0;
 959    while (i < 100) : (i += 1) {
 960        const vr = round(splat(F32x4, f));
 961        const fr = @round(splat(F32x4, f));
 962        const vr8 = round(splat(F32x8, f));
 963        const fr8 = @round(splat(F32x8, f));
 964        const vr16 = round(splat(F32x16, f));
 965        const fr16 = @round(splat(F32x16, f));
 966        try expectVecEqual(vr, fr);
 967        try expectVecEqual(vr8, fr8);
 968        try expectVecEqual(vr16, fr16);
 969        f += 0.12345 * @as(f32, @floatFromInt(i));
 970    }
 971}
 972
 973pub fn trunc(v: anytype) @TypeOf(v) {
 974    const T = @TypeOf(v);
 975    if (cpu_arch == .x86_64 and has_avx) {
 976        if (T == F32x4) {
 977            return asm ("vroundps $3, %%xmm0, %%xmm0"
 978                : [ret] "={xmm0}" (-> T),
 979                : [v] "{xmm0}" (v),
 980            );
 981        } else if (T == F32x8) {
 982            return asm ("vroundps $3, %%ymm0, %%ymm0"
 983                : [ret] "={ymm0}" (-> T),
 984                : [v] "{ymm0}" (v),
 985            );
 986        } else if (T == F32x16 and has_avx512f) {
 987            return asm ("vrndscaleps $3, %%zmm0, %%zmm0"
 988                : [ret] "={zmm0}" (-> T),
 989                : [v] "{zmm0}" (v),
 990            );
 991        } else if (T == F32x16 and !has_avx512f) {
 992            const arr: [16]f32 = v;
 993            var ymm0 = @as(F32x8, arr[0..8].*);
 994            var ymm1 = @as(F32x8, arr[8..16].*);
 995            ymm0 = asm ("vroundps $3, %%ymm0, %%ymm0"
 996                : [ret] "={ymm0}" (-> F32x8),
 997                : [v] "{ymm0}" (ymm0),
 998            );
 999            ymm1 = asm ("vroundps $3, %%ymm1, %%ymm1"
1000                : [ret] "={ymm1}" (-> F32x8),
1001                : [v] "{ymm1}" (ymm1),
1002            );
1003            return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 });
1004        }
1005    } else {
1006        const mask = abs(v) < splatNoFraction(T);
1007        const result = floatToIntAndBack(v);
1008        return select(mask, result, v);
1009    }
1010}
1011test "zmath.trunc" {
1012    {
1013        try expect(all(trunc(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0));
1014        try expect(all(trunc(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0));
1015        try expect(all(isNan(trunc(splat(F32x4, math.nan(f32)))), 0));
1016        try expect(all(isNan(trunc(splat(F32x4, -math.nan(f32)))), 0));
1017        try expect(all(isNan(trunc(splat(F32x4, math.snan(f32)))), 0));
1018        try expect(all(isNan(trunc(splat(F32x4, -math.snan(f32)))), 0));
1019    }
1020    {
1021        const v = trunc(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1));
1022        try expectVecApproxEqAbs(
1023            v,
1024            f32x16(1.0, -1.0, -1.0, 1.0, 2.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 10.0, 11.0, 12.0, 13.0),
1025            0.0,
1026        );
1027    }
1028    var v = trunc(f32x4(1.1, -1.1, -1.5, 1.5));
1029    try expectVecEqual(v, f32x4(1.0, -1.0, -1.0, 1.0));
1030
1031    v = trunc(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32)));
1032    try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32)));
1033
1034    v = trunc(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32)));
1035    try expect(math.isNan(v[0]));
1036    try expect(math.isNan(v[1]));
1037    try expect(math.isNan(v[2]));
1038    try expect(v[3] == -math.inf(f32));
1039
1040    v = trunc(f32x4(1000.5001, -201.499, -10000.99, 100.750001));
1041    try expectVecEqual(v, f32x4(1000.0, -201.0, -10000.0, 100.0));
1042
1043    v = trunc(f32x4(-7_388_609.5, 7_388_609.1, 8_388_109.5, -8_388_509.5));
1044    try expectVecEqual(v, f32x4(-7_388_609.0, 7_388_609.0, 8_388_109.0, -8_388_509.0));
1045
1046    var f: f32 = -100.0;
1047    var i: u32 = 0;
1048    while (i < 100) : (i += 1) {
1049        const vr = trunc(splat(F32x4, f));
1050        const fr = @trunc(splat(F32x4, f));
1051        const vr8 = trunc(splat(F32x8, f));
1052        const fr8 = @trunc(splat(F32x8, f));
1053        const vr16 = trunc(splat(F32x16, f));
1054        const fr16 = @trunc(splat(F32x16, f));
1055        try expectVecEqual(vr, fr);
1056        try expectVecEqual(vr8, fr8);
1057        try expectVecEqual(vr16, fr16);
1058        f += 0.12345 * @as(f32, @floatFromInt(i));
1059    }
1060}
1061
1062pub fn floor(v: anytype) @TypeOf(v) {
1063    const T = @TypeOf(v);
1064    if (cpu_arch == .x86_64 and has_avx) {
1065        if (T == F32x4) {
1066            return asm ("vroundps $1, %%xmm0, %%xmm0"
1067                : [ret] "={xmm0}" (-> T),
1068                : [v] "{xmm0}" (v),
1069            );
1070        } else if (T == F32x8) {
1071            return asm ("vroundps $1, %%ymm0, %%ymm0"
1072                : [ret] "={ymm0}" (-> T),
1073                : [v] "{ymm0}" (v),
1074            );
1075        } else if (T == F32x16 and has_avx512f) {
1076            return asm ("vrndscaleps $1, %%zmm0, %%zmm0"
1077                : [ret] "={zmm0}" (-> T),
1078                : [v] "{zmm0}" (v),
1079            );
1080        } else if (T == F32x16 and !has_avx512f) {
1081            const arr: [16]f32 = v;
1082            var ymm0 = @as(F32x8, arr[0..8].*);
1083            var ymm1 = @as(F32x8, arr[8..16].*);
1084            ymm0 = asm ("vroundps $1, %%ymm0, %%ymm0"
1085                : [ret] "={ymm0}" (-> F32x8),
1086                : [v] "{ymm0}" (ymm0),
1087            );
1088            ymm1 = asm ("vroundps $1, %%ymm1, %%ymm1"
1089                : [ret] "={ymm1}" (-> F32x8),
1090                : [v] "{ymm1}" (ymm1),
1091            );
1092            return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 });
1093        }
1094    } else {
1095        const mask = abs(v) < splatNoFraction(T);
1096        var result = floatToIntAndBack(v);
1097        const larger_mask = result > v;
1098        const larger = select(larger_mask, splat(T, -1.0), splat(T, 0.0));
1099        result = result + larger;
1100        return select(mask, result, v);
1101    }
1102}
1103test "zmath.floor" {
1104    {
1105        try expect(all(floor(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0));
1106        try expect(all(floor(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0));
1107        try expect(all(isNan(floor(splat(F32x4, math.nan(f32)))), 0));
1108        try expect(all(isNan(floor(splat(F32x4, -math.nan(f32)))), 0));
1109        try expect(all(isNan(floor(splat(F32x4, math.snan(f32)))), 0));
1110        try expect(all(isNan(floor(splat(F32x4, -math.snan(f32)))), 0));
1111    }
1112    {
1113        const v = floor(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1));
1114        try expectVecApproxEqAbs(
1115            v,
1116            f32x16(1.0, -2.0, -2.0, 1.0, 2.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 10.0, 11.0, 12.0, 13.0),
1117            0.0,
1118        );
1119    }
1120    var v = floor(f32x4(1.5, -1.5, -1.7, -2.1));
1121    try expectVecEqual(v, f32x4(1.0, -2.0, -2.0, -3.0));
1122
1123    v = floor(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32)));
1124    try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32)));
1125
1126    v = floor(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32)));
1127    try expect(math.isNan(v[0]));
1128    try expect(math.isNan(v[1]));
1129    try expect(math.isNan(v[2]));
1130    try expect(v[3] == -math.inf(f32));
1131
1132    v = floor(f32x4(1000.5001, -201.499, -10000.99, 100.75001));
1133    try expectVecEqual(v, f32x4(1000.0, -202.0, -10001.0, 100.0));
1134
1135    v = floor(f32x4(-7_388_609.5, 7_388_609.1, 8_388_109.5, -8_388_509.5));
1136    try expectVecEqual(v, f32x4(-7_388_610.0, 7_388_609.0, 8_388_109.0, -8_388_510.0));
1137
1138    var f: f32 = -100.0;
1139    var i: u32 = 0;
1140    while (i < 100) : (i += 1) {
1141        const vr = floor(splat(F32x4, f));
1142        const fr = @floor(splat(F32x4, f));
1143        const vr8 = floor(splat(F32x8, f));
1144        const fr8 = @floor(splat(F32x8, f));
1145        const vr16 = floor(splat(F32x16, f));
1146        const fr16 = @floor(splat(F32x16, f));
1147        try expectVecEqual(vr, fr);
1148        try expectVecEqual(vr8, fr8);
1149        try expectVecEqual(vr16, fr16);
1150        f += 0.12345 * @as(f32, @floatFromInt(i));
1151    }
1152}
1153
1154pub fn ceil(v: anytype) @TypeOf(v) {
1155    const T = @TypeOf(v);
1156    if (cpu_arch == .x86_64 and has_avx) {
1157        if (T == F32x4) {
1158            return asm ("vroundps $2, %%xmm0, %%xmm0"
1159                : [ret] "={xmm0}" (-> T),
1160                : [v] "{xmm0}" (v),
1161            );
1162        } else if (T == F32x8) {
1163            return asm ("vroundps $2, %%ymm0, %%ymm0"
1164                : [ret] "={ymm0}" (-> T),
1165                : [v] "{ymm0}" (v),
1166            );
1167        } else if (T == F32x16 and has_avx512f) {
1168            return asm ("vrndscaleps $2, %%zmm0, %%zmm0"
1169                : [ret] "={zmm0}" (-> T),
1170                : [v] "{zmm0}" (v),
1171            );
1172        } else if (T == F32x16 and !has_avx512f) {
1173            const arr: [16]f32 = v;
1174            var ymm0 = @as(F32x8, arr[0..8].*);
1175            var ymm1 = @as(F32x8, arr[8..16].*);
1176            ymm0 = asm ("vroundps $2, %%ymm0, %%ymm0"
1177                : [ret] "={ymm0}" (-> F32x8),
1178                : [v] "{ymm0}" (ymm0),
1179            );
1180            ymm1 = asm ("vroundps $2, %%ymm1, %%ymm1"
1181                : [ret] "={ymm1}" (-> F32x8),
1182                : [v] "{ymm1}" (ymm1),
1183            );
1184            return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 });
1185        }
1186    } else {
1187        const mask = abs(v) < splatNoFraction(T);
1188        var result = floatToIntAndBack(v);
1189        const smaller_mask = result < v;
1190        const smaller = select(smaller_mask, splat(T, -1.0), splat(T, 0.0));
1191        result = result - smaller;
1192        return select(mask, result, v);
1193    }
1194}
1195test "zmath.ceil" {
1196    {
1197        try expect(all(ceil(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0));
1198        try expect(all(ceil(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0));
1199        try expect(all(isNan(ceil(splat(F32x4, math.nan(f32)))), 0));
1200        try expect(all(isNan(ceil(splat(F32x4, -math.nan(f32)))), 0));
1201        try expect(all(isNan(ceil(splat(F32x4, math.snan(f32)))), 0));
1202        try expect(all(isNan(ceil(splat(F32x4, -math.snan(f32)))), 0));
1203    }
1204    {
1205        const v = ceil(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1));
1206        try expectVecApproxEqAbs(
1207            v,
1208            f32x16(2.0, -1.0, -1.0, 2.0, 3.0, 3.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0, 11.0, 12.0, 13.0, 14.0),
1209            0.0,
1210        );
1211    }
1212    var v = ceil(f32x4(1.5, -1.5, -1.7, -2.1));
1213    try expectVecEqual(v, f32x4(2.0, -1.0, -1.0, -2.0));
1214
1215    v = ceil(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32)));
1216    try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32)));
1217
1218    v = ceil(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32)));
1219    try expect(math.isNan(v[0]));
1220    try expect(math.isNan(v[1]));
1221    try expect(math.isNan(v[2]));
1222    try expect(v[3] == -math.inf(f32));
1223
1224    v = ceil(f32x4(1000.5001, -201.499, -10000.99, 100.75001));
1225    try expectVecEqual(v, f32x4(1001.0, -201.0, -10000.0, 101.0));
1226
1227    v = ceil(f32x4(-1_388_609.5, 1_388_609.1, 1_388_109.9, -1_388_509.9));
1228    try expectVecEqual(v, f32x4(-1_388_609.0, 1_388_610.0, 1_388_110.0, -1_388_509.0));
1229
1230    var f: f32 = -100.0;
1231    var i: u32 = 0;
1232    while (i < 100) : (i += 1) {
1233        const vr = ceil(splat(F32x4, f));
1234        const fr = @ceil(splat(F32x4, f));
1235        const vr8 = ceil(splat(F32x8, f));
1236        const fr8 = @ceil(splat(F32x8, f));
1237        const vr16 = ceil(splat(F32x16, f));
1238        const fr16 = @ceil(splat(F32x16, f));
1239        try expectVecEqual(vr, fr);
1240        try expectVecEqual(vr8, fr8);
1241        try expectVecEqual(vr16, fr16);
1242        f += 0.12345 * @as(f32, @floatFromInt(i));
1243    }
1244}
1245
1246pub inline fn clamp(v: anytype, vmin: anytype, vmax: anytype) @TypeOf(v, vmin, vmax) {
1247    var result = max(vmin, v);
1248    result = min(vmax, result);
1249    return result;
1250}
1251test "zmath.clamp" {
1252    // Calling math.inf causes test to fail!
1253    if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest;
1254    {
1255        const v0 = f32x4(-1.0, 0.2, 1.1, -0.3);
1256        const v = clamp(v0, splat(F32x4, -0.5), splat(F32x4, 0.5));
1257        try expectVecApproxEqAbs(v, f32x4(-0.5, 0.2, 0.5, -0.3), 0.0001);
1258    }
1259    {
1260        const v0 = f32x8(-2.0, 0.25, -0.25, 100.0, -1.0, 0.2, 1.1, -0.3);
1261        const v = clamp(v0, splat(F32x8, -0.5), splat(F32x8, 0.5));
1262        try expectVecApproxEqAbs(v, f32x8(-0.5, 0.25, -0.25, 0.5, -0.5, 0.2, 0.5, -0.3), 0.0001);
1263    }
1264    {
1265        const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32));
1266        const v = clamp(v0, f32x4(-100.0, 0.0, -100.0, 0.0), f32x4(0.0, 100.0, 0.0, 100.0));
1267        try expectVecApproxEqAbs(v, f32x4(-100.0, 100.0, -100.0, 0.0), 0.0001);
1268    }
1269    {
1270        const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32));
1271        const v = clamp(v0, splat(F32x4, -1.0), splat(F32x4, 1.0));
1272        try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, -1.0, -1.0), 0.0001);
1273    }
1274}
1275
1276pub inline fn clampFast(v: anytype, vmin: anytype, vmax: anytype) @TypeOf(v, vmin, vmax) {
1277    var result = maxFast(vmin, v);
1278    result = minFast(vmax, result);
1279    return result;
1280}
1281test "zmath.clampFast" {
1282    {
1283        const v0 = f32x4(-1.0, 0.2, 1.1, -0.3);
1284        const v = clampFast(v0, splat(F32x4, -0.5), splat(F32x4, 0.5));
1285        try expectVecApproxEqAbs(v, f32x4(-0.5, 0.2, 0.5, -0.3), 0.0001);
1286    }
1287}
1288
1289pub inline fn saturate(v: anytype) @TypeOf(v) {
1290    const T = @TypeOf(v);
1291    var result = max(v, splat(T, 0.0));
1292    result = min(result, splat(T, 1.0));
1293    return result;
1294}
1295test "zmath.saturate" {
1296    // Calling math.inf causes test to fail!
1297    if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest;
1298    {
1299        const v0 = f32x4(-1.0, 0.2, 1.1, -0.3);
1300        const v = saturate(v0);
1301        try expectVecApproxEqAbs(v, f32x4(0.0, 0.2, 1.0, 0.0), 0.0001);
1302    }
1303    {
1304        const v0 = f32x8(0.0, 0.0, 2.0, -2.0, -1.0, 0.2, 1.1, -0.3);
1305        const v = saturate(v0);
1306        try expectVecApproxEqAbs(v, f32x8(0.0, 0.0, 1.0, 0.0, 0.0, 0.2, 1.0, 0.0), 0.0001);
1307    }
1308    {
1309        const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32));
1310        const v = saturate(v0);
1311        try expectVecApproxEqAbs(v, f32x4(0.0, 1.0, 0.0, 0.0), 0.0001);
1312    }
1313    {
1314        const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32));
1315        const v = saturate(v0);
1316        try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, 0.0, 0.0), 0.0001);
1317    }
1318}
1319
1320pub inline fn saturateFast(v: anytype) @TypeOf(v) {
1321    const T = @TypeOf(v);
1322    var result = maxFast(v, splat(T, 0.0));
1323    result = minFast(result, splat(T, 1.0));
1324    return result;
1325}
1326test "zmath.saturateFast" {
1327    {
1328        const v0 = f32x4(-1.0, 0.2, 1.1, -0.3);
1329        const v = saturateFast(v0);
1330        try expectVecApproxEqAbs(v, f32x4(0.0, 0.2, 1.0, 0.0), 0.0001);
1331    }
1332    {
1333        const v0 = f32x8(0.0, 0.0, 2.0, -2.0, -1.0, 0.2, 1.1, -0.3);
1334        const v = saturateFast(v0);
1335        try expectVecApproxEqAbs(v, f32x8(0.0, 0.0, 1.0, 0.0, 0.0, 0.2, 1.0, 0.0), 0.0001);
1336    }
1337    {
1338        const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32));
1339        const v = saturateFast(v0);
1340        try expectVecApproxEqAbs(v, f32x4(0.0, 1.0, 0.0, 0.0), 0.0001);
1341    }
1342    {
1343        const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32));
1344        const v = saturateFast(v0);
1345        try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, 0.0, 0.0), 0.0001);
1346    }
1347}
1348
1349pub inline fn sqrt(v: anytype) @TypeOf(v) {
1350    return @sqrt(v); // sqrtps
1351}
1352
1353pub inline fn abs(v: anytype) @TypeOf(v) {
1354    return @abs(v); // load, andps
1355}
1356
1357pub inline fn select(mask: anytype, v0: anytype, v1: anytype) @TypeOf(v0, v1) {
1358    return @select(f32, mask, v0, v1);
1359}
1360
1361pub inline fn lerp(v0: anytype, v1: anytype, t: f32) @TypeOf(v0, v1) {
1362    const T = @TypeOf(v0, v1);
1363    return v0 + (v1 - v0) * splat(T, t); // subps, shufps, addps, mulps
1364}
1365
1366pub inline fn lerpV(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1, t) {
1367    return v0 + (v1 - v0) * t; // subps, addps, mulps
1368}
1369
1370pub inline fn lerpInverse(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1) {
1371    const T = @TypeOf(v0, v1);
1372    return (splat(T, t) - v0) / (v1 - v0);
1373}
1374
1375pub inline fn lerpInverseV(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1, t) {
1376    return (t - v0) / (v1 - v0);
1377}
1378test "zmath.lerpInverse" {
1379    try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 10.0), 0, 0.0005));
1380    try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 100.0), 1, 0.0005));
1381    try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 55.0), 0.5, 0.05));
1382    try expectVecApproxEqAbs(lerpInverse(f32x4(0, 0, 10, 10), f32x4(100, 200, 100, 100), 10.0), f32x4(0.1, 0.05, 0, 0), 0.0005);
1383}
1384
1385// Frame rate independent lerp (or "damp"), for approaching things over time.
1386// Reference: https://www.gamedeveloper.com/programming/improved-lerp-smoothing-
1387pub inline fn lerpOverTime(v0: anytype, v1: anytype, rate: anytype, dt: anytype) @TypeOf(v0, v1) {
1388    const t = std.math.exp2(-rate * dt);
1389    return lerp(v0, v1, t);
1390}
1391
1392pub inline fn lerpVOverTime(v0: anytype, v1: anytype, rate: anytype, dt: anytype) @TypeOf(v0, v1, rate, dt) {
1393    const t = std.math.exp2(-rate * dt);
1394    return lerpV(v0, v1, t);
1395}
1396test "zmath.lerpOverTime" {
1397    try expect(math.approxEqAbs(f32, lerpVOverTime(0.0, 1.0, 1.0, 1.0), 0.5, 0.0005));
1398    try expect(math.approxEqAbs(f32, lerpVOverTime(0.5, 1.0, 1.0, 1.0), 0.75, 0.0005));
1399    try expectVecApproxEqAbs(lerpOverTime(f32x4(0, 0, 10, 10), f32x4(100, 200, 100, 100), 1.0, 1.0), f32x4(50, 100, 55, 55), 0.0005);
1400}
1401
1402/// To transform a vector of values from one range to another.
1403pub inline fn mapLinear(v: anytype, min1: anytype, max1: anytype, min2: anytype, max2: anytype) @TypeOf(v) {
1404    const T = @TypeOf(v);
1405    const min1V = splat(T, min1);
1406    const max1V = splat(T, max1);
1407    const min2V = splat(T, min2);
1408    const max2V = splat(T, max2);
1409    const dV = max1V - min1V;
1410    return min2V + (v - min1V) * (max2V - min2V) / dV;
1411}
1412
1413pub inline fn mapLinearV(v: anytype, min1: anytype, max1: anytype, min2: anytype, max2: anytype) @TypeOf(v, min1, max1, min2, max2) {
1414    const d = max1 - min1;
1415    return min2 + (v - min1) * (max2 - min2) / d;
1416}
1417test "zmath.mapLinear" {
1418    try expect(math.approxEqAbs(f32, mapLinearV(0, 0, 1.2, 10, 100), 10, 0.0005));
1419    try expect(math.approxEqAbs(f32, mapLinearV(1.2, 0, 1.2, 10, 100), 100, 0.0005));
1420    try expect(math.approxEqAbs(f32, mapLinearV(0.6, 0, 1.2, 10, 100), 55, 0.0005));
1421    try expectVecApproxEqAbs(mapLinearV(splat(F32x4, 0), splat(F32x4, 0), splat(F32x4, 1.2), splat(F32x4, 10), splat(F32x4, 100)), splat(F32x4, 10), 0.0005);
1422    try expectVecApproxEqAbs(mapLinear(f32x4(0, 0, 0.6, 1.2), 0, 1.2, 10, 100), f32x4(10, 10, 55, 100), 0.0005);
1423}
1424
1425pub const F32x4Component = enum { x, y, z, w };
1426
1427pub inline fn swizzle(
1428    v: F32x4,
1429    comptime x: F32x4Component,
1430    comptime y: F32x4Component,
1431    comptime z: F32x4Component,
1432    comptime w: F32x4Component,
1433) F32x4 {
1434    return @shuffle(f32, v, undefined, [4]i32{ @intFromEnum(x), @intFromEnum(y), @intFromEnum(z), @intFromEnum(w) });
1435}
1436
1437pub inline fn mod(v0: anytype, v1: anytype) @TypeOf(v0, v1) {
1438    // vdivps, vroundps, vmulps, vsubps
1439    return v0 - v1 * trunc(v0 / v1);
1440}
1441test "zmath.mod" {
1442    try expectVecApproxEqAbs(mod(splat(F32x4, 3.1), splat(F32x4, 1.7)), splat(F32x4, 1.4), 0.0005);
1443    try expectVecApproxEqAbs(mod(splat(F32x4, -3.0), splat(F32x4, 2.0)), splat(F32x4, -1.0), 0.0005);
1444    try expectVecApproxEqAbs(mod(splat(F32x4, -3.0), splat(F32x4, -2.0)), splat(F32x4, -1.0), 0.0005);
1445    try expectVecApproxEqAbs(mod(splat(F32x4, 3.0), splat(F32x4, -2.0)), splat(F32x4, 1.0), 0.0005);
1446    try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, 1.0))), 0));
1447    try expect(all(isNan(mod(splat(F32x4, -math.inf(f32)), splat(F32x4, 123.456))), 0));
1448    try expect(all(isNan(mod(splat(F32x4, math.nan(f32)), splat(F32x4, 123.456))), 0));
1449    try expect(all(isNan(mod(splat(F32x4, math.snan(f32)), splat(F32x4, 123.456))), 0));
1450    try expect(all(isNan(mod(splat(F32x4, -math.snan(f32)), splat(F32x4, 123.456))), 0));
1451    try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, math.inf(f32)))), 0));
1452    try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, -math.inf(f32)))), 0));
1453    try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, math.inf(f32)))), 0));
1454    try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, math.nan(f32)))), 0));
1455    try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, math.nan(f32)))), 0));
1456}
1457
1458pub fn modAngle(v: anytype) @TypeOf(v) {
1459    const T = @TypeOf(v);
1460    return switch (T) {
1461        f32 => modAngle32(v),
1462        F32x4, F32x8, F32x16 => modAngle32xN(v),
1463        else => @compileError("zmath.modAngle() not implemented for " ++ @typeName(T)),
1464    };
1465}
1466
1467pub inline fn modAngle32xN(v: anytype) @TypeOf(v) {
1468    const T = @TypeOf(v);
1469    return v - splat(T, math.tau) * round(v * splat(T, 1.0 / math.tau)); // 2 x vmulps, 2 x load, vroundps, vaddps
1470}
1471test "zmath.modAngle" {
1472    try expectVecApproxEqAbs(modAngle(splat(F32x4, math.tau)), splat(F32x4, 0.0), 0.0005);
1473    try expectVecApproxEqAbs(modAngle(splat(F32x4, 0.0)), splat(F32x4, 0.0), 0.0005);
1474    try expectVecApproxEqAbs(modAngle(splat(F32x4, math.pi)), splat(F32x4, math.pi), 0.0005);
1475    try expectVecApproxEqAbs(modAngle(splat(F32x4, 11 * math.pi)), splat(F32x4, math.pi), 0.0005);
1476    try expectVecApproxEqAbs(modAngle(splat(F32x4, 3.5 * math.pi)), splat(F32x4, -0.5 * math.pi), 0.0005);
1477    try expectVecApproxEqAbs(modAngle(splat(F32x4, 2.5 * math.pi)), splat(F32x4, 0.5 * math.pi), 0.0005);
1478}
1479
1480pub inline fn mulAdd(v0: anytype, v1: anytype, v2: anytype) @TypeOf(v0, v1, v2) {
1481    const T = @TypeOf(v0, v1, v2);
1482    if (@import("zmath_options").enable_cross_platform_determinism) {
1483        return v0 * v1 + v2; // Compiler will generate mul, add sequence (no fma even if the target supports it).
1484    } else {
1485        if (cpu_arch == .x86_64 and has_avx and has_fma) {
1486            return @mulAdd(T, v0, v1, v2);
1487        } else {
1488            // NOTE(mziulek): On .x86_64 without HW fma instructions @mulAdd maps to really slow code!
1489            return v0 * v1 + v2;
1490        }
1491    }
1492}
1493
1494fn sin32xN(v: anytype) @TypeOf(v) {
1495    // 11-degree minimax approximation
1496    const T = @TypeOf(v);
1497
1498    var x = modAngle(v);
1499    const sign = andInt(x, splatNegativeZero(T));
1500    const c = orInt(sign, splat(T, math.pi));
1501    const absx = andNotInt(sign, x);
1502    const rflx = c - x;
1503    const comp = absx <= splat(T, 0.5 * math.pi);
1504    x = select(comp, x, rflx);
1505    const x2 = x * x;
1506
1507    var result = mulAdd(splat(T, -2.3889859e-08), x2, splat(T, 2.7525562e-06));
1508    result = mulAdd(result, x2, splat(T, -0.00019840874));
1509    result = mulAdd(result, x2, splat(T, 0.0083333310));
1510    result = mulAdd(result, x2, splat(T, -0.16666667));
1511    result = mulAdd(result, x2, splat(T, 1.0));
1512    return x * result;
1513}
1514test "zmath.sin" {
1515    const epsilon = 0.0001;
1516
1517    try expectVecApproxEqAbs(sin(splat(F32x4, 0.5 * math.pi)), splat(F32x4, 1.0), epsilon);
1518    try expectVecApproxEqAbs(sin(splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon);
1519    try expectVecApproxEqAbs(sin(splat(F32x4, -0.0)), splat(F32x4, -0.0), epsilon);
1520    try expectVecApproxEqAbs(sin(splat(F32x4, 89.123)), splat(F32x4, 0.916166), epsilon);
1521    try expectVecApproxEqAbs(sin(splat(F32x8, 89.123)), splat(F32x8, 0.916166), epsilon);
1522    try expectVecApproxEqAbs(sin(splat(F32x16, 89.123)), splat(F32x16, 0.916166), epsilon);
1523    try expect(all(isNan(sin(splat(F32x4, math.inf(f32)))), 0) == true);
1524    try expect(all(isNan(sin(splat(F32x4, -math.inf(f32)))), 0) == true);
1525    try expect(all(isNan(sin(splat(F32x4, math.nan(f32)))), 0) == true);
1526    try expect(all(isNan(sin(splat(F32x4, math.snan(f32)))), 0) == true);
1527
1528    var f: f32 = -100.0;
1529    var i: u32 = 0;
1530    while (i < 100) : (i += 1) {
1531        const vr = sin(splat(F32x4, f));
1532        const fr = @sin(splat(F32x4, f));
1533        const vr8 = sin(splat(F32x8, f));
1534        const fr8 = @sin(splat(F32x8, f));
1535        const vr16 = sin(splat(F32x16, f));
1536        const fr16 = @sin(splat(F32x16, f));
1537        try expectVecApproxEqAbs(vr, fr, epsilon);
1538        try expectVecApproxEqAbs(vr8, fr8, epsilon);
1539        try expectVecApproxEqAbs(vr16, fr16, epsilon);
1540        f += 0.12345 * @as(f32, @floatFromInt(i));
1541    }
1542}
1543
1544fn cos32xN(v: anytype) @TypeOf(v) {
1545    // 10-degree minimax approximation
1546    const T = @TypeOf(v);
1547
1548    var x = modAngle(v);
1549    var sign = andInt(x, splatNegativeZero(T));
1550    const c = orInt(sign, splat(T, math.pi));
1551    const absx = andNotInt(sign, x);
1552    const rflx = c - x;
1553    const comp = absx <= splat(T, 0.5 * math.pi);
1554    x = select(comp, x, rflx);
1555    sign = select(comp, splat(T, 1.0), splat(T, -1.0));
1556    const x2 = x * x;
1557
1558    var result = mulAdd(splat(T, -2.6051615e-07), x2, splat(T, 2.4760495e-05));
1559    result = mulAdd(result, x2, splat(T, -0.0013888378));
1560    result = mulAdd(result, x2, splat(T, 0.041666638));
1561    result = mulAdd(result, x2, splat(T, -0.5));
1562    result = mulAdd(result, x2, splat(T, 1.0));
1563    return sign * result;
1564}
1565test "zmath.cos" {
1566    const epsilon = 0.0001;
1567
1568    try expectVecApproxEqAbs(cos(splat(F32x4, 0.5 * math.pi)), splat(F32x4, 0.0), epsilon);
1569    try expectVecApproxEqAbs(cos(splat(F32x4, 0.0)), splat(F32x4, 1.0), epsilon);
1570    try expectVecApproxEqAbs(cos(splat(F32x4, -0.0)), splat(F32x4, 1.0), epsilon);
1571    try expect(all(isNan(cos(splat(F32x4, math.inf(f32)))), 0) == true);
1572    try expect(all(isNan(cos(splat(F32x4, -math.inf(f32)))), 0) == true);
1573    try expect(all(isNan(cos(splat(F32x4, math.nan(f32)))), 0) == true);
1574    try expect(all(isNan(cos(splat(F32x4, math.snan(f32)))), 0) == true);
1575
1576    var f: f32 = -100.0;
1577    var i: u32 = 0;
1578    while (i < 100) : (i += 1) {
1579        const vr = cos(splat(F32x4, f));
1580        const fr = @cos(splat(F32x4, f));
1581        const vr8 = cos(splat(F32x8, f));
1582        const fr8 = @cos(splat(F32x8, f));
1583        const vr16 = cos(splat(F32x16, f));
1584        const fr16 = @cos(splat(F32x16, f));
1585        try expectVecApproxEqAbs(vr, fr, epsilon);
1586        try expectVecApproxEqAbs(vr8, fr8, epsilon);
1587        try expectVecApproxEqAbs(vr16, fr16, epsilon);
1588        f += 0.12345 * @as(f32, @floatFromInt(i));
1589    }
1590}
1591
1592pub fn sin(v: anytype) @TypeOf(v) {
1593    const T = @TypeOf(v);
1594    return switch (T) {
1595        f32 => sin32(v),
1596        F32x4, F32x8, F32x16 => sin32xN(v),
1597        else => @compileError("zmath.sin() not implemented for " ++ @typeName(T)),
1598    };
1599}
1600
1601pub fn cos(v: anytype) @TypeOf(v) {
1602    const T = @TypeOf(v);
1603    return switch (T) {
1604        f32 => cos32(v),
1605        F32x4, F32x8, F32x16 => cos32xN(v),
1606        else => @compileError("zmath.cos() not implemented for " ++ @typeName(T)),
1607    };
1608}
1609
1610pub fn sincos(v: anytype) [2]@TypeOf(v) {
1611    const T = @TypeOf(v);
1612    return switch (T) {
1613        f32 => sincos32(v),
1614        F32x4, F32x8, F32x16 => sincos32xN(v),
1615        else => @compileError("zmath.sincos() not implemented for " ++ @typeName(T)),
1616    };
1617}
1618
1619pub fn asin(v: anytype) @TypeOf(v) {
1620    const T = @TypeOf(v);
1621    return switch (T) {
1622        f32 => asin32(v),
1623        F32x4, F32x8, F32x16 => asin32xN(v),
1624        else => @compileError("zmath.asin() not implemented for " ++ @typeName(T)),
1625    };
1626}
1627
1628pub fn acos(v: anytype) @TypeOf(v) {
1629    const T = @TypeOf(v);
1630    return switch (T) {
1631        f32 => acos32(v),
1632        F32x4, F32x8, F32x16 => acos32xN(v),
1633        else => @compileError("zmath.acos() not implemented for " ++ @typeName(T)),
1634    };
1635}
1636
1637fn sincos32xN(v: anytype) [2]@TypeOf(v) {
1638    const T = @TypeOf(v);
1639
1640    var x = modAngle(v);
1641    var sign = andInt(x, splatNegativeZero(T));
1642    const c = orInt(sign, splat(T, math.pi));
1643    const absx = andNotInt(sign, x);
1644    const rflx = c - x;
1645    const comp = absx <= splat(T, 0.5 * math.pi);
1646    x = select(comp, x, rflx);
1647    sign = select(comp, splat(T, 1.0), splat(T, -1.0));
1648    const x2 = x * x;
1649
1650    var sresult = mulAdd(splat(T, -2.3889859e-08), x2, splat(T, 2.7525562e-06));
1651    sresult = mulAdd(sresult, x2, splat(T, -0.00019840874));
1652    sresult = mulAdd(sresult, x2, splat(T, 0.0083333310));
1653    sresult = mulAdd(sresult, x2, splat(T, -0.16666667));
1654    sresult = x * mulAdd(sresult, x2, splat(T, 1.0));
1655
1656    var cresult = mulAdd(splat(T, -2.6051615e-07), x2, splat(T, 2.4760495e-05));
1657    cresult = mulAdd(cresult, x2, splat(T, -0.0013888378));
1658    cresult = mulAdd(cresult, x2, splat(T, 0.041666638));
1659    cresult = mulAdd(cresult, x2, splat(T, -0.5));
1660    cresult = sign * mulAdd(cresult, x2, splat(T, 1.0));
1661
1662    return .{ sresult, cresult };
1663}
1664test "zmath.sincos32xN" {
1665    const epsilon = 0.0001;
1666
1667    var f: f32 = -100.0;
1668    var i: u32 = 0;
1669    while (i < 100) : (i += 1) {
1670        const sc = sincos(splat(F32x4, f));
1671        const sc8 = sincos(splat(F32x8, f));
1672        const sc16 = sincos(splat(F32x16, f));
1673        const s4 = @sin(splat(F32x4, f));
1674        const s8 = @sin(splat(F32x8, f));
1675        const s16 = @sin(splat(F32x16, f));
1676        const c4 = @cos(splat(F32x4, f));
1677        const c8 = @cos(splat(F32x8, f));
1678        const c16 = @cos(splat(F32x16, f));
1679        try expectVecApproxEqAbs(sc[0], s4, epsilon);
1680        try expectVecApproxEqAbs(sc8[0], s8, epsilon);
1681        try expectVecApproxEqAbs(sc16[0], s16, epsilon);
1682        try expectVecApproxEqAbs(sc[1], c4, epsilon);
1683        try expectVecApproxEqAbs(sc8[1], c8, epsilon);
1684        try expectVecApproxEqAbs(sc16[1], c16, epsilon);
1685        f += 0.12345 * @as(f32, @floatFromInt(i));
1686    }
1687}
1688
1689fn asin32xN(v: anytype) @TypeOf(v) {
1690    // 7-degree minimax approximation
1691    const T = @TypeOf(v);
1692
1693    const x = abs(v);
1694    const root = sqrt(maxFast(splat(T, 0.0), splat(T, 1.0) - x));
1695
1696    var t0 = mulAdd(splat(T, -0.0012624911), x, splat(T, 0.0066700901));
1697    t0 = mulAdd(t0, x, splat(T, -0.0170881256));
1698    t0 = mulAdd(t0, x, splat(T, 0.0308918810));
1699    t0 = mulAdd(t0, x, splat(T, -0.0501743046));
1700    t0 = mulAdd(t0, x, splat(T, 0.0889789874));
1701    t0 = mulAdd(t0, x, splat(T, -0.2145988016));
1702    t0 = root * mulAdd(t0, x, splat(T, 1.5707963050));
1703
1704    const t1 = splat(T, math.pi) - t0;
1705    return splat(T, 0.5 * math.pi) - select(v >= splat(T, 0.0), t0, t1);
1706}
1707
1708fn acos32xN(v: anytype) @TypeOf(v) {
1709    // 7-degree minimax approximation
1710    const T = @TypeOf(v);
1711
1712    const x = abs(v);
1713    const root = sqrt(maxFast(splat(T, 0.0), splat(T, 1.0) - x));
1714
1715    var t0 = mulAdd(splat(T, -0.0012624911), x, splat(T, 0.0066700901));
1716    t0 = mulAdd(t0, x, splat(T, -0.0170881256));
1717    t0 = mulAdd(t0, x, splat(T, 0.0308918810));
1718    t0 = mulAdd(t0, x, splat(T, -0.0501743046));
1719    t0 = mulAdd(t0, x, splat(T, 0.0889789874));
1720    t0 = mulAdd(t0, x, splat(T, -0.2145988016));
1721    t0 = root * mulAdd(t0, x, splat(T, 1.5707963050));
1722
1723    const t1 = splat(T, math.pi) - t0;
1724    return select(v >= splat(T, 0.0), t0, t1);
1725}
1726
1727pub fn atan(v: anytype) @TypeOf(v) {
1728    // 17-degree minimax approximation
1729    const T = @TypeOf(v);
1730
1731    const vabs = abs(v);
1732    const vinv = splat(T, 1.0) / v;
1733    var sign = select(v > splat(T, 1.0), splat(T, 1.0), splat(T, -1.0));
1734    const comp = vabs <= splat(T, 1.0);
1735    sign = select(comp, splat(T, 0.0), sign);
1736    const x = select(comp, v, vinv);
1737    const x2 = x * x;
1738
1739    var result = mulAdd(splat(T, 0.0028662257), x2, splat(T, -0.0161657367));
1740    result = mulAdd(result, x2, splat(T, 0.0429096138));
1741    result = mulAdd(result, x2, splat(T, -0.0752896400));
1742    result = mulAdd(result, x2, splat(T, 0.1065626393));
1743    result = mulAdd(result, x2, splat(T, -0.1420889944));
1744    result = mulAdd(result, x2, splat(T, 0.1999355085));
1745    result = mulAdd(result, x2, splat(T, -0.3333314528));
1746    result = x * mulAdd(result, x2, splat(T, 1.0));
1747
1748    const result1 = sign * splat(T, 0.5 * math.pi) - result;
1749    return select(sign == splat(T, 0.0), result, result1);
1750}
1751test "zmath.atan" {
1752    const epsilon = 0.0001;
1753    {
1754        const v = f32x4(0.25, 0.5, 1.0, 1.25);
1755        const e = f32x4(math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3]));
1756        try expectVecApproxEqAbs(e, atan(v), epsilon);
1757    }
1758    {
1759        const v = f32x8(-0.25, 0.5, -1.0, 1.25, 100.0, -200.0, 300.0, 400.0);
1760        // zig fmt: off
1761        const e = f32x8(
1762            math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3]),
1763            math.atan(v[4]), math.atan(v[5]), math.atan(v[6]), math.atan(v[7]),
1764        );
1765        // zig fmt: on
1766        try expectVecApproxEqAbs(e, atan(v), epsilon);
1767    }
1768    {
1769        // zig fmt: off
1770        const v = f32x16(
1771            -0.25, 0.5, -1.0, 0.0, 0.1, -0.2, 30.0, 400.0,
1772            -0.25, 0.5, -1.0, -0.0, -0.05, -0.125, 0.0625, 4000.0
1773        );
1774        const e = f32x16(
1775            math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3]),
1776            math.atan(v[4]), math.atan(v[5]), math.atan(v[6]), math.atan(v[7]),
1777            math.atan(v[8]), math.atan(v[9]), math.atan(v[10]), math.atan(v[11]),
1778            math.atan(v[12]), math.atan(v[13]), math.atan(v[14]), math.atan(v[15]),
1779        );
1780        // zig fmt: on
1781        try expectVecApproxEqAbs(e, atan(v), epsilon);
1782    }
1783    {
1784        try expectVecApproxEqAbs(atan(splat(F32x4, math.inf(f32))), splat(F32x4, 0.5 * math.pi), epsilon);
1785        try expectVecApproxEqAbs(atan(splat(F32x4, -math.inf(f32))), splat(F32x4, -0.5 * math.pi), epsilon);
1786        try expect(all(isNan(atan(splat(F32x4, math.nan(f32)))), 0) == true);
1787        try expect(all(isNan(atan(splat(F32x4, -math.nan(f32)))), 0) == true);
1788    }
1789}
1790
1791pub fn atan2(vy: anytype, vx: anytype) @TypeOf(vx, vy) {
1792    const T = @TypeOf(vx, vy);
1793    const Tu = @Vector(veclen(T), u32);
1794
1795    const vx_is_positive =
1796        (@as(Tu, @bitCast(vx)) & @as(Tu, @splat(0x8000_0000))) == @as(Tu, @splat(0));
1797
1798    const vy_sign = andInt(vy, splatNegativeZero(T));
1799    const c0_25pi = orInt(vy_sign, @as(T, @splat(0.25 * math.pi)));
1800    const c0_50pi = orInt(vy_sign, @as(T, @splat(0.50 * math.pi)));
1801    const c0_75pi = orInt(vy_sign, @as(T, @splat(0.75 * math.pi)));
1802    const c1_00pi = orInt(vy_sign, @as(T, @splat(1.00 * math.pi)));
1803
1804    var r1 = select(vx_is_positive, vy_sign, c1_00pi);
1805    var r2 = select(vx == splat(T, 0.0), c0_50pi, splatInt(T, 0xffff_ffff));
1806    const r3 = select(vy == splat(T, 0.0), r1, r2);
1807    const r4 = select(vx_is_positive, c0_25pi, c0_75pi);
1808    const r5 = select(isInf(vx), r4, c0_50pi);
1809    const result = select(isInf(vy), r5, r3);
1810    const result_valid = @as(Tu, @bitCast(result)) == @as(Tu, @splat(0xffff_ffff));
1811
1812    const v = vy / vx;
1813    const r0 = atan(v);
1814
1815    r1 = select(vx_is_positive, splatNegativeZero(T), c1_00pi);
1816    r2 = r0 + r1;
1817
1818    return select(result_valid, r2, result);
1819}
1820test "zmath.atan2" {
1821    // From DirectXMath XMVectorATan2():
1822    //
1823    // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions:
1824
1825    //     Y == 0 and X is Negative         -> Pi with the sign of Y
1826    //     y == 0 and x is positive         -> 0 with the sign of y
1827    //     Y != 0 and X == 0                -> Pi / 2 with the sign of Y
1828    //     Y != 0 and X is Negative         -> atan(y/x) + (PI with the sign of Y)
1829    //     X == -Infinity and Finite Y      -> Pi with the sign of Y
1830    //     X == +Infinity and Finite Y      -> 0 with the sign of Y
1831    //     Y == Infinity and X is Finite    -> Pi / 2 with the sign of Y
1832    //     Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y
1833    //     Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y
1834
1835    const epsilon = 0.0001;
1836    try expectVecApproxEqAbs(atan2(splat(F32x4, 0.0), splat(F32x4, -1.0)), splat(F32x4, math.pi), epsilon);
1837    try expectVecApproxEqAbs(atan2(splat(F32x4, -0.0), splat(F32x4, -1.0)), splat(F32x4, -math.pi), epsilon);
1838    try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, 0.0)), splat(F32x4, 0.5 * math.pi), epsilon);
1839    try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, 0.0)), splat(F32x4, -0.5 * math.pi), epsilon);
1840    try expectVecApproxEqAbs(
1841        atan2(splat(F32x4, 1.0), splat(F32x4, -1.0)),
1842        splat(F32x4, math.atan(@as(f32, -1.0)) + math.pi),
1843        epsilon,
1844    );
1845    try expectVecApproxEqAbs(
1846        atan2(splat(F32x4, -10.0), splat(F32x4, -2.0)),
1847        splat(F32x4, math.atan(@as(f32, 5.0)) - math.pi),
1848        epsilon,
1849    );
1850    try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, -math.inf(f32))), splat(F32x4, math.pi), epsilon);
1851    try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, -math.inf(f32))), splat(F32x4, -math.pi), epsilon);
1852    try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, math.inf(f32))), splat(F32x4, 0.0), epsilon);
1853    try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, math.inf(f32))), splat(F32x4, -0.0), epsilon);
1854    try expectVecApproxEqAbs(
1855        atan2(splat(F32x4, math.inf(f32)), splat(F32x4, 2.0)),
1856        splat(F32x4, 0.5 * math.pi),
1857        epsilon,
1858    );
1859    try expectVecApproxEqAbs(
1860        atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, 2.0)),
1861        splat(F32x4, -0.5 * math.pi),
1862        epsilon,
1863    );
1864    try expectVecApproxEqAbs(
1865        atan2(splat(F32x4, math.inf(f32)), splat(F32x4, -math.inf(f32))),
1866        splat(F32x4, 0.75 * math.pi),
1867        epsilon,
1868    );
1869    try expectVecApproxEqAbs(
1870        atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, -math.inf(f32))),
1871        splat(F32x4, -0.75 * math.pi),
1872        epsilon,
1873    );
1874    try expectVecApproxEqAbs(
1875        atan2(splat(F32x4, math.inf(f32)), splat(F32x4, math.inf(f32))),
1876        splat(F32x4, 0.25 * math.pi),
1877        epsilon,
1878    );
1879    try expectVecApproxEqAbs(
1880        atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, math.inf(f32))),
1881        splat(F32x4, -0.25 * math.pi),
1882        epsilon,
1883    );
1884    try expectVecApproxEqAbs(
1885        atan2(
1886            f32x8(0.0, -math.inf(f32), -0.0, 2.0, math.inf(f32), math.inf(f32), 1.0, -math.inf(f32)),
1887            f32x8(-2.0, math.inf(f32), 1.0, 0.0, 10.0, -math.inf(f32), 1.0, -math.inf(f32)),
1888        ),
1889        f32x8(
1890            math.pi,
1891            -0.25 * math.pi,
1892            -0.0,
1893            0.5 * math.pi,
1894            0.5 * math.pi,
1895            0.75 * math.pi,
1896            math.atan(@as(f32, 1.0)),
1897            -0.75 * math.pi,
1898        ),
1899        epsilon,
1900    );
1901    try expectVecApproxEqAbs(atan2(splat(F32x4, 0.0), splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon);
1902    try expectVecApproxEqAbs(atan2(splat(F32x4, -0.0), splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon);
1903    try expect(all(isNan(atan2(splat(F32x4, 1.0), splat(F32x4, math.nan(f32)))), 0) == true);
1904    try expect(all(isNan(atan2(splat(F32x4, -1.0), splat(F32x4, math.nan(f32)))), 0) == true);
1905    try expect(all(isNan(atan2(splat(F32x4, math.nan(f32)), splat(F32x4, -1.0))), 0) == true);
1906    try expect(all(isNan(atan2(splat(F32x4, -math.nan(f32)), splat(F32x4, 1.0))), 0) == true);
1907}
1908// ------------------------------------------------------------------------------
1909//
1910// 3. 2D, 3D, 4D vector functions
1911//
1912// ------------------------------------------------------------------------------
1913pub inline fn dot2(v0: Vec, v1: Vec) F32x4 {
1914    var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | -- | -- |
1915    const xmm1 = swizzle(xmm0, .y, .x, .x, .x); // | y0*y1 | -- | -- | -- |
1916    xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[3]); // | x0*x1 + y0*y1 | -- | -- | -- |
1917    return swizzle(xmm0, .x, .x, .x, .x);
1918}
1919test "zmath.dot2" {
1920    const v0 = f32x4(-1.0, 2.0, 300.0, -2.0);
1921    const v1 = f32x4(4.0, 5.0, 600.0, 2.0);
1922    const v = dot2(v0, v1);
1923    try expectVecApproxEqAbs(v, splat(F32x4, 6.0), 0.0001);
1924}
1925
1926pub inline fn dot3(v0: Vec, v1: Vec) F32x4 {
1927    const dot = v0 * v1;
1928    return f32x4s(dot[0] + dot[1] + dot[2]);
1929}
1930test "zmath.dot3" {
1931    const v0 = f32x4(-1.0, 2.0, 3.0, 1.0);
1932    const v1 = f32x4(4.0, 5.0, 6.0, 1.0);
1933    const v = dot3(v0, v1);
1934    try expectVecApproxEqAbs(v, splat(F32x4, 24.0), 0.0001);
1935}
1936
1937pub inline fn dot4(v0: Vec, v1: Vec) F32x4 {
1938    var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | z0*z1 | w0*w1 |
1939    var xmm1 = swizzle(xmm0, .y, .x, .w, .x); // | y0*y1 | -- | w0*w1 | -- |
1940    xmm1 = xmm0 + xmm1; // | x0*x1 + y0*y1 | -- | z0*z1 + w0*w1 | -- |
1941    xmm0 = swizzle(xmm1, .z, .x, .x, .x); // | z0*z1 + w0*w1 | -- | -- | -- |
1942    xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[2]); // addss
1943    return swizzle(xmm0, .x, .x, .x, .x);
1944}
1945test "zmath.dot4" {
1946    const v0 = f32x4(-1.0, 2.0, 3.0, -2.0);
1947    const v1 = f32x4(4.0, 5.0, 6.0, 2.0);
1948    const v = dot4(v0, v1);
1949    try expectVecApproxEqAbs(v, splat(F32x4, 20.0), 0.0001);
1950}
1951
1952pub inline fn cross3(v0: Vec, v1: Vec) Vec {
1953    var xmm0 = swizzle(v0, .y, .z, .x, .w);
1954    var xmm1 = swizzle(v1, .z, .x, .y, .w);
1955    var result = xmm0 * xmm1;
1956    xmm0 = swizzle(xmm0, .y, .z, .x, .w);
1957    xmm1 = swizzle(xmm1, .z, .x, .y, .w);
1958    result = result - xmm0 * xmm1;
1959    return andInt(result, f32x4_mask3);
1960}
1961test "zmath.cross3" {
1962    {
1963        const v0 = f32x4(1.0, 0.0, 0.0, 1.0);
1964        const v1 = f32x4(0.0, 1.0, 0.0, 1.0);
1965        const v = cross3(v0, v1);
1966        try expectVecApproxEqAbs(v, f32x4(0.0, 0.0, 1.0, 0.0), 0.0001);
1967    }
1968    {
1969        const v0 = f32x4(1.0, 0.0, 0.0, 1.0);
1970        const v1 = f32x4(0.0, -1.0, 0.0, 1.0);
1971        const v = cross3(v0, v1);
1972        try expectVecApproxEqAbs(v, f32x4(0.0, 0.0, -1.0, 0.0), 0.0001);
1973    }
1974    {
1975        const v0 = f32x4(-3.0, 0, -2.0, 1.0);
1976        const v1 = f32x4(5.0, -1.0, 2.0, 1.0);
1977        const v = cross3(v0, v1);
1978        try expectVecApproxEqAbs(v, f32x4(-2.0, -4.0, 3.0, 0.0), 0.0001);
1979    }
1980}
1981
1982pub inline fn lengthSq2(v: Vec) F32x4 {
1983    return dot2(v, v);
1984}
1985pub inline fn lengthSq3(v: Vec) F32x4 {
1986    return dot3(v, v);
1987}
1988pub inline fn lengthSq4(v: Vec) F32x4 {
1989    return dot4(v, v);
1990}
1991
1992pub inline fn length2(v: Vec) F32x4 {
1993    return sqrt(dot2(v, v));
1994}
1995pub inline fn length3(v: Vec) F32x4 {
1996    return sqrt(dot3(v, v));
1997}
1998pub inline fn length4(v: Vec) F32x4 {
1999    return sqrt(dot4(v, v));
2000}
2001test "zmath.length3" {
2002    {
2003        const v = length3(f32x4(1.0, -2.0, 3.0, 1000.0));
2004        try expectVecApproxEqAbs(v, splat(F32x4, math.sqrt(14.0)), 0.001);
2005    }
2006    {
2007        const v = length3(f32x4(1.0, math.nan(f32), math.nan(f32), 1000.0));
2008        try expect(all(isNan(v), 0));
2009    }
2010    {
2011        const v = length3(f32x4(1.0, math.inf(f32), 3.0, 1000.0));
2012        try expect(all(isInf(v), 0));
2013    }
2014    {
2015        const v = length3(f32x4(3.0, 2.0, 1.0, math.nan(f32)));
2016        try expectVecApproxEqAbs(v, splat(F32x4, math.sqrt(14.0)), 0.001);
2017    }
2018}
2019
2020pub inline fn normalize2(v: Vec) Vec {
2021    return v * splat(F32x4, 1.0) / sqrt(dot2(v, v));
2022}
2023pub inline fn normalize3(v: Vec) Vec {
2024    return v * splat(F32x4, 1.0) / sqrt(dot3(v, v));
2025}
2026pub inline fn normalize4(v: Vec) Vec {
2027    return v * splat(F32x4, 1.0) / sqrt(dot4(v, v));
2028}
2029test "zmath.normalize3" {
2030    {
2031        const v0 = f32x4(1.0, -2.0, 3.0, 1000.0);
2032        const v = normalize3(v0);
2033        try expectVecApproxEqAbs(v, v0 * splat(F32x4, 1.0 / math.sqrt(14.0)), 0.0005);
2034    }
2035    {
2036        try expect(any(isNan(normalize3(f32x4(1.0, math.inf(f32), 1.0, 1.0))), 0));
2037        try expect(any(isNan(normalize3(f32x4(-math.inf(f32), math.inf(f32), 0.0, 0.0))), 0));
2038        try expect(any(isNan(normalize3(f32x4(-math.nan(f32), math.snan(f32), 0.0, 0.0))), 0));
2039        try expect(any(isNan(normalize3(f32x4(0, 0, 0, 0))), 0));
2040    }
2041}
2042test "zmath.normalize4" {
2043    {
2044        const v0 = f32x4(1.0, -2.0, 3.0, 10.0);
2045        const v = normalize4(v0);
2046        try expectVecApproxEqAbs(v, v0 * splat(F32x4, 1.0 / math.sqrt(114.0)), 0.0005);
2047    }
2048    {
2049        try expect(any(isNan(normalize4(f32x4(1.0, math.inf(f32), 1.0, 1.0))), 0));
2050        try expect(any(isNan(normalize4(f32x4(-math.inf(f32), math.inf(f32), 0.0, 0.0))), 0));
2051        try expect(any(isNan(normalize4(f32x4(-math.nan(f32), math.snan(f32), 0.0, 0.0))), 0));
2052        try expect(any(isNan(normalize4(f32x4(0, 0, 0, 0))), 0));
2053    }
2054}
2055
2056fn vecMulMat(v: Vec, m: Mat) Vec {
2057    const vx = @shuffle(f32, v, undefined, [4]i32{ 0, 0, 0, 0 });
2058    const vy = @shuffle(f32, v, undefined, [4]i32{ 1, 1, 1, 1 });
2059    const vz = @shuffle(f32, v, undefined, [4]i32{ 2, 2, 2, 2 });
2060    const vw = @shuffle(f32, v, undefined, [4]i32{ 3, 3, 3, 3 });
2061    return vx * m[0] + vy * m[1] + vz * m[2] + vw * m[3];
2062}
2063fn matMulVec(m: Mat, v: Vec) Vec {
2064    return .{ dot4(m[0], v)[0], dot4(m[1], v)[0], dot4(m[2], v)[0], dot4(m[3], v)[0] };
2065}
2066test "zmath.vecMulMat" {
2067    const m = Mat{
2068        f32x4(1.0, 0.0, 0.0, 0.0),
2069        f32x4(0.0, 1.0, 0.0, 0.0),
2070        f32x4(0.0, 0.0, 1.0, 0.0),
2071        f32x4(2.0, 3.0, 4.0, 1.0),
2072    };
2073    const vm = mul(f32x4(1.0, 2.0, 3.0, 1.0), m);
2074    const mv = mul(m, f32x4(1.0, 2.0, 3.0, 1.0));
2075    const v = mul(transpose(m), f32x4(1.0, 2.0, 3.0, 1.0));
2076    try expectVecApproxEqAbs(vm, f32x4(3.0, 5.0, 7.0, 1.0), 0.0001);
2077    try expectVecApproxEqAbs(mv, f32x4(1.0, 2.0, 3.0, 21.0), 0.0001);
2078    try expectVecApproxEqAbs(v, f32x4(3.0, 5.0, 7.0, 1.0), 0.0001);
2079}
2080// ------------------------------------------------------------------------------
2081//
2082// 4. Matrix functions
2083//
2084// ------------------------------------------------------------------------------
2085pub fn identity() Mat {
2086    const static = struct {
2087        const identity = Mat{
2088            f32x4(1.0, 0.0, 0.0, 0.0),
2089            f32x4(0.0, 1.0, 0.0, 0.0),
2090            f32x4(0.0, 0.0, 1.0, 0.0),
2091            f32x4(0.0, 0.0, 0.0, 1.0),
2092        };
2093    };
2094    return static.identity;
2095}
2096
2097pub fn matFromArr(arr: [16]f32) Mat {
2098    return Mat{
2099        f32x4(arr[0], arr[1], arr[2], arr[3]),
2100        f32x4(arr[4], arr[5], arr[6], arr[7]),
2101        f32x4(arr[8], arr[9], arr[10], arr[11]),
2102        f32x4(arr[12], arr[13], arr[14], arr[15]),
2103    };
2104}
2105
2106fn mulRetType(comptime Ta: type, comptime Tb: type) type {
2107    if (Ta == Mat and Tb == Mat) {
2108        return Mat;
2109    } else if ((Ta == f32 and Tb == Mat) or (Ta == Mat and Tb == f32)) {
2110        return Mat;
2111    } else if ((Ta == Vec and Tb == Mat) or (Ta == Mat and Tb == Vec)) {
2112        return Vec;
2113    }
2114    @compileError("zmath.mul() not implemented for types: " ++ @typeName(Ta) ++ @typeName(Tb));
2115}
2116
2117pub fn mul(a: anytype, b: anytype) mulRetType(@TypeOf(a), @TypeOf(b)) {
2118    const Ta = @TypeOf(a);
2119    const Tb = @TypeOf(b);
2120    if (Ta == Mat and Tb == Mat) {
2121        return mulMat(a, b);
2122    } else if (Ta == f32 and Tb == Mat) {
2123        const va = splat(F32x4, a);
2124        return Mat{ va * b[0], va * b[1], va * b[2], va * b[3] };
2125    } else if (Ta == Mat and Tb == f32) {
2126        const vb = splat(F32x4, b);
2127        return Mat{ a[0] * vb, a[1] * vb, a[2] * vb, a[3] * vb };
2128    } else if (Ta == Vec and Tb == Mat) {
2129        return vecMulMat(a, b);
2130    } else if (Ta == Mat and Tb == Vec) {
2131        return matMulVec(a, b);
2132    } else {
2133        @compileError("zmath.mul() not implemented for types: " ++ @typeName(Ta) ++ ", " ++ @typeName(Tb));
2134    }
2135}
2136test "zmath.mul" {
2137    {
2138        const m = Mat{
2139            f32x4(0.1, 0.2, 0.3, 0.4),
2140            f32x4(0.5, 0.6, 0.7, 0.8),
2141            f32x4(0.9, 1.0, 1.1, 1.2),
2142            f32x4(1.3, 1.4, 1.5, 1.6),
2143        };
2144        const ms = mul(@as(f32, 2.0), m);
2145        try expectVecApproxEqAbs(ms[0], f32x4(0.2, 0.4, 0.6, 0.8), 0.0001);
2146        try expectVecApproxEqAbs(ms[1], f32x4(1.0, 1.2, 1.4, 1.6), 0.0001);
2147        try expectVecApproxEqAbs(ms[2], f32x4(1.8, 2.0, 2.2, 2.4), 0.0001);
2148        try expectVecApproxEqAbs(ms[3], f32x4(2.6, 2.8, 3.0, 3.2), 0.0001);
2149    }
2150}
2151
2152fn mulMat(m0: Mat, m1: Mat) Mat {
2153    var result: Mat = undefined;
2154    comptime var row: u32 = 0;
2155    inline while (row < 4) : (row += 1) {
2156        const vx = swizzle(m0[row], .x, .x, .x, .x);
2157        const vy = swizzle(m0[row], .y, .y, .y, .y);
2158        const vz = swizzle(m0[row], .z, .z, .z, .z);
2159        const vw = swizzle(m0[row], .w, .w, .w, .w);
2160        result[row] = mulAdd(vx, m1[0], vz * m1[2]) + mulAdd(vy, m1[1], vw * m1[3]);
2161    }
2162    return result;
2163}
2164test "zmath.matrix.mul" {
2165    const a = Mat{
2166        f32x4(0.1, 0.2, 0.3, 0.4),
2167        f32x4(0.5, 0.6, 0.7, 0.8),
2168        f32x4(0.9, 1.0, 1.1, 1.2),
2169        f32x4(1.3, 1.4, 1.5, 1.6),
2170    };
2171    const b = Mat{
2172        f32x4(1.7, 1.8, 1.9, 2.0),
2173        f32x4(2.1, 2.2, 2.3, 2.4),
2174        f32x4(2.5, 2.6, 2.7, 2.8),
2175        f32x4(2.9, 3.0, 3.1, 3.2),
2176    };
2177    const c = mul(a, b);
2178    try expectVecApproxEqAbs(c[0], f32x4(2.5, 2.6, 2.7, 2.8), 0.0001);
2179    try expectVecApproxEqAbs(c[1], f32x4(6.18, 6.44, 6.7, 6.96), 0.0001);
2180    try expectVecApproxEqAbs(c[2], f32x4(9.86, 10.28, 10.7, 11.12), 0.0001);
2181    try expectVecApproxEqAbs(c[3], f32x4(13.54, 14.12, 14.7, 15.28), 0.0001);
2182}
2183
2184pub fn transpose(m: Mat) Mat {
2185    const temp1 = @shuffle(f32, m[0], m[1], [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 1) });
2186    const temp3 = @shuffle(f32, m[0], m[1], [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) });
2187    const temp2 = @shuffle(f32, m[2], m[3], [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 1) });
2188    const temp4 = @shuffle(f32, m[2], m[3], [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) });
2189    return .{
2190        @shuffle(f32, temp1, temp2, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }),
2191        @shuffle(f32, temp1, temp2, [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }),
2192        @shuffle(f32, temp3, temp4, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }),
2193        @shuffle(f32, temp3, temp4, [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }),
2194    };
2195}
2196test "zmath.matrix.transpose" {
2197    const m = Mat{
2198        f32x4(1.0, 2.0, 3.0, 4.0),
2199        f32x4(5.0, 6.0, 7.0, 8.0),
2200        f32x4(9.0, 10.0, 11.0, 12.0),
2201        f32x4(13.0, 14.0, 15.0, 16.0),
2202    };
2203    const mt = transpose(m);
2204    try expectVecApproxEqAbs(mt[0], f32x4(1.0, 5.0, 9.0, 13.0), 0.0001);
2205    try expectVecApproxEqAbs(mt[1], f32x4(2.0, 6.0, 10.0, 14.0), 0.0001);
2206    try expectVecApproxEqAbs(mt[2], f32x4(3.0, 7.0, 11.0, 15.0), 0.0001);
2207    try expectVecApproxEqAbs(mt[3], f32x4(4.0, 8.0, 12.0, 16.0), 0.0001);
2208}
2209
2210pub fn rotationX(angle: f32) Mat {
2211    const sc = sincos(angle);
2212    return .{
2213        f32x4(1.0, 0.0, 0.0, 0.0),
2214        f32x4(0.0, sc[1], sc[0], 0.0),
2215        f32x4(0.0, -sc[0], sc[1], 0.0),
2216        f32x4(0.0, 0.0, 0.0, 1.0),
2217    };
2218}
2219
2220pub fn rotationY(angle: f32) Mat {
2221    const sc = sincos(angle);
2222    return .{
2223        f32x4(sc[1], 0.0, -sc[0], 0.0),
2224        f32x4(0.0, 1.0, 0.0, 0.0),
2225        f32x4(sc[0], 0.0, sc[1], 0.0),
2226        f32x4(0.0, 0.0, 0.0, 1.0),
2227    };
2228}
2229
2230pub fn rotationZ(angle: f32) Mat {
2231    const sc = sincos(angle);
2232    return .{
2233        f32x4(sc[1], sc[0], 0.0, 0.0),
2234        f32x4(-sc[0], sc[1], 0.0, 0.0),
2235        f32x4(0.0, 0.0, 1.0, 0.0),
2236        f32x4(0.0, 0.0, 0.0, 1.0),
2237    };
2238}
2239
2240pub fn translation(x: f32, y: f32, z: f32) Mat {
2241    return .{
2242        f32x4(1.0, 0.0, 0.0, 0.0),
2243        f32x4(0.0, 1.0, 0.0, 0.0),
2244        f32x4(0.0, 0.0, 1.0, 0.0),
2245        f32x4(x, y, z, 1.0),
2246    };
2247}
2248pub fn translationV(v: Vec) Mat {
2249    return translation(v[0], v[1], v[2]);
2250}
2251
2252pub fn scaling(x: f32, y: f32, z: f32) Mat {
2253    return .{
2254        f32x4(x, 0.0, 0.0, 0.0),
2255        f32x4(0.0, y, 0.0, 0.0),
2256        f32x4(0.0, 0.0, z, 0.0),
2257        f32x4(0.0, 0.0, 0.0, 1.0),
2258    };
2259}
2260pub fn scalingV(v: Vec) Mat {
2261    return scaling(v[0], v[1], v[2]);
2262}
2263
2264pub fn lookToLh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat {
2265    const az = normalize3(eyedir);
2266    const ax = normalize3(cross3(updir, az));
2267    const ay = normalize3(cross3(az, ax));
2268    return .{
2269        f32x4(ax[0], ay[0], az[0], 0),
2270        f32x4(ax[1], ay[1], az[1], 0),
2271        f32x4(ax[2], ay[2], az[2], 0),
2272        f32x4(-dot3(ax, eyepos)[0], -dot3(ay, eyepos)[0], -dot3(az, eyepos)[0], 1.0),
2273    };
2274}
2275pub fn lookToRh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat {
2276    return lookToLh(eyepos, -eyedir, updir);
2277}
2278pub fn lookAtLh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat {
2279    return lookToLh(eyepos, focuspos - eyepos, updir);
2280}
2281pub fn lookAtRh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat {
2282    return lookToLh(eyepos, eyepos - focuspos, updir);
2283}
2284test "zmath.matrix.lookToLh" {
2285    const m = lookToLh(f32x4(0.0, 0.0, -3.0, 1.0), f32x4(0.0, 0.0, 1.0, 0.0), f32x4(0.0, 1.0, 0.0, 0.0));
2286    try expectVecApproxEqAbs(m[0], f32x4(1.0, 0.0, 0.0, 0.0), 0.001);
2287    try expectVecApproxEqAbs(m[1], f32x4(0.0, 1.0, 0.0, 0.0), 0.001);
2288    try expectVecApproxEqAbs(m[2], f32x4(0.0, 0.0, 1.0, 0.0), 0.001);
2289    try expectVecApproxEqAbs(m[3], f32x4(0.0, 0.0, 3.0, 1.0), 0.001);
2290}
2291
2292pub fn perspectiveFovLh(fovy: f32, aspect: f32, near: f32, far: f32) Mat {
2293    const scfov = sincos(0.5 * fovy);
2294
2295    assert(near > 0.0 and far > 0.0);
2296    assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001));
2297    assert(!math.approxEqAbs(f32, far, near, 0.001));
2298    assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01));
2299
2300    const h = scfov[1] / scfov[0];
2301    const w = h / aspect;
2302    const r = far / (far - near);
2303    return .{
2304        f32x4(w, 0.0, 0.0, 0.0),
2305        f32x4(0.0, h, 0.0, 0.0),
2306        f32x4(0.0, 0.0, r, 1.0),
2307        f32x4(0.0, 0.0, -r * near, 0.0),
2308    };
2309}
2310pub fn perspectiveFovRh(fovy: f32, aspect: f32, near: f32, far: f32) Mat {
2311    const scfov = sincos(0.5 * fovy);
2312
2313    assert(near > 0.0 and far > 0.0);
2314    assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001));
2315    assert(!math.approxEqAbs(f32, far, near, 0.001));
2316    assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01));
2317
2318    const h = scfov[1] / scfov[0];
2319    const w = h / aspect;
2320    const r = far / (near - far);
2321    return .{
2322        f32x4(w, 0.0, 0.0, 0.0),
2323        f32x4(0.0, h, 0.0, 0.0),
2324        f32x4(0.0, 0.0, r, -1.0),
2325        f32x4(0.0, 0.0, r * near, 0.0),
2326    };
2327}
2328
2329// Produces Z values in [-1.0, 1.0] range (OpenGL defaults)
2330pub fn perspectiveFovLhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat {
2331    const scfov = sincos(0.5 * fovy);
2332
2333    assert(near > 0.0 and far > 0.0);
2334    assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001));
2335    assert(!math.approxEqAbs(f32, far, near, 0.001));
2336    assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01));
2337
2338    const h = scfov[1] / scfov[0];
2339    const w = h / aspect;
2340    const r = far - near;
2341    return .{
2342        f32x4(w, 0.0, 0.0, 0.0),
2343        f32x4(0.0, h, 0.0, 0.0),
2344        f32x4(0.0, 0.0, (near + far) / r, 1.0),
2345        f32x4(0.0, 0.0, 2.0 * near * far / -r, 0.0),
2346    };
2347}
2348
2349// Produces Z values in [-1.0, 1.0] range (OpenGL defaults)
2350pub fn perspectiveFovRhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat {
2351    const scfov = sincos(0.5 * fovy);
2352
2353    assert(near > 0.0 and far > 0.0);
2354    assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001));
2355    assert(!math.approxEqAbs(f32, far, near, 0.001));
2356    assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01));
2357
2358    const h = scfov[1] / scfov[0];
2359    const w = h / aspect;
2360    const r = near - far;
2361    return .{
2362        f32x4(w, 0.0, 0.0, 0.0),
2363        f32x4(0.0, h, 0.0, 0.0),
2364        f32x4(0.0, 0.0, (near + far) / r, -1.0),
2365        f32x4(0.0, 0.0, 2.0 * near * far / r, 0.0),
2366    };
2367}
2368
2369pub fn orthographicLh(w: f32, h: f32, near: f32, far: f32) Mat {
2370    assert(!math.approxEqAbs(f32, w, 0.0, 0.001));
2371    assert(!math.approxEqAbs(f32, h, 0.0, 0.001));
2372    assert(!math.approxEqAbs(f32, far, near, 0.001));
2373
2374    const r = 1 / (far - near);
2375    return .{
2376        f32x4(2 / w, 0.0, 0.0, 0.0),
2377        f32x4(0.0, 2 / h, 0.0, 0.0),
2378        f32x4(0.0, 0.0, r, 0.0),
2379        f32x4(0.0, 0.0, -r * near, 1.0),
2380    };
2381}
2382
2383pub fn orthographicRh(w: f32, h: f32, near: f32, far: f32) Mat {
2384    assert(!math.approxEqAbs(f32, w, 0.0, 0.001));
2385    assert(!math.approxEqAbs(f32, h, 0.0, 0.001));
2386    assert(!math.approxEqAbs(f32, far, near, 0.001));
2387
2388    const r = 1 / (near - far);
2389    return .{
2390        f32x4(2 / w, 0.0, 0.0, 0.0),
2391        f32x4(0.0, 2 / h, 0.0, 0.0),
2392        f32x4(0.0, 0.0, r, 0.0),
2393        f32x4(0.0, 0.0, r * near, 1.0),
2394    };
2395}
2396
2397// Produces Z values in [-1.0, 1.0] range (OpenGL defaults)
2398pub fn orthographicLhGl(w: f32, h: f32, near: f32, far: f32) Mat {
2399    assert(!math.approxEqAbs(f32, w, 0.0, 0.001));
2400    assert(!math.approxEqAbs(f32, h, 0.0, 0.001));
2401    assert(!math.approxEqAbs(f32, far, near, 0.001));
2402
2403    const r = far - near;
2404    return .{
2405        f32x4(2 / w, 0.0, 0.0, 0.0),
2406        f32x4(0.0, 2 / h, 0.0, 0.0),
2407        f32x4(0.0, 0.0, 2 / r, 0.0),
2408        f32x4(0.0, 0.0, (near + far) / -r, 1.0),
2409    };
2410}
2411
2412// Produces Z values in [-1.0, 1.0] range (OpenGL defaults)
2413pub fn orthographicRhGl(w: f32, h: f32, near: f32, far: f32) Mat {
2414    assert(!math.approxEqAbs(f32, w, 0.0, 0.001));
2415    assert(!math.approxEqAbs(f32, h, 0.0, 0.001));
2416    assert(!math.approxEqAbs(f32, far, near, 0.001));
2417
2418    const r = near - far;
2419    return .{
2420        f32x4(2 / w, 0.0, 0.0, 0.0),
2421        f32x4(0.0, 2 / h, 0.0, 0.0),
2422        f32x4(0.0, 0.0, 2 / r, 0.0),
2423        f32x4(0.0, 0.0, (near + far) / r, 1.0),
2424    };
2425}
2426
2427pub fn orthographicOffCenterLh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat {
2428    assert(!math.approxEqAbs(f32, far, near, 0.001));
2429
2430    const r = 1 / (far - near);
2431    return .{
2432        f32x4(2 / (right - left), 0.0, 0.0, 0.0),
2433        f32x4(0.0, 2 / (top - bottom), 0.0, 0.0),
2434        f32x4(0.0, 0.0, r, 0.0),
2435        f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), -r * near, 1.0),
2436    };
2437}
2438
2439pub fn orthographicOffCenterRh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat {
2440    assert(!math.approxEqAbs(f32, far, near, 0.001));
2441
2442    const r = 1 / (near - far);
2443    return .{
2444        f32x4(2 / (right - left), 0.0, 0.0, 0.0),
2445        f32x4(0.0, 2 / (top - bottom), 0.0, 0.0),
2446        f32x4(0.0, 0.0, r, 0.0),
2447        f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), r * near, 1.0),
2448    };
2449}
2450
2451// Produces Z values in [-1.0, 1.0] range (OpenGL defaults)
2452pub fn orthographicOffCenterLhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat {
2453    assert(!math.approxEqAbs(f32, far, near, 0.001));
2454
2455    const r = far - near;
2456    return .{
2457        f32x4(2 / (right - left), 0.0, 0.0, 0.0),
2458        f32x4(0.0, 2 / (top - bottom), 0.0, 0.0),
2459        f32x4(0.0, 0.0, 2 / r, 0.0),
2460        f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), (near + far) / -r, 1.0),
2461    };
2462}
2463
2464// Produces Z values in [-1.0, 1.0] range (OpenGL defaults)
2465pub fn orthographicOffCenterRhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat {
2466    assert(!math.approxEqAbs(f32, far, near, 0.001));
2467
2468    const r = near - far;
2469    return .{
2470        f32x4(2 / (right - left), 0.0, 0.0, 0.0),
2471        f32x4(0.0, 2 / (top - bottom), 0.0, 0.0),
2472        f32x4(0.0, 0.0, 2 / r, 0.0),
2473        f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), (near + far) / r, 1.0),
2474    };
2475}
2476
2477pub fn determinant(m: Mat) F32x4 {
2478    var v0 = swizzle(m[2], .y, .x, .x, .x);
2479    var v1 = swizzle(m[3], .z, .z, .y, .y);
2480    var v2 = swizzle(m[2], .y, .x, .x, .x);
2481    var v3 = swizzle(m[3], .w, .w, .w, .z);
2482    var v4 = swizzle(m[2], .z, .z, .y, .y);
2483    var v5 = swizzle(m[3], .w, .w, .w, .z);
2484
2485    var p0 = v0 * v1;
2486    var p1 = v2 * v3;
2487    var p2 = v4 * v5;
2488
2489    v0 = swizzle(m[2], .z, .z, .y, .y);
2490    v1 = swizzle(m[3], .y, .x, .x, .x);
2491    v2 = swizzle(m[2], .w, .w, .w, .z);
2492    v3 = swizzle(m[3], .y, .x, .x, .x);
2493    v4 = swizzle(m[2], .w, .w, .w, .z);
2494    v5 = swizzle(m[3], .z, .z, .y, .y);
2495
2496    p0 = mulAdd(-v0, v1, p0);
2497    p1 = mulAdd(-v2, v3, p1);
2498    p2 = mulAdd(-v4, v5, p2);
2499
2500    v0 = swizzle(m[1], .w, .w, .w, .z);
2501    v1 = swizzle(m[1], .z, .z, .y, .y);
2502    v2 = swizzle(m[1], .y, .x, .x, .x);
2503
2504    const s = m[0] * f32x4(1.0, -1.0, 1.0, -1.0);
2505    var r = v0 * p0;
2506    r = mulAdd(-v1, p1, r);
2507    r = mulAdd(v2, p2, r);
2508    return dot4(s, r);
2509}
2510test "zmath.matrix.determinant" {
2511    const m = Mat{
2512        f32x4(10.0, -9.0, -12.0, 1.0),
2513        f32x4(7.0, -12.0, 11.0, 1.0),
2514        f32x4(-10.0, 10.0, 3.0, 1.0),
2515        f32x4(1.0, 2.0, 3.0, 4.0),
2516    };
2517    try expectVecApproxEqAbs(determinant(m), splat(F32x4, 2939.0), 0.0001);
2518}
2519
2520pub fn inverse(a: anytype) @TypeOf(a) {
2521    const T = @TypeOf(a);
2522    return switch (T) {
2523        Mat => inverseMat(a),
2524        Quat => inverseQuat(a),
2525        else => @compileError("zmath.inverse() not implemented for " ++ @typeName(T)),
2526    };
2527}
2528
2529fn inverseMat(m: Mat) Mat {
2530    return inverseDet(m, null);
2531}
2532
2533pub fn inverseDet(m: Mat, out_det: ?*F32x4) Mat {
2534    const mt = transpose(m);
2535    var v0: [4]F32x4 = undefined;
2536    var v1: [4]F32x4 = undefined;
2537
2538    v0[0] = swizzle(mt[2], .x, .x, .y, .y);
2539    v1[0] = swizzle(mt[3], .z, .w, .z, .w);
2540    v0[1] = swizzle(mt[0], .x, .x, .y, .y);
2541    v1[1] = swizzle(mt[1], .z, .w, .z, .w);
2542    v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) });
2543    v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) });
2544
2545    var d0 = v0[0] * v1[0];
2546    var d1 = v0[1] * v1[1];
2547    var d2 = v0[2] * v1[2];
2548
2549    v0[0] = swizzle(mt[2], .z, .w, .z, .w);
2550    v1[0] = swizzle(mt[3], .x, .x, .y, .y);
2551    v0[1] = swizzle(mt[0], .z, .w, .z, .w);
2552    v1[1] = swizzle(mt[1], .x, .x, .y, .y);
2553    v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) });
2554    v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) });
2555
2556    d0 = mulAdd(-v0[0], v1[0], d0);
2557    d1 = mulAdd(-v0[1], v1[1], d1);
2558    d2 = mulAdd(-v0[2], v1[2], d2);
2559
2560    v0[0] = swizzle(mt[1], .y, .z, .x, .y);
2561    v1[0] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 1, 3, 0 });
2562    v0[1] = swizzle(mt[0], .z, .x, .y, .x);
2563    v1[1] = @shuffle(f32, d0, d2, [4]i32{ 3, ~@as(i32, 1), 1, 2 });
2564    v0[2] = swizzle(mt[3], .y, .z, .x, .y);
2565    v1[2] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 1, 3, 0 });
2566    v0[3] = swizzle(mt[2], .z, .x, .y, .x);
2567    v1[3] = @shuffle(f32, d1, d2, [4]i32{ 3, ~@as(i32, 3), 1, 2 });
2568
2569    var c0 = v0[0] * v1[0];
2570    var c2 = v0[1] * v1[1];
2571    var c4 = v0[2] * v1[2];
2572    var c6 = v0[3] * v1[3];
2573
2574    v0[0] = swizzle(mt[1], .z, .w, .y, .z);
2575    v1[0] = @shuffle(f32, d0, d2, [4]i32{ 3, 0, 1, ~@as(i32, 0) });
2576    v0[1] = swizzle(mt[0], .w, .z, .w, .y);
2577    v1[1] = @shuffle(f32, d0, d2, [4]i32{ 2, 1, ~@as(i32, 0), 0 });
2578    v0[2] = swizzle(mt[3], .z, .w, .y, .z);
2579    v1[2] = @shuffle(f32, d1, d2, [4]i32{ 3, 0, 1, ~@as(i32, 2) });
2580    v0[3] = swizzle(mt[2], .w, .z, .w, .y);
2581    v1[3] = @shuffle(f32, d1, d2, [4]i32{ 2, 1, ~@as(i32, 2), 0 });
2582
2583    c0 = mulAdd(-v0[0], v1[0], c0);
2584    c2 = mulAdd(-v0[1], v1[1], c2);
2585    c4 = mulAdd(-v0[2], v1[2], c4);
2586    c6 = mulAdd(-v0[3], v1[3], c6);
2587
2588    v0[0] = swizzle(mt[1], .w, .x, .w, .x);
2589    v1[0] = @shuffle(f32, d0, d2, [4]i32{ 2, ~@as(i32, 1), ~@as(i32, 0), 2 });
2590    v0[1] = swizzle(mt[0], .y, .w, .x, .z);
2591    v1[1] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 0, 3, ~@as(i32, 0) });
2592    v0[2] = swizzle(mt[3], .w, .x, .w, .x);
2593    v1[2] = @shuffle(f32, d1, d2, [4]i32{ 2, ~@as(i32, 3), ~@as(i32, 2), 2 });
2594    v0[3] = swizzle(mt[2], .y, .w, .x, .z);
2595    v1[3] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 0, 3, ~@as(i32, 2) });
2596
2597    const c1 = mulAdd(-v0[0], v1[0], c0);
2598    const c3 = mulAdd(v0[1], v1[1], c2);
2599    const c5 = mulAdd(-v0[2], v1[2], c4);
2600    const c7 = mulAdd(v0[3], v1[3], c6);
2601
2602    c0 = mulAdd(v0[0], v1[0], c0);
2603    c2 = mulAdd(-v0[1], v1[1], c2);
2604    c4 = mulAdd(v0[2], v1[2], c4);
2605    c6 = mulAdd(-v0[3], v1[3], c6);
2606
2607    var mr = Mat{
2608        f32x4(c0[0], c1[1], c0[2], c1[3]),
2609        f32x4(c2[0], c3[1], c2[2], c3[3]),
2610        f32x4(c4[0], c5[1], c4[2], c5[3]),
2611        f32x4(c6[0], c7[1], c6[2], c7[3]),
2612    };
2613
2614    const det = dot4(mr[0], mt[0]);
2615    if (out_det != null) {
2616        out_det.?.* = det;
2617    }
2618
2619    if (math.approxEqAbs(f32, det[0], 0.0, math.floatEps(f32))) {
2620        return .{
2621            f32x4(0.0, 0.0, 0.0, 0.0),
2622            f32x4(0.0, 0.0, 0.0, 0.0),
2623            f32x4(0.0, 0.0, 0.0, 0.0),
2624            f32x4(0.0, 0.0, 0.0, 0.0),
2625        };
2626    }
2627
2628    const scale = splat(F32x4, 1.0) / det;
2629    mr[0] *= scale;
2630    mr[1] *= scale;
2631    mr[2] *= scale;
2632    mr[3] *= scale;
2633    return mr;
2634}
2635test "zmath.matrix.inverse" {
2636    const m = Mat{
2637        f32x4(10.0, -9.0, -12.0, 1.0),
2638        f32x4(7.0, -12.0, 11.0, 1.0),
2639        f32x4(-10.0, 10.0, 3.0, 1.0),
2640        f32x4(1.0, 2.0, 3.0, 4.0),
2641    };
2642    var det: F32x4 = undefined;
2643    const mi = inverseDet(m, &det);
2644    try expectVecApproxEqAbs(det, splat(F32x4, 2939.0), 0.0001);
2645
2646    try expectVecApproxEqAbs(mi[0], f32x4(-0.170806, -0.13576, -0.349439, 0.164001), 0.0001);
2647    try expectVecApproxEqAbs(mi[1], f32x4(-0.163661, -0.14801, -0.253147, 0.141204), 0.0001);
2648    try expectVecApproxEqAbs(mi[2], f32x4(-0.0871045, 0.00646478, -0.0785982, 0.0398095), 0.0001);
2649    try expectVecApproxEqAbs(mi[3], f32x4(0.18986, 0.103096, 0.272882, 0.10854), 0.0001);
2650}
2651
2652pub fn matFromNormAxisAngle(axis: Vec, angle: f32) Mat {
2653    const sincos_angle = sincos(angle);
2654
2655    const c2 = splat(F32x4, 1.0 - sincos_angle[1]);
2656    const c1 = splat(F32x4, sincos_angle[1]);
2657    const c0 = splat(F32x4, sincos_angle[0]);
2658
2659    const n0 = swizzle(axis, .y, .z, .x, .w);
2660    const n1 = swizzle(axis, .z, .x, .y, .w);
2661
2662    var v0 = c2 * n0 * n1;
2663    const r0 = c2 * axis * axis + c1;
2664    const r1 = c0 * axis + v0;
2665    var r2 = v0 - c0 * axis;
2666
2667    v0 = andInt(r0, f32x4_mask3);
2668
2669    var v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 2, ~@as(i32, 1), ~@as(i32, 2) });
2670    v1 = swizzle(v1, .y, .z, .w, .x);
2671
2672    var v2 = @shuffle(f32, r1, r2, [4]i32{ 1, 1, ~@as(i32, 0), ~@as(i32, 0) });
2673    v2 = swizzle(v2, .x, .z, .x, .z);
2674
2675    r2 = @shuffle(f32, v0, v1, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) });
2676    r2 = swizzle(r2, .x, .z, .w, .y);
2677
2678    var m: Mat = undefined;
2679    m[0] = r2;
2680
2681    r2 = @shuffle(f32, v0, v1, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) });
2682    r2 = swizzle(r2, .z, .x, .w, .y);
2683    m[1] = r2;
2684
2685    v2 = @shuffle(f32, v2, v0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) });
2686    m[2] = v2;
2687    m[3] = f32x4(0.0, 0.0, 0.0, 1.0);
2688    return m;
2689}
2690pub fn matFromAxisAngle(axis: Vec, angle: f32) Mat {
2691    assert(!all(axis == splat(F32x4, 0.0), 3));
2692    assert(!all(isInf(axis), 3));
2693    const normal = normalize3(axis);
2694    return matFromNormAxisAngle(normal, angle);
2695}
2696test "zmath.matrix.matFromAxisAngle" {
2697    {
2698        const m0 = matFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), math.pi * 0.25);
2699        const m1 = rotationX(math.pi * 0.25);
2700        try expectVecApproxEqAbs(m0[0], m1[0], 0.001);
2701        try expectVecApproxEqAbs(m0[1], m1[1], 0.001);
2702        try expectVecApproxEqAbs(m0[2], m1[2], 0.001);
2703        try expectVecApproxEqAbs(m0[3], m1[3], 0.001);
2704    }
2705    {
2706        const m0 = matFromAxisAngle(f32x4(0.0, 1.0, 0.0, 0.0), math.pi * 0.125);
2707        const m1 = rotationY(math.pi * 0.125);
2708        try expectVecApproxEqAbs(m0[0], m1[0], 0.001);
2709        try expectVecApproxEqAbs(m0[1], m1[1], 0.001);
2710        try expectVecApproxEqAbs(m0[2], m1[2], 0.001);
2711        try expectVecApproxEqAbs(m0[3], m1[3], 0.001);
2712    }
2713    {
2714        const m0 = matFromAxisAngle(f32x4(0.0, 0.0, 1.0, 0.0), math.pi * 0.333);
2715        const m1 = rotationZ(math.pi * 0.333);
2716        try expectVecApproxEqAbs(m0[0], m1[0], 0.001);
2717        try expectVecApproxEqAbs(m0[1], m1[1], 0.001);
2718        try expectVecApproxEqAbs(m0[2], m1[2], 0.001);
2719        try expectVecApproxEqAbs(m0[3], m1[3], 0.001);
2720    }
2721}
2722
2723pub fn matFromQuat(quat: Quat) Mat {
2724    const q0 = quat + quat;
2725    var q1 = quat * q0;
2726
2727    var v0 = swizzle(q1, .y, .x, .x, .w);
2728    v0 = andInt(v0, f32x4_mask3);
2729
2730    var v1 = swizzle(q1, .z, .z, .y, .w);
2731    v1 = andInt(v1, f32x4_mask3);
2732
2733    const r0 = (f32x4(1.0, 1.0, 1.0, 0.0) - v0) - v1;
2734
2735    v0 = swizzle(quat, .x, .x, .y, .w);
2736    v1 = swizzle(q0, .z, .y, .z, .w);
2737    v0 = v0 * v1;
2738
2739    v1 = swizzle(quat, .w, .w, .w, .w);
2740    const v2 = swizzle(q0, .y, .z, .x, .w);
2741    v1 = v1 * v2;
2742
2743    const r1 = v0 + v1;
2744    const r2 = v0 - v1;
2745
2746    v0 = @shuffle(f32, r1, r2, [4]i32{ 1, 2, ~@as(i32, 0), ~@as(i32, 1) });
2747    v0 = swizzle(v0, .x, .z, .w, .y);
2748    v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 2), ~@as(i32, 2) });
2749    v1 = swizzle(v1, .x, .z, .x, .z);
2750
2751    q1 = @shuffle(f32, r0, v0, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) });
2752    q1 = swizzle(q1, .x, .z, .w, .y);
2753
2754    var m: Mat = undefined;
2755    m[0] = q1;
2756
2757    q1 = @shuffle(f32, r0, v0, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) });
2758    q1 = swizzle(q1, .z, .x, .w, .y);
2759    m[1] = q1;
2760
2761    q1 = @shuffle(f32, v1, r0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) });
2762    m[2] = q1;
2763    m[3] = f32x4(0.0, 0.0, 0.0, 1.0);
2764    return m;
2765}
2766test "zmath.matrix.matFromQuat" {
2767    {
2768        const m = matFromQuat(f32x4(0.0, 0.0, 0.0, 1.0));
2769        try expectVecApproxEqAbs(m[0], f32x4(1.0, 0.0, 0.0, 0.0), 0.0001);
2770        try expectVecApproxEqAbs(m[1], f32x4(0.0, 1.0, 0.0, 0.0), 0.0001);
2771        try expectVecApproxEqAbs(m[2], f32x4(0.0, 0.0, 1.0, 0.0), 0.0001);
2772        try expectVecApproxEqAbs(m[3], f32x4(0.0, 0.0, 0.0, 1.0), 0.0001);
2773    }
2774}
2775
2776pub fn matFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Mat {
2777    return matFromRollPitchYawV(f32x4(pitch, yaw, roll, 0.0));
2778}
2779pub fn matFromRollPitchYawV(angles: Vec) Mat {
2780    return matFromQuat(quatFromRollPitchYawV(angles));
2781}
2782
2783pub fn matToQuat(m: Mat) Quat {
2784    return quatFromMat(m);
2785}
2786
2787pub inline fn loadMat(mem: []const f32) Mat {
2788    return .{
2789        load(mem[0..4], F32x4, 0),
2790        load(mem[4..8], F32x4, 0),
2791        load(mem[8..12], F32x4, 0),
2792        load(mem[12..16], F32x4, 0),
2793    };
2794}
2795test "zmath.loadMat" {
2796    const a = [18]f32{
2797        1.0,  2.0,  3.0,  4.0,
2798        5.0,  6.0,  7.0,  8.0,
2799        9.0,  10.0, 11.0, 12.0,
2800        13.0, 14.0, 15.0, 16.0,
2801        17.0, 18.0,
2802    };
2803    const m = loadMat(a[1..]);
2804    try expectVecEqual(m[0], f32x4(2.0, 3.0, 4.0, 5.0));
2805    try expectVecEqual(m[1], f32x4(6.0, 7.0, 8.0, 9.0));
2806    try expectVecEqual(m[2], f32x4(10.0, 11.0, 12.0, 13.0));
2807    try expectVecEqual(m[3], f32x4(14.0, 15.0, 16.0, 17.0));
2808}
2809
2810pub inline fn storeMat(mem: []f32, m: Mat) void {
2811    store(mem[0..4], m[0], 0);
2812    store(mem[4..8], m[1], 0);
2813    store(mem[8..12], m[2], 0);
2814    store(mem[12..16], m[3], 0);
2815}
2816
2817pub inline fn loadMat43(mem: []const f32) Mat {
2818    return .{
2819        f32x4(mem[0], mem[1], mem[2], 0.0),
2820        f32x4(mem[3], mem[4], mem[5], 0.0),
2821        f32x4(mem[6], mem[7], mem[8], 0.0),
2822        f32x4(mem[9], mem[10], mem[11], 1.0),
2823    };
2824}
2825
2826pub inline fn storeMat43(mem: []f32, m: Mat) void {
2827    store(mem[0..3], m[0], 3);
2828    store(mem[3..6], m[1], 3);
2829    store(mem[6..9], m[2], 3);
2830    store(mem[9..12], m[3], 3);
2831}
2832
2833pub inline fn loadMat34(mem: []const f32) Mat {
2834    return .{
2835        load(mem[0..4], F32x4, 0),
2836        load(mem[4..8], F32x4, 0),
2837        load(mem[8..12], F32x4, 0),
2838        f32x4(0.0, 0.0, 0.0, 1.0),
2839    };
2840}
2841
2842pub inline fn storeMat34(mem: []f32, m: Mat) void {
2843    store(mem[0..4], m[0], 0);
2844    store(mem[4..8], m[1], 0);
2845    store(mem[8..12], m[2], 0);
2846}
2847
2848pub inline fn matToArr(m: Mat) [16]f32 {
2849    var array: [16]f32 = undefined;
2850    storeMat(array[0..], m);
2851    return array;
2852}
2853
2854pub inline fn matToArr43(m: Mat) [12]f32 {
2855    var array: [12]f32 = undefined;
2856    storeMat43(array[0..], m);
2857    return array;
2858}
2859
2860pub inline fn matToArr34(m: Mat) [12]f32 {
2861    var array: [12]f32 = undefined;
2862    storeMat34(array[0..], m);
2863    return array;
2864}
2865// ------------------------------------------------------------------------------
2866//
2867// 5. Quaternion functions
2868//
2869// ------------------------------------------------------------------------------
2870pub fn qmul(q0: Quat, q1: Quat) Quat {
2871    var result = swizzle(q1, .w, .w, .w, .w);
2872    var q1x = swizzle(q1, .x, .x, .x, .x);
2873    var q1y = swizzle(q1, .y, .y, .y, .y);
2874    var q1z = swizzle(q1, .z, .z, .z, .z);
2875    result = result * q0;
2876    var q0_shuf = swizzle(q0, .w, .z, .y, .x);
2877    q1x = q1x * q0_shuf;
2878    q0_shuf = swizzle(q0_shuf, .y, .x, .w, .z);
2879    result = mulAdd(q1x, f32x4(1.0, -1.0, 1.0, -1.0), result);
2880    q1y = q1y * q0_shuf;
2881    q0_shuf = swizzle(q0_shuf, .w, .z, .y, .x);
2882    q1y = q1y * f32x4(1.0, 1.0, -1.0, -1.0);
2883    q1z = q1z * q0_shuf;
2884    q1y = mulAdd(q1z, f32x4(-1.0, 1.0, 1.0, -1.0), q1y);
2885    return result + q1y;
2886}
2887test "zmath.quaternion.mul" {
2888    {
2889        const q0 = f32x4(2.0, 3.0, 4.0, 1.0);
2890        const q1 = f32x4(3.0, 2.0, 1.0, 4.0);
2891        try expectVecApproxEqAbs(qmul(q0, q1), f32x4(16.0, 4.0, 22.0, -12.0), 0.0001);
2892    }
2893}
2894
2895pub fn quatToMat(quat: Quat) Mat {
2896    return matFromQuat(quat);
2897}
2898
2899pub fn quatToAxisAngle(quat: Quat, axis: *Vec, angle: *f32) void {
2900    axis.* = quat;
2901    angle.* = 2.0 * acos(quat[3]);
2902}
2903test "zmath.quaternion.quatToAxisAngle" {
2904    {
2905        const q0 = quatFromNormAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi);
2906        var axis: Vec = f32x4(4.0, 3.0, 2.0, 1.0);
2907        var angle: f32 = 10.0;
2908        quatToAxisAngle(q0, &axis, &angle);
2909        try expect(math.approxEqAbs(f32, axis[0], @sin(@as(f32, 0.25) * math.pi * 0.5), 0.0001));
2910        try expect(axis[1] == 0.0);
2911        try expect(axis[2] == 0.0);
2912        try expect(math.approxEqAbs(f32, angle, 0.25 * math.pi, 0.0001));
2913    }
2914}
2915
2916pub fn quatFromMat(m: Mat) Quat {
2917    const r0 = m[0];
2918    const r1 = m[1];
2919    const r2 = m[2];
2920    const r00 = swizzle(r0, .x, .x, .x, .x);
2921    const r11 = swizzle(r1, .y, .y, .y, .y);
2922    const r22 = swizzle(r2, .z, .z, .z, .z);
2923
2924    const x2gey2 = (r11 - r00) <= splat(F32x4, 0.0);
2925    const z2gew2 = (r11 + r00) <= splat(F32x4, 0.0);
2926    const x2py2gez2pw2 = r22 <= splat(F32x4, 0.0);
2927
2928    var t0 = mulAdd(r00, f32x4(1.0, -1.0, -1.0, 1.0), splat(F32x4, 1.0));
2929    var t1 = r11 * f32x4(-1.0, 1.0, -1.0, 1.0);
2930    var t2 = mulAdd(r22, f32x4(-1.0, -1.0, 1.0, 1.0), t0);
2931    const x2y2z2w2 = t1 + t2;
2932
2933    t0 = @shuffle(f32, r0, r1, [4]i32{ 1, 2, ~@as(i32, 2), ~@as(i32, 1) });
2934    t1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 0), ~@as(i32, 1) });
2935    t1 = swizzle(t1, .x, .z, .w, .y);
2936    const xyxzyz = t0 + t1;
2937
2938    t0 = @shuffle(f32, r2, r1, [4]i32{ 1, 0, ~@as(i32, 0), ~@as(i32, 0) });
2939    t1 = @shuffle(f32, r1, r0, [4]i32{ 2, 2, ~@as(i32, 2), ~@as(i32, 1) });
2940    t1 = swizzle(t1, .x, .z, .w, .y);
2941    const xwywzw = (t0 - t1) * f32x4(-1.0, 1.0, -1.0, 1.0);
2942
2943    t0 = @shuffle(f32, x2y2z2w2, xyxzyz, [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 0) });
2944    t1 = @shuffle(f32, x2y2z2w2, xwywzw, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 0) });
2945    t2 = @shuffle(f32, xyxzyz, xwywzw, [4]i32{ 1, 2, ~@as(i32, 0), ~@as(i32, 1) });
2946
2947    const tensor0 = @shuffle(f32, t0, t2, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) });
2948    const tensor1 = @shuffle(f32, t0, t2, [4]i32{ 2, 1, ~@as(i32, 1), ~@as(i32, 3) });
2949    const tensor2 = @shuffle(f32, t2, t1, [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 2) });
2950    const tensor3 = @shuffle(f32, t2, t1, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 1) });
2951
2952    t0 = select(x2gey2, tensor0, tensor1);
2953    t1 = select(z2gew2, tensor2, tensor3);
2954    t2 = select(x2py2gez2pw2, t0, t1);
2955
2956    return t2 / length4(t2);
2957}
2958test "zmath.quatFromMat" {
2959    {
2960        const q0 = quatFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi);
2961        const q1 = quatFromMat(rotationX(0.25 * math.pi));
2962        try expectVecApproxEqAbs(q0, q1, 0.0001);
2963    }
2964    {
2965        const q0 = quatFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi);
2966        const q1 = quatFromMat(matFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi));
2967        try expectVecApproxEqAbs(q0, q1, 0.0001);
2968    }
2969    {
2970        const q0 = quatFromRollPitchYaw(0.1 * math.pi, -0.2 * math.pi, 0.3 * math.pi);
2971        const q1 = quatFromMat(matFromRollPitchYaw(0.1 * math.pi, -0.2 * math.pi, 0.3 * math.pi));
2972        try expectVecApproxEqAbs(q0, q1, 0.0001);
2973    }
2974}
2975
2976pub fn quatFromNormAxisAngle(axis: Vec, angle: f32) Quat {
2977    const n = f32x4(axis[0], axis[1], axis[2], 1.0);
2978    const sc = sincos(0.5 * angle);
2979    return n * f32x4(sc[0], sc[0], sc[0], sc[1]);
2980}
2981pub fn quatFromAxisAngle(axis: Vec, angle: f32) Quat {
2982    assert(!all(axis == splat(F32x4, 0.0), 3));
2983    assert(!all(isInf(axis), 3));
2984    const normal = normalize3(axis);
2985    return quatFromNormAxisAngle(normal, angle);
2986}
2987test "zmath.quaternion.quatFromNormAxisAngle" {
2988    {
2989        const q0 = quatFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi);
2990        const q1 = quatFromAxisAngle(f32x4(0.0, 1.0, 0.0, 0.0), 0.125 * math.pi);
2991        const m0 = rotationX(0.25 * math.pi);
2992        const m1 = rotationY(0.125 * math.pi);
2993        const mr0 = quatToMat(qmul(q0, q1));
2994        const mr1 = mul(m0, m1);
2995        try expectVecApproxEqAbs(mr0[0], mr1[0], 0.0001);
2996        try expectVecApproxEqAbs(mr0[1], mr1[1], 0.0001);
2997        try expectVecApproxEqAbs(mr0[2], mr1[2], 0.0001);
2998        try expectVecApproxEqAbs(mr0[3], mr1[3], 0.0001);
2999    }
3000    {
3001        const m0 = quatToMat(quatFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi));
3002        const m1 = matFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi);
3003        try expectVecApproxEqAbs(m0[0], m1[0], 0.0001);
3004        try expectVecApproxEqAbs(m0[1], m1[1], 0.0001);
3005        try expectVecApproxEqAbs(m0[2], m1[2], 0.0001);
3006        try expectVecApproxEqAbs(m0[3], m1[3], 0.0001);
3007    }
3008}
3009
3010pub inline fn qidentity() Quat {
3011    return f32x4(@as(f32, 0.0), @as(f32, 0.0), @as(f32, 0.0), @as(f32, 1.0));
3012}
3013
3014pub inline fn conjugate(quat: Quat) Quat {
3015    return quat * f32x4(-1.0, -1.0, -1.0, 1.0);
3016}
3017
3018fn inverseQuat(quat: Quat) Quat {
3019    const l = lengthSq4(quat);
3020    const conj = conjugate(quat);
3021    return select(l <= splat(F32x4, math.floatEps(f32)), splat(F32x4, 0.0), conj / l);
3022}
3023test "zmath.quaternion.inverseQuat" {
3024    try expectVecApproxEqAbs(
3025        inverse(f32x4(2.0, 3.0, 4.0, 1.0)),
3026        f32x4(-1.0 / 15.0, -1.0 / 10.0, -2.0 / 15.0, 1.0 / 30.0),
3027        0.0001,
3028    );
3029    try expectVecApproxEqAbs(inverse(qidentity()), qidentity(), 0.0001);
3030}
3031
3032// Algorithm from: https://github.com/g-truc/glm/blob/master/glm/detail/type_quat.inl
3033pub fn rotate(q: Quat, v: Vec) Vec {
3034    const w = splat(F32x4, q[3]);
3035    const axis = f32x4(q[0], q[1], q[2], 0.0);
3036    const uv = cross3(axis, v);
3037    return v + ((uv * w) + cross3(axis, uv)) * splat(F32x4, 2.0);
3038}
3039test "zmath.quaternion.rotate" {
3040    const quat = quatFromRollPitchYaw(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi);
3041    const mat = matFromQuat(quat);
3042    const forward = f32x4(0.0, 0.0, -1.0, 0.0);
3043    const up = f32x4(0.0, 1.0, 0.0, 0.0);
3044    const right = f32x4(1.0, 0.0, 0.0, 0.0);
3045    try expectVecApproxEqAbs(rotate(quat, forward), mul(forward, mat), 0.0001);
3046    try expectVecApproxEqAbs(rotate(quat, up), mul(up, mat), 0.0001);
3047    try expectVecApproxEqAbs(rotate(quat, right), mul(right, mat), 0.0001);
3048}
3049
3050pub fn slerp(q0: Quat, q1: Quat, t: f32) Quat {
3051    return slerpV(q0, q1, splat(F32x4, t));
3052}
3053pub fn slerpV(q0: Quat, q1: Quat, t: F32x4) Quat {
3054    var cos_omega = dot4(q0, q1);
3055    const sign = select(cos_omega < splat(F32x4, 0.0), splat(F32x4, -1.0), splat(F32x4, 1.0));
3056
3057    cos_omega = cos_omega * sign;
3058    const sin_omega = sqrt(splat(F32x4, 1.0) - cos_omega * cos_omega);
3059
3060    const omega = atan2(sin_omega, cos_omega);
3061
3062    var v01 = t;
3063    v01 = xorInt(andInt(v01, f32x4_mask2), f32x4_sign_mask1);
3064    v01 = f32x4(1.0, 0.0, 0.0, 0.0) + v01;
3065
3066    var s0 = sin(v01 * omega) / sin_omega;
3067    s0 = select(cos_omega < splat(F32x4, 1.0 - 0.00001), s0, v01);
3068
3069    const s1 = swizzle(s0, .y, .y, .y, .y);
3070    s0 = swizzle(s0, .x, .x, .x, .x);
3071
3072    return q0 * s0 + sign * q1 * s1;
3073}
3074test "zmath.quaternion.slerp" {
3075    const from = f32x4(0.0, 0.0, 0.0, 1.0);
3076    const to = f32x4(0.5, 0.5, -0.5, 0.5);
3077    const result = slerp(from, to, 0.5);
3078    try expectVecApproxEqAbs(result, f32x4(0.28867513, 0.28867513, -0.28867513, 0.86602540), 0.0001);
3079}
3080
3081// Converts q back to euler angles, assuming a YXZ rotation order.
3082// See: http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler
3083pub fn quatToRollPitchYaw(q: Quat) [3]f32 {
3084    var angles: [3]f32 = undefined;
3085
3086    const p = swizzle(q, .w, .y, .x, .z);
3087    const sign = -1.0;
3088
3089    const singularity = p[0] * p[2] + sign * p[1] * p[3];
3090    if (singularity > 0.499) {
3091        angles[0] = math.pi * 0.5;
3092        angles[1] = 2.0 * math.atan2(p[1], p[0]);
3093        angles[2] = 0.0;
3094    } else if (singularity < -0.499) {
3095        angles[0] = -math.pi * 0.5;
3096        angles[1] = 2.0 * math.atan2(p[1], p[0]);
3097        angles[2] = 0.0;
3098    } else {
3099        const sq = p * p;
3100        const y = splat(F32x4, 2.0) * f32x4(p[0] * p[1] - sign * p[2] * p[3], p[0] * p[3] - sign * p[1] * p[2], 0.0, 0.0);
3101        const x = splat(F32x4, 1.0) - (splat(F32x4, 2.0) * f32x4(sq[1] + sq[2], sq[2] + sq[3], 0.0, 0.0));
3102        const res = atan2(y, x);
3103        angles[0] = math.asin(2.0 * singularity);
3104        angles[1] = res[0];
3105        angles[2] = res[1];
3106    }
3107
3108    return angles;
3109}
3110
3111test "zmath.quaternion.quatToRollPitchYaw" {
3112    {
3113        const expected = f32x4(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0);
3114        const quat = quatFromRollPitchYaw(expected[0], expected[1], expected[2]);
3115        const result = quatToRollPitchYaw(quat);
3116        try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001);
3117    }
3118
3119    {
3120        const expected = f32x4(0.3 * math.pi, 0.1 * math.pi, 0.2 * math.pi, 0.0);
3121        const quat = quatFromRollPitchYaw(expected[0], expected[1], expected[2]);
3122        const result = quatToRollPitchYaw(quat);
3123        try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001);
3124    }
3125
3126    // North pole singularity
3127    {
3128        const angle = f32x4(0.5 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0);
3129        const expected = f32x4(0.5 * math.pi, -0.1 * math.pi, 0.0, 0.0);
3130        const quat = quatFromRollPitchYaw(angle[0], angle[1], angle[2]);
3131        const result = quatToRollPitchYaw(quat);
3132        try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001);
3133    }
3134
3135    // South pole singularity
3136    {
3137        const angle = f32x4(-0.5 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0);
3138        const expected = f32x4(-0.5 * math.pi, 0.5 * math.pi, 0.0, 0.0);
3139        const quat = quatFromRollPitchYaw(angle[0], angle[1], angle[2]);
3140        const result = quatToRollPitchYaw(quat);
3141        try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001);
3142    }
3143}
3144
3145pub fn quatFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Quat {
3146    return quatFromRollPitchYawV(f32x4(pitch, yaw, roll, 0.0));
3147}
3148pub fn quatFromRollPitchYawV(angles: Vec) Quat { // | pitch | yaw | roll | 0 |
3149    const sc = sincos(splat(Vec, 0.5) * angles);
3150    const p0 = @shuffle(f32, sc[1], sc[0], [4]i32{ ~@as(i32, 0), 0, 0, 0 });
3151    const p1 = @shuffle(f32, sc[0], sc[1], [4]i32{ ~@as(i32, 0), 0, 0, 0 });
3152    const y0 = @shuffle(f32, sc[1], sc[0], [4]i32{ 1, ~@as(i32, 1), 1, 1 });
3153    const y1 = @shuffle(f32, sc[0], sc[1], [4]i32{ 1, ~@as(i32, 1), 1, 1 });
3154    const r0 = @shuffle(f32, sc[1], sc[0], [4]i32{ 2, 2, ~@as(i32, 2), 2 });
3155    const r1 = @shuffle(f32, sc[0], sc[1], [4]i32{ 2, 2, ~@as(i32, 2), 2 });
3156    const q1 = p1 * f32x4(1.0, -1.0, -1.0, 1.0) * y1;
3157    const q0 = p0 * y0 * r0;
3158    return mulAdd(q1, r1, q0);
3159}
3160test "zmath.quaternion.quatFromRollPitchYawV" {
3161    {
3162        const m0 = quatToMat(quatFromRollPitchYawV(f32x4(0.25 * math.pi, 0.0, 0.0, 0.0)));
3163        const m1 = rotationX(0.25 * math.pi);
3164        try expectVecApproxEqAbs(m0[0], m1[0], 0.0001);
3165        try expectVecApproxEqAbs(m0[1], m1[1], 0.0001);
3166        try expectVecApproxEqAbs(m0[2], m1[2], 0.0001);
3167        try expectVecApproxEqAbs(m0[3], m1[3], 0.0001);
3168    }
3169    {
3170        const m0 = quatToMat(quatFromRollPitchYaw(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi));
3171        const m1 = mul(
3172            rotationZ(0.3 * math.pi),
3173            mul(rotationX(0.1 * math.pi), rotationY(0.2 * math.pi)),
3174        );
3175        try expectVecApproxEqAbs(m0[0], m1[0], 0.0001);
3176        try expectVecApproxEqAbs(m0[1], m1[1], 0.0001);
3177        try expectVecApproxEqAbs(m0[2], m1[2], 0.0001);
3178        try expectVecApproxEqAbs(m0[3], m1[3], 0.0001);
3179    }
3180}
3181// ------------------------------------------------------------------------------
3182//
3183// 6. Color functions
3184//
3185// ------------------------------------------------------------------------------
3186pub fn adjustSaturation(color: F32x4, saturation: f32) F32x4 {
3187    const luminance = dot3(f32x4(0.2125, 0.7154, 0.0721, 0.0), color);
3188    var result = mulAdd(color - luminance, f32x4s(saturation), luminance);
3189    result[3] = color[3];
3190    return result;
3191}
3192
3193pub fn adjustContrast(color: F32x4, contrast: f32) F32x4 {
3194    var result = mulAdd(color - f32x4s(0.5), f32x4s(contrast), f32x4s(0.5));
3195    result[3] = color[3];
3196    return result;
3197}
3198
3199pub fn rgbToHsl(rgb: F32x4) F32x4 {
3200    const r = swizzle(rgb, .x, .x, .x, .x);
3201    const g = swizzle(rgb, .y, .y, .y, .y);
3202    const b = swizzle(rgb, .z, .z, .z, .z);
3203
3204    const minv = min(r, min(g, b));
3205    const maxv = max(r, max(g, b));
3206
3207    const l = (minv + maxv) * f32x4s(0.5);
3208    const d = maxv - minv;
3209    const la = select(boolx4(true, true, true, false), l, rgb);
3210
3211    if (all(d < f32x4s(math.floatEps(f32)), 3)) {
3212        return select(boolx4(true, true, false, false), f32x4s(0.0), la);
3213    } else {
3214        var s: F32x4 = undefined;
3215        var h: F32x4 = undefined;
3216
3217        const d2 = minv + maxv;
3218
3219        if (all(l > f32x4s(0.5), 3)) {
3220            s = d / (f32x4s(2.0) - d2);
3221        } else {
3222            s = d / d2;
3223        }
3224
3225        if (all(r == maxv, 3)) {
3226            h = (g - b) / d;
3227        } else if (all(g == maxv, 3)) {
3228            h = f32x4s(2.0) + (b - r) / d;
3229        } else {
3230            h = f32x4s(4.0) + (r - g) / d;
3231        }
3232
3233        h /= f32x4s(6.0);
3234
3235        if (all(h < f32x4s(0.0), 3)) {
3236            h += f32x4s(1.0);
3237        }
3238
3239        const lha = select(boolx4(true, true, false, false), h, la);
3240        return select(boolx4(true, false, true, true), lha, s);
3241    }
3242}
3243test "zmath.color.rgbToHsl" {
3244    try expectVecApproxEqAbs(rgbToHsl(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.6111, 0.6, 0.5, 1.0), 0.0001);
3245    try expectVecApproxEqAbs(rgbToHsl(f32x4(1.0, 0.0, 0.0, 0.5)), f32x4(0.0, 1.0, 0.5, 0.5), 0.0001);
3246    try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 1.0, 0.0, 0.25)), f32x4(0.3333, 1.0, 0.5, 0.25), 0.0001);
3247    try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 0.0, 1.0, 1.0)), f32x4(0.6666, 1.0, 0.5, 1.0), 0.0001);
3248    try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 0.0, 0.0, 1.0)), f32x4(0.0, 0.0, 0.0, 1.0), 0.0001);
3249    try expectVecApproxEqAbs(rgbToHsl(f32x4(1.0, 1.0, 1.0, 1.0)), f32x4(0.0, 0.0, 1.0, 1.0), 0.0001);
3250}
3251
3252fn hueToClr(p: F32x4, q: F32x4, h: F32x4) F32x4 {
3253    var t = h;
3254
3255    if (all(t < f32x4s(0.0), 3))
3256        t += f32x4s(1.0);
3257
3258    if (all(t > f32x4s(1.0), 3))
3259        t -= f32x4s(1.0);
3260
3261    if (all(t < f32x4s(1.0 / 6.0), 3))
3262        return mulAdd(q - p, f32x4s(6.0) * t, p);
3263
3264    if (all(t < f32x4s(0.5), 3))
3265        return q;
3266
3267    if (all(t < f32x4s(2.0 / 3.0), 3))
3268        return mulAdd(q - p, f32x4s(6.0) * (f32x4s(2.0 / 3.0) - t), p);
3269
3270    return p;
3271}
3272
3273pub fn hslToRgb(hsl: F32x4) F32x4 {
3274    const s = swizzle(hsl, .y, .y, .y, .y);
3275    const l = swizzle(hsl, .z, .z, .z, .z);
3276
3277    if (all(isNearEqual(s, f32x4s(0.0), f32x4s(math.floatEps(f32))), 3)) {
3278        return select(boolx4(true, true, true, false), l, hsl);
3279    } else {
3280        const h = swizzle(hsl, .x, .x, .x, .x);
3281        var q: F32x4 = undefined;
3282        if (all(l < f32x4s(0.5), 3)) {
3283            q = l * (f32x4s(1.0) + s);
3284        } else {
3285            q = (l + s) - (l * s);
3286        }
3287
3288        const p = f32x4s(2.0) * l - q;
3289
3290        const r = hueToClr(p, q, h + f32x4s(1.0 / 3.0));
3291        const g = hueToClr(p, q, h);
3292        const b = hueToClr(p, q, h - f32x4s(1.0 / 3.0));
3293
3294        const rg = select(boolx4(true, false, false, false), r, g);
3295        const ba = select(boolx4(true, true, true, false), b, hsl);
3296        return select(boolx4(true, true, false, false), rg, ba);
3297    }
3298}
3299test "zmath.color.hslToRgb" {
3300    try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), hslToRgb(f32x4(0.6111, 0.6, 0.5, 1.0)), 0.0001);
3301    try expectVecApproxEqAbs(f32x4(1.0, 0.0, 0.0, 0.5), hslToRgb(f32x4(0.0, 1.0, 0.5, 0.5)), 0.0001);
3302    try expectVecApproxEqAbs(f32x4(0.0, 1.0, 0.0, 0.25), hslToRgb(f32x4(0.3333, 1.0, 0.5, 0.25)), 0.0005);
3303    try expectVecApproxEqAbs(f32x4(0.0, 0.0, 1.0, 1.0), hslToRgb(f32x4(0.6666, 1.0, 0.5, 1.0)), 0.0005);
3304    try expectVecApproxEqAbs(f32x4(0.0, 0.0, 0.0, 1.0), hslToRgb(f32x4(0.0, 0.0, 0.0, 1.0)), 0.0001);
3305    try expectVecApproxEqAbs(f32x4(1.0, 1.0, 1.0, 1.0), hslToRgb(f32x4(0.0, 0.0, 1.0, 1.0)), 0.0001);
3306    try expectVecApproxEqAbs(hslToRgb(rgbToHsl(f32x4(1.0, 1.0, 1.0, 1.0))), f32x4(1.0, 1.0, 1.0, 1.0), 0.0005);
3307    try expectVecApproxEqAbs(
3308        hslToRgb(rgbToHsl(f32x4(0.82198, 0.1839, 0.632, 1.0))),
3309        f32x4(0.82198, 0.1839, 0.632, 1.0),
3310        0.0005,
3311    );
3312    try expectVecApproxEqAbs(
3313        rgbToHsl(hslToRgb(f32x4(0.82198, 0.1839, 0.632, 1.0))),
3314        f32x4(0.82198, 0.1839, 0.632, 1.0),
3315        0.0005,
3316    );
3317    try expectVecApproxEqAbs(
3318        rgbToHsl(hslToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))),
3319        f32x4(0.1839, 0.82198, 0.632, 1.0),
3320        0.0005,
3321    );
3322    try expectVecApproxEqAbs(
3323        hslToRgb(rgbToHsl(f32x4(0.1839, 0.632, 0.82198, 1.0))),
3324        f32x4(0.1839, 0.632, 0.82198, 1.0),
3325        0.0005,
3326    );
3327}
3328
3329pub fn rgbToHsv(rgb: F32x4) F32x4 {
3330    const r = swizzle(rgb, .x, .x, .x, .x);
3331    const g = swizzle(rgb, .y, .y, .y, .y);
3332    const b = swizzle(rgb, .z, .z, .z, .z);
3333
3334    const minv = min(r, min(g, b));
3335    const v = max(r, max(g, b));
3336    const d = v - minv;
3337    const s = if (all(isNearEqual(v, f32x4s(0.0), f32x4s(math.floatEps(f32))), 3)) f32x4s(0.0) else d / v;
3338
3339    if (all(d < f32x4s(math.floatEps(f32)), 3)) {
3340        const hv = select(boolx4(true, false, false, false), f32x4s(0.0), v);
3341        const hva = select(boolx4(true, true, true, false), hv, rgb);
3342        return select(boolx4(true, false, true, true), hva, s);
3343    } else {
3344        var h: F32x4 = undefined;
3345        if (all(r == v, 3)) {
3346            h = (g - b) / d;
3347            if (all(g < b, 3))
3348                h += f32x4s(6.0);
3349        } else if (all(g == v, 3)) {
3350            h = f32x4s(2.0) + (b - r) / d;
3351        } else {
3352            h = f32x4s(4.0) + (r - g) / d;
3353        }
3354
3355        h /= f32x4s(6.0);
3356        const hv = select(boolx4(true, false, false, false), h, v);
3357        const hva = select(boolx4(true, true, true, false), hv, rgb);
3358        return select(boolx4(true, false, true, true), hva, s);
3359    }
3360}
3361test "zmath.color.rgbToHsv" {
3362    try expectVecApproxEqAbs(rgbToHsv(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.6111, 0.75, 0.8, 1.0), 0.0001);
3363    try expectVecApproxEqAbs(rgbToHsv(f32x4(0.4, 0.2, 0.8, 1.0)), f32x4(0.7222, 0.75, 0.8, 1.0), 0.0001);
3364    try expectVecApproxEqAbs(rgbToHsv(f32x4(0.4, 0.8, 0.2, 1.0)), f32x4(0.2777, 0.75, 0.8, 1.0), 0.0001);
3365    try expectVecApproxEqAbs(rgbToHsv(f32x4(1.0, 0.0, 0.0, 0.5)), f32x4(0.0, 1.0, 1.0, 0.5), 0.0001);
3366    try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 1.0, 0.0, 0.25)), f32x4(0.3333, 1.0, 1.0, 0.25), 0.0001);
3367    try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 0.0, 1.0, 1.0)), f32x4(0.6666, 1.0, 1.0, 1.0), 0.0001);
3368    try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 0.0, 0.0, 1.0)), f32x4(0.0, 0.0, 0.0, 1.0), 0.0001);
3369    try expectVecApproxEqAbs(rgbToHsv(f32x4(1.0, 1.0, 1.0, 1.0)), f32x4(0.0, 0.0, 1.0, 1.0), 0.0001);
3370}
3371
3372pub fn hsvToRgb(hsv: F32x4) F32x4 {
3373    const h = swizzle(hsv, .x, .x, .x, .x);
3374    const s = swizzle(hsv, .y, .y, .y, .y);
3375    const v = swizzle(hsv, .z, .z, .z, .z);
3376
3377    const h6 = h * f32x4s(6.0);
3378    const i = floor(h6);
3379    const f = h6 - i;
3380
3381    const p = v * (f32x4s(1.0) - s);
3382    const q = v * (f32x4s(1.0) - f * s);
3383    const t = v * (f32x4s(1.0) - (f32x4s(1.0) - f) * s);
3384
3385    const ii = @as(i32, @intFromFloat(mod(i, f32x4s(6.0))[0]));
3386    const rgb = switch (ii) {
3387        0 => blk: {
3388            const vt = select(boolx4(true, false, false, false), v, t);
3389            break :blk select(boolx4(true, true, false, false), vt, p);
3390        },
3391        1 => blk: {
3392            const qv = select(boolx4(true, false, false, false), q, v);
3393            break :blk select(boolx4(true, true, false, false), qv, p);
3394        },
3395        2 => blk: {
3396            const pv = select(boolx4(true, false, false, false), p, v);
3397            break :blk select(boolx4(true, true, false, false), pv, t);
3398        },
3399        3 => blk: {
3400            const pq = select(boolx4(true, false, false, false), p, q);
3401            break :blk select(boolx4(true, true, false, false), pq, v);
3402        },
3403        4 => blk: {
3404            const tp = select(boolx4(true, false, false, false), t, p);
3405            break :blk select(boolx4(true, true, false, false), tp, v);
3406        },
3407        5 => blk: {
3408            const vp = select(boolx4(true, false, false, false), v, p);
3409            break :blk select(boolx4(true, true, false, false), vp, q);
3410        },
3411        else => unreachable,
3412    };
3413    return select(boolx4(true, true, true, false), rgb, hsv);
3414}
3415test "zmath.color.hsvToRgb" {
3416    const epsilon = 0.0005;
3417    try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), hsvToRgb(f32x4(0.6111, 0.75, 0.8, 1.0)), epsilon);
3418    try expectVecApproxEqAbs(f32x4(0.4, 0.2, 0.8, 1.0), hsvToRgb(f32x4(0.7222, 0.75, 0.8, 1.0)), epsilon);
3419    try expectVecApproxEqAbs(f32x4(0.4, 0.8, 0.2, 1.0), hsvToRgb(f32x4(0.2777, 0.75, 0.8, 1.0)), epsilon);
3420    try expectVecApproxEqAbs(f32x4(1.0, 0.0, 0.0, 0.5), hsvToRgb(f32x4(0.0, 1.0, 1.0, 0.5)), epsilon);
3421    try expectVecApproxEqAbs(f32x4(0.0, 1.0, 0.0, 0.25), hsvToRgb(f32x4(0.3333, 1.0, 1.0, 0.25)), epsilon);
3422    try expectVecApproxEqAbs(f32x4(0.0, 0.0, 1.0, 1.0), hsvToRgb(f32x4(0.6666, 1.0, 1.0, 1.0)), epsilon);
3423    try expectVecApproxEqAbs(f32x4(0.0, 0.0, 0.0, 1.0), hsvToRgb(f32x4(0.0, 0.0, 0.0, 1.0)), epsilon);
3424    try expectVecApproxEqAbs(f32x4(1.0, 1.0, 1.0, 1.0), hsvToRgb(f32x4(0.0, 0.0, 1.0, 1.0)), epsilon);
3425    try expectVecApproxEqAbs(
3426        hsvToRgb(rgbToHsv(f32x4(0.1839, 0.632, 0.82198, 1.0))),
3427        f32x4(0.1839, 0.632, 0.82198, 1.0),
3428        epsilon,
3429    );
3430    try expectVecApproxEqAbs(
3431        hsvToRgb(rgbToHsv(f32x4(0.82198, 0.1839, 0.632, 1.0))),
3432        f32x4(0.82198, 0.1839, 0.632, 1.0),
3433        epsilon,
3434    );
3435    try expectVecApproxEqAbs(
3436        rgbToHsv(hsvToRgb(f32x4(0.82198, 0.1839, 0.632, 1.0))),
3437        f32x4(0.82198, 0.1839, 0.632, 1.0),
3438        epsilon,
3439    );
3440    try expectVecApproxEqAbs(
3441        rgbToHsv(hsvToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))),
3442        f32x4(0.1839, 0.82198, 0.632, 1.0),
3443        epsilon,
3444    );
3445}
3446
3447pub fn rgbToSrgb(rgb: F32x4) F32x4 {
3448    const static = struct {
3449        const cutoff = f32x4(0.0031308, 0.0031308, 0.0031308, 1.0);
3450        const linear = f32x4(12.92, 12.92, 12.92, 1.0);
3451        const scale = f32x4(1.055, 1.055, 1.055, 1.0);
3452        const bias = f32x4(0.055, 0.055, 0.055, 1.0);
3453        const rgamma = 1.0 / 2.4;
3454    };
3455    var v = saturate(rgb);
3456    const v0 = v * static.linear;
3457    const v1 = static.scale * f32x4(
3458        math.pow(f32, v[0], static.rgamma),
3459        math.pow(f32, v[1], static.rgamma),
3460        math.pow(f32, v[2], static.rgamma),
3461        v[3],
3462    ) - static.bias;
3463    v = select(v < static.cutoff, v0, v1);
3464    return select(boolx4(true, true, true, false), v, rgb);
3465}
3466test "zmath.color.rgbToSrgb" {
3467    const epsilon = 0.001;
3468    try expectVecApproxEqAbs(rgbToSrgb(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.484, 0.665, 0.906, 1.0), epsilon);
3469}
3470
3471pub fn srgbToRgb(srgb: F32x4) F32x4 {
3472    const static = struct {
3473        const cutoff = f32x4(0.04045, 0.04045, 0.04045, 1.0);
3474        const rlinear = f32x4(1.0 / 12.92, 1.0 / 12.92, 1.0 / 12.92, 1.0);
3475        const scale = f32x4(1.0 / 1.055, 1.0 / 1.055, 1.0 / 1.055, 1.0);
3476        const bias = f32x4(0.055, 0.055, 0.055, 1.0);
3477        const gamma = 2.4;
3478    };
3479    var v = saturate(srgb);
3480    const v0 = v * static.rlinear;
3481    var v1 = static.scale * (v + static.bias);
3482    v1 = f32x4(
3483        math.pow(f32, v1[0], static.gamma),
3484        math.pow(f32, v1[1], static.gamma),
3485        math.pow(f32, v1[2], static.gamma),
3486        v1[3],
3487    );
3488    v = select(v > static.cutoff, v1, v0);
3489    return select(boolx4(true, true, true, false), v, srgb);
3490}
3491test "zmath.color.srgbToRgb" {
3492    const epsilon = 0.0007;
3493    try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), srgbToRgb(f32x4(0.484, 0.665, 0.906, 1.0)), epsilon);
3494    try expectVecApproxEqAbs(
3495        rgbToSrgb(srgbToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))),
3496        f32x4(0.1839, 0.82198, 0.632, 1.0),
3497        epsilon,
3498    );
3499}
3500// ------------------------------------------------------------------------------
3501//
3502// X. Misc functions
3503//
3504// ------------------------------------------------------------------------------
3505pub fn linePointDistance(linept0: Vec, linept1: Vec, pt: Vec) F32x4 {
3506    const ptvec = pt - linept0;
3507    const linevec = linept1 - linept0;
3508    const scale = dot3(ptvec, linevec) / lengthSq3(linevec);
3509    return length3(ptvec - linevec * scale);
3510}
3511test "zmath.linePointDistance" {
3512    {
3513        const linept0 = f32x4(-1.0, -2.0, -3.0, 1.0);
3514        const linept1 = f32x4(1.0, 2.0, 3.0, 1.0);
3515        const pt = f32x4(1.0, 1.0, 1.0, 1.0);
3516        const v = linePointDistance(linept0, linept1, pt);
3517        try expectVecApproxEqAbs(v, splat(F32x4, 0.654), 0.001);
3518    }
3519}
3520
3521fn sin32(v: f32) f32 {
3522    var y = v - math.tau * @round(v * 1.0 / math.tau);
3523
3524    if (y > 0.5 * math.pi) {
3525        y = math.pi - y;
3526    } else if (y < -math.pi * 0.5) {
3527        y = -math.pi - y;
3528    }
3529    const y2 = y * y;
3530
3531    // 11-degree minimax approximation
3532    var sinv = mulAdd(@as(f32, -2.3889859e-08), y2, 2.7525562e-06);
3533    sinv = mulAdd(sinv, y2, -0.00019840874);
3534    sinv = mulAdd(sinv, y2, 0.0083333310);
3535    sinv = mulAdd(sinv, y2, -0.16666667);
3536    return y * mulAdd(sinv, y2, 1.0);
3537}
3538fn cos32(v: f32) f32 {
3539    var y = v - math.tau * @round(v * 1.0 / math.tau);
3540
3541    const sign = blk: {
3542        if (y > 0.5 * math.pi) {
3543            y = math.pi - y;
3544            break :blk @as(f32, -1.0);
3545        } else if (y < -math.pi * 0.5) {
3546            y = -math.pi - y;
3547            break :blk @as(f32, -1.0);
3548        } else {
3549            break :blk @as(f32, 1.0);
3550        }
3551    };
3552    const y2 = y * y;
3553
3554    // 10-degree minimax approximation
3555    var cosv = mulAdd(@as(f32, -2.6051615e-07), y2, 2.4760495e-05);
3556    cosv = mulAdd(cosv, y2, -0.0013888378);
3557    cosv = mulAdd(cosv, y2, 0.041666638);
3558    cosv = mulAdd(cosv, y2, -0.5);
3559    return sign * mulAdd(cosv, y2, 1.0);
3560}
3561fn sincos32(v: f32) [2]f32 {
3562    var y = v - math.tau * @round(v * 1.0 / math.tau);
3563
3564    const sign = blk: {
3565        if (y > 0.5 * math.pi) {
3566            y = math.pi - y;
3567            break :blk @as(f32, -1.0);
3568        } else if (y < -math.pi * 0.5) {
3569            y = -math.pi - y;
3570            break :blk @as(f32, -1.0);
3571        } else {
3572            break :blk @as(f32, 1.0);
3573        }
3574    };
3575    const y2 = y * y;
3576
3577    // 11-degree minimax approximation
3578    var sinv = mulAdd(@as(f32, -2.3889859e-08), y2, 2.7525562e-06);
3579    sinv = mulAdd(sinv, y2, -0.00019840874);
3580    sinv = mulAdd(sinv, y2, 0.0083333310);
3581    sinv = mulAdd(sinv, y2, -0.16666667);
3582    sinv = y * mulAdd(sinv, y2, 1.0);
3583
3584    // 10-degree minimax approximation
3585    var cosv = mulAdd(@as(f32, -2.6051615e-07), y2, 2.4760495e-05);
3586    cosv = mulAdd(cosv, y2, -0.0013888378);
3587    cosv = mulAdd(cosv, y2, 0.041666638);
3588    cosv = mulAdd(cosv, y2, -0.5);
3589    cosv = sign * mulAdd(cosv, y2, 1.0);
3590
3591    return .{ sinv, cosv };
3592}
3593test "zmath.sincos32" {
3594    const epsilon = 0.0001;
3595
3596    try expect(math.isNan(sincos32(math.inf(f32))[0]));
3597    try expect(math.isNan(sincos32(math.inf(f32))[1]));
3598    try expect(math.isNan(sincos32(-math.inf(f32))[0]));
3599    try expect(math.isNan(sincos32(-math.inf(f32))[1]));
3600    try expect(math.isNan(sincos32(math.nan(f32))[0]));
3601    try expect(math.isNan(sincos32(-math.nan(f32))[1]));
3602
3603    try expect(math.isNan(sin32(math.inf(f32))));
3604    try expect(math.isNan(cos32(math.inf(f32))));
3605    try expect(math.isNan(sin32(-math.inf(f32))));
3606    try expect(math.isNan(cos32(-math.inf(f32))));
3607    try expect(math.isNan(sin32(math.nan(f32))));
3608    try expect(math.isNan(cos32(-math.nan(f32))));
3609
3610    var f: f32 = -100.0;
3611    var i: u32 = 0;
3612    while (i < 100) : (i += 1) {
3613        const sc = sincos32(f);
3614        const s0 = sin32(f);
3615        const c0 = cos32(f);
3616        const s = @sin(f);
3617        const c = @cos(f);
3618        try expect(math.approxEqAbs(f32, sc[0], s, epsilon));
3619        try expect(math.approxEqAbs(f32, sc[1], c, epsilon));
3620        try expect(math.approxEqAbs(f32, s0, s, epsilon));
3621        try expect(math.approxEqAbs(f32, c0, c, epsilon));
3622        f += 0.12345 * @as(f32, @floatFromInt(i));
3623    }
3624}
3625
3626fn asin32(v: f32) f32 {
3627    const x = @abs(v);
3628    var omx = 1.0 - x;
3629    if (omx < 0.0) {
3630        omx = 0.0;
3631    }
3632    const root = @sqrt(omx);
3633
3634    // 7-degree minimax approximation
3635    var result = mulAdd(@as(f32, -0.0012624911), x, 0.0066700901);
3636    result = mulAdd(result, x, -0.0170881256);
3637    result = mulAdd(result, x, 0.0308918810);
3638    result = mulAdd(result, x, -0.0501743046);
3639    result = mulAdd(result, x, 0.0889789874);
3640    result = mulAdd(result, x, -0.2145988016);
3641    result = root * mulAdd(result, x, 1.5707963050);
3642
3643    return if (v >= 0.0) 0.5 * math.pi - result else result - 0.5 * math.pi;
3644}
3645test "zmath.asin32" {
3646    const epsilon = 0.0001;
3647
3648    try expect(math.approxEqAbs(f32, asin(@as(f32, -1.1)), -0.5 * math.pi, epsilon));
3649    try expect(math.approxEqAbs(f32, asin(@as(f32, 1.1)), 0.5 * math.pi, epsilon));
3650    try expect(math.approxEqAbs(f32, asin(@as(f32, -1000.1)), -0.5 * math.pi, epsilon));
3651    try expect(math.approxEqAbs(f32, asin(@as(f32, 100000.1)), 0.5 * math.pi, epsilon));
3652    try expect(math.isNan(asin(math.inf(f32))));
3653    try expect(math.isNan(asin(-math.inf(f32))));
3654    try expect(math.isNan(asin(math.nan(f32))));
3655    try expect(math.isNan(asin(-math.nan(f32))));
3656
3657    try expectVecApproxEqAbs(asin(splat(F32x8, -100.0)), splat(F32x8, -0.5 * math.pi), epsilon);
3658    try expectVecApproxEqAbs(asin(splat(F32x16, 100.0)), splat(F32x16, 0.5 * math.pi), epsilon);
3659    try expect(all(isNan(asin(splat(F32x4, math.inf(f32)))), 0) == true);
3660    try expect(all(isNan(asin(splat(F32x4, -math.inf(f32)))), 0) == true);
3661    try expect(all(isNan(asin(splat(F32x4, math.nan(f32)))), 0) == true);
3662    try expect(all(isNan(asin(splat(F32x4, math.snan(f32)))), 0) == true);
3663
3664    var f: f32 = -1.0;
3665    var i: u32 = 0;
3666    while (i < 8) : (i += 1) {
3667        const r0 = asin32(f);
3668        const r1 = math.asin(f);
3669        const r4 = asin(splat(F32x4, f));
3670        const r8 = asin(splat(F32x8, f));
3671        const r16 = asin(splat(F32x16, f));
3672        try expect(math.approxEqAbs(f32, r0, r1, epsilon));
3673        try expectVecApproxEqAbs(r4, splat(F32x4, r1), epsilon);
3674        try expectVecApproxEqAbs(r8, splat(F32x8, r1), epsilon);
3675        try expectVecApproxEqAbs(r16, splat(F32x16, r1), epsilon);
3676        f += 0.09 * @as(f32, @floatFromInt(i));
3677    }
3678}
3679
3680fn acos32(v: f32) f32 {
3681    const x = @abs(v);
3682    var omx = 1.0 - x;
3683    if (omx < 0.0) {
3684        omx = 0.0;
3685    }
3686    const root = @sqrt(omx);
3687
3688    // 7-degree minimax approximation
3689    var result = mulAdd(@as(f32, -0.0012624911), x, 0.0066700901);
3690    result = mulAdd(result, x, -0.0170881256);
3691    result = mulAdd(result, x, 0.0308918810);
3692    result = mulAdd(result, x, -0.0501743046);
3693    result = mulAdd(result, x, 0.0889789874);
3694    result = mulAdd(result, x, -0.2145988016);
3695    result = root * mulAdd(result, x, 1.5707963050);
3696
3697    return if (v >= 0.0) result else math.pi - result;
3698}
3699test "zmath.acos32" {
3700    const epsilon = 0.1;
3701
3702    try expect(math.approxEqAbs(f32, acos(@as(f32, -1.1)), math.pi, epsilon));
3703    try expect(math.approxEqAbs(f32, acos(@as(f32, -10000.1)), math.pi, epsilon));
3704    try expect(math.approxEqAbs(f32, acos(@as(f32, 1.1)), 0.0, epsilon));
3705    try expect(math.approxEqAbs(f32, acos(@as(f32, 1000.1)), 0.0, epsilon));
3706    try expect(math.isNan(acos(math.inf(f32))));
3707    try expect(math.isNan(acos(-math.inf(f32))));
3708    try expect(math.isNan(acos(math.nan(f32))));
3709    try expect(math.isNan(acos(-math.nan(f32))));
3710
3711    try expectVecApproxEqAbs(acos(splat(F32x8, -100.0)), splat(F32x8, math.pi), epsilon);
3712    try expectVecApproxEqAbs(acos(splat(F32x16, 100.0)), splat(F32x16, 0.0), epsilon);
3713    try expect(all(isNan(acos(splat(F32x4, math.inf(f32)))), 0) == true);
3714    try expect(all(isNan(acos(splat(F32x4, -math.inf(f32)))), 0) == true);
3715    try expect(all(isNan(acos(splat(F32x4, math.nan(f32)))), 0) == true);
3716    try expect(all(isNan(acos(splat(F32x4, math.snan(f32)))), 0) == true);
3717
3718    var f: f32 = -1.0;
3719    var i: u32 = 0;
3720    while (i < 8) : (i += 1) {
3721        const r0 = acos32(f);
3722        const r1 = math.acos(f);
3723        const r4 = acos(splat(F32x4, f));
3724        const r8 = acos(splat(F32x8, f));
3725        const r16 = acos(splat(F32x16, f));
3726        try expect(math.approxEqAbs(f32, r0, r1, epsilon));
3727        try expectVecApproxEqAbs(r4, splat(F32x4, r1), epsilon);
3728        try expectVecApproxEqAbs(r8, splat(F32x8, r1), epsilon);
3729        try expectVecApproxEqAbs(r16, splat(F32x16, r1), epsilon);
3730        f += 0.09 * @as(f32, @floatFromInt(i));
3731    }
3732}
3733
3734pub fn modAngle32(in_angle: f32) f32 {
3735    const angle = in_angle + math.pi;
3736    var temp: f32 = @abs(angle);
3737    temp = temp - (2.0 * math.pi * @as(f32, @floatFromInt(@as(i32, @intFromFloat(temp / math.pi)))));
3738    temp = temp - math.pi;
3739    if (angle < 0.0) {
3740        temp = -temp;
3741    }
3742    return temp;
3743}
3744
3745pub fn cmulSoa(re0: anytype, im0: anytype, re1: anytype, im1: anytype) [2]@TypeOf(re0, im0, re1, im1) {
3746    const re0_re1 = re0 * re1;
3747    const re0_im1 = re0 * im1;
3748    return .{
3749        mulAdd(-im0, im1, re0_re1), // re
3750        mulAdd(re1, im0, re0_im1), // im
3751    };
3752}
3753// ------------------------------------------------------------------------------
3754//
3755// FFT (implementation based on xdsp.h from DirectXMath)
3756//
3757// ------------------------------------------------------------------------------
3758fn fftButterflyDit4_1(re0: *F32x4, im0: *F32x4) void {
3759    const re0l = swizzle(re0.*, .x, .x, .y, .y);
3760    const re0h = swizzle(re0.*, .z, .z, .w, .w);
3761
3762    const im0l = swizzle(im0.*, .x, .x, .y, .y);
3763    const im0h = swizzle(im0.*, .z, .z, .w, .w);
3764
3765    const re_temp = mulAdd(re0h, f32x4(1.0, -1.0, 1.0, -1.0), re0l);
3766    const im_temp = mulAdd(im0h, f32x4(1.0, -1.0, 1.0, -1.0), im0l);
3767
3768    const re_shuf0 = @shuffle(f32, re_temp, im_temp, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) });
3769    const re_shuf = swizzle(re_shuf0, .x, .w, .x, .w);
3770    const im_shuf = swizzle(re_shuf0, .z, .y, .z, .y);
3771
3772    const re_templ = swizzle(re_temp, .x, .y, .x, .y);
3773    const im_templ = swizzle(im_temp, .x, .y, .x, .y);
3774
3775    re0.* = mulAdd(re_shuf, f32x4(1.0, 1.0, -1.0, -1.0), re_templ);
3776    im0.* = mulAdd(im_shuf, f32x4(1.0, -1.0, -1.0, 1.0), im_templ);
3777}
3778
3779fn fftButterflyDit4_4(
3780    re0: *F32x4,
3781    re1: *F32x4,
3782    re2: *F32x4,
3783    re3: *F32x4,
3784    im0: *F32x4,
3785    im1: *F32x4,
3786    im2: *F32x4,
3787    im3: *F32x4,
3788    unity_table_re: []const F32x4,
3789    unity_table_im: []const F32x4,
3790    stride: u32,
3791    last: bool,
3792) void {
3793    const re_temp0 = re0.* + re2.*;
3794    const im_temp0 = im0.* + im2.*;
3795
3796    const re_temp2 = re1.* + re3.*;
3797    const im_temp2 = im1.* + im3.*;
3798
3799    const re_temp1 = re0.* - re2.*;
3800    const im_temp1 = im0.* - im2.*;
3801
3802    const re_temp3 = re1.* - re3.*;
3803    const im_temp3 = im1.* - im3.*;
3804
3805    var re_temp4 = re_temp0 + re_temp2;
3806    var im_temp4 = im_temp0 + im_temp2;
3807
3808    var re_temp5 = re_temp1 + im_temp3;
3809    var im_temp5 = im_temp1 - re_temp3;
3810
3811    var re_temp6 = re_temp0 - re_temp2;
3812    var im_temp6 = im_temp0 - im_temp2;
3813
3814    var re_temp7 = re_temp1 - im_temp3;
3815    var im_temp7 = im_temp1 + re_temp3;
3816
3817    {
3818        const re_im = cmulSoa(re_temp5, im_temp5, unity_table_re[stride], unity_table_im[stride]);
3819        re_temp5 = re_im[0];
3820        im_temp5 = re_im[1];
3821    }
3822    {
3823        const re_im = cmulSoa(re_temp6, im_temp6, unity_table_re[stride * 2], unity_table_im[stride * 2]);
3824        re_temp6 = re_im[0];
3825        im_temp6 = re_im[1];
3826    }
3827    {
3828        const re_im = cmulSoa(re_temp7, im_temp7, unity_table_re[stride * 3], unity_table_im[stride * 3]);
3829        re_temp7 = re_im[0];
3830        im_temp7 = re_im[1];
3831    }
3832
3833    if (last) {
3834        fftButterflyDit4_1(&re_temp4, &im_temp4);
3835        fftButterflyDit4_1(&re_temp5, &im_temp5);
3836        fftButterflyDit4_1(&re_temp6, &im_temp6);
3837        fftButterflyDit4_1(&re_temp7, &im_temp7);
3838    }
3839
3840    re0.* = re_temp4;
3841    im0.* = im_temp4;
3842
3843    re1.* = re_temp5;
3844    im1.* = im_temp5;
3845
3846    re2.* = re_temp6;
3847    im2.* = im_temp6;
3848
3849    re3.* = re_temp7;
3850    im3.* = im_temp7;
3851}
3852
3853fn fft4(re: []F32x4, im: []F32x4, count: u32) void {
3854    assert(std.math.isPowerOfTwo(count));
3855    assert(re.len >= count);
3856    assert(im.len >= count);
3857
3858    var index: u32 = 0;
3859    while (index < count) : (index += 1) {
3860        fftButterflyDit4_1(&re[index], &im[index]);
3861    }
3862}
3863test "zmath.fft4" {
3864    const epsilon = 0.0001;
3865    var re = [_]F32x4{f32x4(1.0, 2.0, 3.0, 4.0)};
3866    var im = [_]F32x4{f32x4s(0.0)};
3867    fft4(re[0..], im[0..], 1);
3868
3869    var re_uns: [1]F32x4 = undefined;
3870    var im_uns: [1]F32x4 = undefined;
3871    fftUnswizzle(re[0..], re_uns[0..]);
3872    fftUnswizzle(im[0..], im_uns[0..]);
3873
3874    try expectVecApproxEqAbs(re_uns[0], f32x4(10.0, -2.0, -2.0, -2.0), epsilon);
3875    try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 2.0, 0.0, -2.0), epsilon);
3876}
3877
3878fn fft8(re: []F32x4, im: []F32x4, count: u32) void {
3879    assert(std.math.isPowerOfTwo(count));
3880    assert(re.len >= 2 * count);
3881    assert(im.len >= 2 * count);
3882
3883    var index: u32 = 0;
3884    while (index < count) : (index += 1) {
3885        var pre = re[index * 2 ..];
3886        var pim = im[index * 2 ..];
3887
3888        var odds_re = @shuffle(f32, pre[0], pre[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) });
3889        var evens_re = @shuffle(f32, pre[0], pre[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) });
3890        var odds_im = @shuffle(f32, pim[0], pim[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) });
3891        var evens_im = @shuffle(f32, pim[0], pim[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) });
3892        fftButterflyDit4_1(&odds_re, &odds_im);
3893        fftButterflyDit4_1(&evens_re, &evens_im);
3894
3895        {
3896            const re_im = cmulSoa(
3897                odds_re,
3898                odds_im,
3899                f32x4(1.0, 0.70710677, 0.0, -0.70710677),
3900                f32x4(0.0, -0.70710677, -1.0, -0.70710677),
3901            );
3902            pre[0] = evens_re + re_im[0];
3903            pim[0] = evens_im + re_im[1];
3904        }
3905        {
3906            const re_im = cmulSoa(
3907                odds_re,
3908                odds_im,
3909                f32x4(-1.0, -0.70710677, 0.0, 0.70710677),
3910                f32x4(0.0, 0.70710677, 1.0, 0.70710677),
3911            );
3912            pre[1] = evens_re + re_im[0];
3913            pim[1] = evens_im + re_im[1];
3914        }
3915    }
3916}
3917test "zmath.fft8" {
3918    const epsilon = 0.0001;
3919    var re = [_]F32x4{ f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0) };
3920    var im = [_]F32x4{ f32x4s(0.0), f32x4s(0.0) };
3921    fft8(re[0..], im[0..], 1);
3922
3923    var re_uns: [2]F32x4 = undefined;
3924    var im_uns: [2]F32x4 = undefined;
3925    fftUnswizzle(re[0..], re_uns[0..]);
3926    fftUnswizzle(im[0..], im_uns[0..]);
3927
3928    try expectVecApproxEqAbs(re_uns[0], f32x4(36.0, -4.0, -4.0, -4.0), epsilon);
3929    try expectVecApproxEqAbs(re_uns[1], f32x4(-4.0, -4.0, -4.0, -4.0), epsilon);
3930    try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 9.656854, 4.0, 1.656854), epsilon);
3931    try expectVecApproxEqAbs(im_uns[1], f32x4(0.0, -1.656854, -4.0, -9.656854), epsilon);
3932}
3933
3934fn fft16(re: []F32x4, im: []F32x4, count: u32) void {
3935    assert(std.math.isPowerOfTwo(count));
3936    assert(re.len >= 4 * count);
3937    assert(im.len >= 4 * count);
3938
3939    const static = struct {
3940        const unity_table_re = [4]F32x4{
3941            f32x4(1.0, 1.0, 1.0, 1.0),
3942            f32x4(1.0, 0.92387950, 0.70710677, 0.38268343),
3943            f32x4(1.0, 0.70710677, -4.3711388e-008, -0.70710677),
3944            f32x4(1.0, 0.38268343, -0.70710677, -0.92387950),
3945        };
3946        const unity_table_im = [4]F32x4{
3947            f32x4(-0.0, -0.0, -0.0, -0.0),
3948            f32x4(-0.0, -0.38268343, -0.70710677, -0.92387950),
3949            f32x4(-0.0, -0.70710677, -1.0, -0.70710677),
3950            f32x4(-0.0, -0.92387950, -0.70710677, 0.38268343),
3951        };
3952    };
3953
3954    var index: u32 = 0;
3955    while (index < count) : (index += 1) {
3956        fftButterflyDit4_4(
3957            &re[index * 4],
3958            &re[index * 4 + 1],
3959            &re[index * 4 + 2],
3960            &re[index * 4 + 3],
3961            &im[index * 4],
3962            &im[index * 4 + 1],
3963            &im[index * 4 + 2],
3964            &im[index * 4 + 3],
3965            static.unity_table_re[0..],
3966            static.unity_table_im[0..],
3967            1,
3968            true,
3969        );
3970    }
3971}
3972test "zmath.fft16" {
3973    const epsilon = 0.0001;
3974    var re = [_]F32x4{
3975        f32x4(1.0, 2.0, 3.0, 4.0),
3976        f32x4(5.0, 6.0, 7.0, 8.0),
3977        f32x4(9.0, 10.0, 11.0, 12.0),
3978        f32x4(13.0, 14.0, 15.0, 16.0),
3979    };
3980    var im = [_]F32x4{ f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0) };
3981    fft16(re[0..], im[0..], 1);
3982
3983    var re_uns: [4]F32x4 = undefined;
3984    var im_uns: [4]F32x4 = undefined;
3985    fftUnswizzle(re[0..], re_uns[0..]);
3986    fftUnswizzle(im[0..], im_uns[0..]);
3987
3988    try expectVecApproxEqAbs(re_uns[0], f32x4(136.0, -8.0, -8.0, -8.0), epsilon);
3989    try expectVecApproxEqAbs(re_uns[1], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon);
3990    try expectVecApproxEqAbs(re_uns[2], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon);
3991    try expectVecApproxEqAbs(re_uns[3], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon);
3992    try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 40.218716, 19.313708, 11.972846), epsilon);
3993    try expectVecApproxEqAbs(im_uns[1], f32x4(8.0, 5.345429, 3.313708, 1.591299), epsilon);
3994    try expectVecApproxEqAbs(im_uns[2], f32x4(0.0, -1.591299, -3.313708, -5.345429), epsilon);
3995    try expectVecApproxEqAbs(im_uns[3], f32x4(-8.0, -11.972846, -19.313708, -40.218716), epsilon);
3996}
3997
3998fn fftN(re: []F32x4, im: []F32x4, unity_table: []const F32x4, length: u32, count: u32) void {
3999    assert(length > 16);
4000    assert(std.math.isPowerOfTwo(length));
4001    assert(std.math.isPowerOfTwo(count));
4002    assert(re.len >= length * count / 4);
4003    assert(re.len == im.len);
4004
4005    const total = count * length;
4006    const total_vectors = total / 4;
4007    const stage_vectors = length / 4;
4008    const stage_vectors_mask = stage_vectors - 1;
4009    const stride = length / 16;
4010    const stride_mask = stride - 1;
4011    const stride_inv_mask = ~stride_mask;
4012
4013    var unity_table_re = unity_table;
4014    var unity_table_im = unity_table[length / 4 ..];
4015
4016    var index: u32 = 0;
4017    while (index < total_vectors / 4) : (index += 1) {
4018        const n = (index & stride_inv_mask) * 4 + (index & stride_mask);
4019        fftButterflyDit4_4(
4020            &re[n],
4021            &re[n + stride],
4022            &re[n + stride * 2],
4023            &re[n + stride * 3],
4024            &im[n],
4025            &im[n + stride],
4026            &im[n + stride * 2],
4027            &im[n + stride * 3],
4028            unity_table_re[(n & stage_vectors_mask)..],
4029            unity_table_im[(n & stage_vectors_mask)..],
4030            stride,
4031            false,
4032        );
4033    }
4034
4035    if (length > 16 * 4) {
4036        fftN(re, im, unity_table[(length / 2)..], length / 4, count * 4);
4037    } else if (length == 16 * 4) {
4038        fft16(re, im, count * 4);
4039    } else if (length == 8 * 4) {
4040        fft8(re, im, count * 4);
4041    } else if (length == 4 * 4) {
4042        fft4(re, im, count * 4);
4043    }
4044}
4045test "zmath.fftN" {
4046    var unity_table: [128]F32x4 = undefined;
4047    const epsilon = 0.0001;
4048
4049    // 32 samples
4050    {
4051        var re = [_]F32x4{
4052            f32x4(1.0, 2.0, 3.0, 4.0),     f32x4(5.0, 6.0, 7.0, 8.0),
4053            f32x4(9.0, 10.0, 11.0, 12.0),  f32x4(13.0, 14.0, 15.0, 16.0),
4054            f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0),
4055            f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0),
4056        };
4057        var im = [_]F32x4{
4058            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4059            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4060        };
4061
4062        fftInitUnityTable(unity_table[0..32]);
4063        fft(re[0..], im[0..], unity_table[0..32]);
4064
4065        try expectVecApproxEqAbs(re[0], f32x4(528.0, -16.0, -16.0, -16.0), epsilon);
4066        try expectVecApproxEqAbs(re[1], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon);
4067        try expectVecApproxEqAbs(re[2], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon);
4068        try expectVecApproxEqAbs(re[3], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon);
4069        try expectVecApproxEqAbs(re[4], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon);
4070        try expectVecApproxEqAbs(re[5], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon);
4071        try expectVecApproxEqAbs(re[6], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon);
4072        try expectVecApproxEqAbs(re[7], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon);
4073        try expectVecApproxEqAbs(im[0], f32x4(0.0, 162.450726, 80.437432, 52.744931), epsilon);
4074        try expectVecApproxEqAbs(im[1], f32x4(38.627417, 29.933895, 23.945692, 19.496056), epsilon);
4075        try expectVecApproxEqAbs(im[2], f32x4(16.0, 13.130861, 10.690858, 8.552178), epsilon);
4076        try expectVecApproxEqAbs(im[3], f32x4(6.627417, 4.853547, 3.182598, 1.575862), epsilon);
4077        try expectVecApproxEqAbs(im[4], f32x4(0.0, -1.575862, -3.182598, -4.853547), epsilon);
4078        try expectVecApproxEqAbs(im[5], f32x4(-6.627417, -8.552178, -10.690858, -13.130861), epsilon);
4079        try expectVecApproxEqAbs(im[6], f32x4(-16.0, -19.496056, -23.945692, -29.933895), epsilon);
4080        try expectVecApproxEqAbs(im[7], f32x4(-38.627417, -52.744931, -80.437432, -162.450726), epsilon);
4081    }
4082
4083    // 64 samples
4084    {
4085        var re = [_]F32x4{
4086            f32x4(1.0, 2.0, 3.0, 4.0),     f32x4(5.0, 6.0, 7.0, 8.0),
4087            f32x4(9.0, 10.0, 11.0, 12.0),  f32x4(13.0, 14.0, 15.0, 16.0),
4088            f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0),
4089            f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0),
4090            f32x4(1.0, 2.0, 3.0, 4.0),     f32x4(5.0, 6.0, 7.0, 8.0),
4091            f32x4(9.0, 10.0, 11.0, 12.0),  f32x4(13.0, 14.0, 15.0, 16.0),
4092            f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0),
4093            f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0),
4094        };
4095        var im = [_]F32x4{
4096            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4097            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4098            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4099            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4100        };
4101
4102        fftInitUnityTable(unity_table[0..64]);
4103        fft(re[0..], im[0..], unity_table[0..64]);
4104
4105        try expectVecApproxEqAbs(re[0], f32x4(1056.0, 0.0, -32.0, 0.0), epsilon);
4106        var i: u32 = 1;
4107        while (i < 16) : (i += 1) {
4108            try expectVecApproxEqAbs(re[i], f32x4(-32.0, 0.0, -32.0, 0.0), epsilon);
4109        }
4110
4111        const expected = [_]f32{
4112            0.0,        0.0,      324.901452,  0.000000, 160.874864,  0.0,      105.489863,  0.000000,
4113            77.254834,  0.0,      59.867789,   0.0,      47.891384,   0.0,      38.992113,   0.0,
4114            32.000000,  0.000000, 26.261721,   0.000000, 21.381716,   0.000000, 17.104356,   0.000000,
4115            13.254834,  0.000000, 9.707094,    0.000000, 6.365196,    0.000000, 3.151725,    0.000000,
4116            0.000000,   0.000000, -3.151725,   0.000000, -6.365196,   0.000000, -9.707094,   0.000000,
4117            -13.254834, 0.000000, -17.104356,  0.000000, -21.381716,  0.000000, -26.261721,  0.000000,
4118            -32.000000, 0.000000, -38.992113,  0.000000, -47.891384,  0.000000, -59.867789,  0.000000,
4119            -77.254834, 0.000000, -105.489863, 0.000000, -160.874864, 0.000000, -324.901452, 0.000000,
4120        };
4121        for (expected, 0..) |e, ie| {
4122            try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon));
4123        }
4124    }
4125
4126    // 128 samples
4127    {
4128        var re = [_]F32x4{
4129            f32x4(1.0, 2.0, 3.0, 4.0),     f32x4(5.0, 6.0, 7.0, 8.0),
4130            f32x4(9.0, 10.0, 11.0, 12.0),  f32x4(13.0, 14.0, 15.0, 16.0),
4131            f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0),
4132            f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0),
4133            f32x4(1.0, 2.0, 3.0, 4.0),     f32x4(5.0, 6.0, 7.0, 8.0),
4134            f32x4(9.0, 10.0, 11.0, 12.0),  f32x4(13.0, 14.0, 15.0, 16.0),
4135            f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0),
4136            f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0),
4137            f32x4(1.0, 2.0, 3.0, 4.0),     f32x4(5.0, 6.0, 7.0, 8.0),
4138            f32x4(9.0, 10.0, 11.0, 12.0),  f32x4(13.0, 14.0, 15.0, 16.0),
4139            f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0),
4140            f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0),
4141            f32x4(1.0, 2.0, 3.0, 4.0),     f32x4(5.0, 6.0, 7.0, 8.0),
4142            f32x4(9.0, 10.0, 11.0, 12.0),  f32x4(13.0, 14.0, 15.0, 16.0),
4143            f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0),
4144            f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0),
4145        };
4146        var im = [_]F32x4{
4147            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4148            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4149            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4150            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4151            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4152            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4153            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4154            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4155        };
4156
4157        fftInitUnityTable(unity_table[0..128]);
4158        fft(re[0..], im[0..], unity_table[0..128]);
4159
4160        try expectVecApproxEqAbs(re[0], f32x4(2112.0, 0.0, 0.0, 0.0), epsilon);
4161        var i: u32 = 1;
4162        while (i < 32) : (i += 1) {
4163            try expectVecApproxEqAbs(re[i], f32x4(-64.0, 0.0, 0.0, 0.0), epsilon);
4164        }
4165
4166        const expected = [_]f32{
4167            0.000000,    0.000000, 0.000000, 0.000000, 649.802905,  0.000000, 0.000000, 0.000000,
4168            321.749727,  0.000000, 0.000000, 0.000000, 210.979725,  0.000000, 0.000000, 0.000000,
4169            154.509668,  0.000000, 0.000000, 0.000000, 119.735578,  0.000000, 0.000000, 0.000000,
4170            95.782769,   0.000000, 0.000000, 0.000000, 77.984226,   0.000000, 0.000000, 0.000000,
4171            64.000000,   0.000000, 0.000000, 0.000000, 52.523443,   0.000000, 0.000000, 0.000000,
4172            42.763433,   0.000000, 0.000000, 0.000000, 34.208713,   0.000000, 0.000000, 0.000000,
4173            26.509668,   0.000000, 0.000000, 0.000000, 19.414188,   0.000000, 0.000000, 0.000000,
4174            12.730392,   0.000000, 0.000000, 0.000000, 6.303450,    0.000000, 0.000000, 0.000000,
4175            0.000000,    0.000000, 0.000000, 0.000000, -6.303450,   0.000000, 0.000000, 0.000000,
4176            -12.730392,  0.000000, 0.000000, 0.000000, -19.414188,  0.000000, 0.000000, 0.000000,
4177            -26.509668,  0.000000, 0.000000, 0.000000, -34.208713,  0.000000, 0.000000, 0.000000,
4178            -42.763433,  0.000000, 0.000000, 0.000000, -52.523443,  0.000000, 0.000000, 0.000000,
4179            -64.000000,  0.000000, 0.000000, 0.000000, -77.984226,  0.000000, 0.000000, 0.000000,
4180            -95.782769,  0.000000, 0.000000, 0.000000, -119.735578, 0.000000, 0.000000, 0.000000,
4181            -154.509668, 0.000000, 0.000000, 0.000000, -210.979725, 0.000000, 0.000000, 0.000000,
4182            -321.749727, 0.000000, 0.000000, 0.000000, -649.802905, 0.000000, 0.000000, 0.000000,
4183        };
4184        for (expected, 0..) |e, ie| {
4185            try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon));
4186        }
4187    }
4188}
4189
4190fn fftUnswizzle(input: []const F32x4, output: []F32x4) void {
4191    assert(std.math.isPowerOfTwo(input.len));
4192    assert(input.len == output.len);
4193    assert(input.ptr != output.ptr);
4194
4195    const log2_length = std.math.log2_int(usize, input.len * 4);
4196    assert(log2_length >= 2);
4197
4198    const length = input.len;
4199
4200    const f32_output = @as([*]f32, @ptrCast(output.ptr))[0 .. output.len * 4];
4201
4202    const static = struct {
4203        const swizzle_table = [256]u8{
4204            0x00, 0x40, 0x80, 0xC0, 0x10, 0x50, 0x90, 0xD0, 0x20, 0x60, 0xA0, 0xE0, 0x30, 0x70, 0xB0, 0xF0,
4205            0x04, 0x44, 0x84, 0xC4, 0x14, 0x54, 0x94, 0xD4, 0x24, 0x64, 0xA4, 0xE4, 0x34, 0x74, 0xB4, 0xF4,
4206            0x08, 0x48, 0x88, 0xC8, 0x18, 0x58, 0x98, 0xD8, 0x28, 0x68, 0xA8, 0xE8, 0x38, 0x78, 0xB8, 0xF8,
4207            0x0C, 0x4C, 0x8C, 0xCC, 0x1C, 0x5C, 0x9C, 0xDC, 0x2C, 0x6C, 0xAC, 0xEC, 0x3C, 0x7C, 0xBC, 0xFC,
4208            0x01, 0x41, 0x81, 0xC1, 0x11, 0x51, 0x91, 0xD1, 0x21, 0x61, 0xA1, 0xE1, 0x31, 0x71, 0xB1, 0xF1,
4209            0x05, 0x45, 0x85, 0xC5, 0x15, 0x55, 0x95, 0xD5, 0x25, 0x65, 0xA5, 0xE5, 0x35, 0x75, 0xB5, 0xF5,
4210            0x09, 0x49, 0x89, 0xC9, 0x19, 0x59, 0x99, 0xD9, 0x29, 0x69, 0xA9, 0xE9, 0x39, 0x79, 0xB9, 0xF9,
4211            0x0D, 0x4D, 0x8D, 0xCD, 0x1D, 0x5D, 0x9D, 0xDD, 0x2D, 0x6D, 0xAD, 0xED, 0x3D, 0x7D, 0xBD, 0xFD,
4212            0x02, 0x42, 0x82, 0xC2, 0x12, 0x52, 0x92, 0xD2, 0x22, 0x62, 0xA2, 0xE2, 0x32, 0x72, 0xB2, 0xF2,
4213            0x06, 0x46, 0x86, 0xC6, 0x16, 0x56, 0x96, 0xD6, 0x26, 0x66, 0xA6, 0xE6, 0x36, 0x76, 0xB6, 0xF6,
4214            0x0A, 0x4A, 0x8A, 0xCA, 0x1A, 0x5A, 0x9A, 0xDA, 0x2A, 0x6A, 0xAA, 0xEA, 0x3A, 0x7A, 0xBA, 0xFA,
4215            0x0E, 0x4E, 0x8E, 0xCE, 0x1E, 0x5E, 0x9E, 0xDE, 0x2E, 0x6E, 0xAE, 0xEE, 0x3E, 0x7E, 0xBE, 0xFE,
4216            0x03, 0x43, 0x83, 0xC3, 0x13, 0x53, 0x93, 0xD3, 0x23, 0x63, 0xA3, 0xE3, 0x33, 0x73, 0xB3, 0xF3,
4217            0x07, 0x47, 0x87, 0xC7, 0x17, 0x57, 0x97, 0xD7, 0x27, 0x67, 0xA7, 0xE7, 0x37, 0x77, 0xB7, 0xF7,
4218            0x0B, 0x4B, 0x8B, 0xCB, 0x1B, 0x5B, 0x9B, 0xDB, 0x2B, 0x6B, 0xAB, 0xEB, 0x3B, 0x7B, 0xBB, 0xFB,
4219            0x0F, 0x4F, 0x8F, 0xCF, 0x1F, 0x5F, 0x9F, 0xDF, 0x2F, 0x6F, 0xAF, 0xEF, 0x3F, 0x7F, 0xBF, 0xFF,
4220        };
4221    };
4222
4223    if ((log2_length & 1) == 0) {
4224        const rev32 = @as(u6, @intCast(32 - log2_length));
4225        var index: usize = 0;
4226        while (index < length) : (index += 1) {
4227            const n = index * 4;
4228            const addr =
4229                (@as(usize, @intCast(static.swizzle_table[n & 0xff])) << 24) |
4230                (@as(usize, @intCast(static.swizzle_table[(n >> 8) & 0xff])) << 16) |
4231                (@as(usize, @intCast(static.swizzle_table[(n >> 16) & 0xff])) << 8) |
4232                @as(usize, @intCast(static.swizzle_table[(n >> 24) & 0xff]));
4233            f32_output[addr >> rev32] = input[index][0];
4234            f32_output[(0x40000000 | addr) >> rev32] = input[index][1];
4235            f32_output[(0x80000000 | addr) >> rev32] = input[index][2];
4236            f32_output[(0xC0000000 | addr) >> rev32] = input[index][3];
4237        }
4238    } else {
4239        const rev7 = @as(usize, 1) << @as(u6, @intCast(log2_length - 3));
4240        const rev32 = @as(u6, @intCast(32 - (log2_length - 3)));
4241        var index: usize = 0;
4242        while (index < length) : (index += 1) {
4243            const n = index / 2;
4244            var addr =
4245                (((@as(usize, @intCast(static.swizzle_table[n & 0xff])) << 24) |
4246                (@as(usize, @intCast(static.swizzle_table[(n >> 8) & 0xff])) << 16) |
4247                (@as(usize, @intCast(static.swizzle_table[(n >> 16) & 0xff])) << 8) |
4248                (@as(usize, @intCast(static.swizzle_table[(n >> 24) & 0xff])))) >> rev32) |
4249                ((index & 1) * rev7 * 4);
4250            f32_output[addr] = input[index][0];
4251            addr += rev7;
4252            f32_output[addr] = input[index][1];
4253            addr += rev7;
4254            f32_output[addr] = input[index][2];
4255            addr += rev7;
4256            f32_output[addr] = input[index][3];
4257        }
4258    }
4259}
4260
4261pub fn fftInitUnityTable(out_unity_table: []F32x4) void {
4262    assert(std.math.isPowerOfTwo(out_unity_table.len));
4263    assert(out_unity_table.len >= 32 and out_unity_table.len <= 512);
4264
4265    var unity_table = out_unity_table;
4266
4267    const v0123 = f32x4(0.0, 1.0, 2.0, 3.0);
4268    var length = out_unity_table.len / 4;
4269    var vlstep = f32x4s(0.5 * math.pi / @as(f32, @floatFromInt(length)));
4270
4271    while (true) {
4272        length /= 4;
4273        var vjp = v0123;
4274
4275        var j: u32 = 0;
4276        while (j < length) : (j += 1) {
4277            unity_table[j] = f32x4s(1.0);
4278            unity_table[j + length * 4] = f32x4s(0.0);
4279
4280            var vls = vjp * vlstep;
4281            var sin_cos = sincos(vls);
4282            unity_table[j + length] = sin_cos[1];
4283            unity_table[j + length * 5] = sin_cos[0] * f32x4s(-1.0);
4284
4285            var vijp = vjp + vjp;
4286            vls = vijp * vlstep;
4287            sin_cos = sincos(vls);
4288            unity_table[j + length * 2] = sin_cos[1];
4289            unity_table[j + length * 6] = sin_cos[0] * f32x4s(-1.0);
4290
4291            vijp = vijp + vjp;
4292            vls = vijp * vlstep;
4293            sin_cos = sincos(vls);
4294            unity_table[j + length * 3] = sin_cos[1];
4295            unity_table[j + length * 7] = sin_cos[0] * f32x4s(-1.0);
4296
4297            vjp += f32x4s(4.0);
4298        }
4299        vlstep *= f32x4s(4.0);
4300        unity_table = unity_table[8 * length ..];
4301
4302        if (length <= 4)
4303            break;
4304    }
4305}
4306
4307pub fn fft(re: []F32x4, im: []F32x4, unity_table: []const F32x4) void {
4308    const length = @as(u32, @intCast(re.len * 4));
4309    assert(std.math.isPowerOfTwo(length));
4310    assert(length >= 4 and length <= 512);
4311    assert(re.len == im.len);
4312
4313    var re_temp_storage: [128]F32x4 = undefined;
4314    var im_temp_storage: [128]F32x4 = undefined;
4315    const re_temp = re_temp_storage[0..re.len];
4316    const im_temp = im_temp_storage[0..im.len];
4317
4318    @memcpy(re_temp, re);
4319    @memcpy(im_temp, im);
4320
4321    if (length > 16) {
4322        assert(unity_table.len == length);
4323        fftN(re_temp, im_temp, unity_table, length, 1);
4324    } else if (length == 16) {
4325        fft16(re_temp, im_temp, 1);
4326    } else if (length == 8) {
4327        fft8(re_temp, im_temp, 1);
4328    } else if (length == 4) {
4329        fft4(re_temp, im_temp, 1);
4330    }
4331
4332    fftUnswizzle(re_temp, re);
4333    fftUnswizzle(im_temp, im);
4334}
4335
4336pub fn ifft(re: []F32x4, im: []const F32x4, unity_table: []const F32x4) void {
4337    const length = @as(u32, @intCast(re.len * 4));
4338    assert(std.math.isPowerOfTwo(length));
4339    assert(length >= 4 and length <= 512);
4340    assert(re.len == im.len);
4341
4342    var re_temp_storage: [128]F32x4 = undefined;
4343    var im_temp_storage: [128]F32x4 = undefined;
4344    var re_temp = re_temp_storage[0..re.len];
4345    var im_temp = im_temp_storage[0..im.len];
4346
4347    const rnp = f32x4s(1.0 / @as(f32, @floatFromInt(length)));
4348    const rnm = f32x4s(-1.0 / @as(f32, @floatFromInt(length)));
4349
4350    for (re, 0..) |_, i| {
4351        re_temp[i] = re[i] * rnp;
4352        im_temp[i] = im[i] * rnm;
4353    }
4354
4355    if (length > 16) {
4356        assert(unity_table.len == length);
4357        fftN(re_temp, im_temp, unity_table, length, 1);
4358    } else if (length == 16) {
4359        fft16(re_temp, im_temp, 1);
4360    } else if (length == 8) {
4361        fft8(re_temp, im_temp, 1);
4362    } else if (length == 4) {
4363        fft4(re_temp, im_temp, 1);
4364    }
4365
4366    fftUnswizzle(re_temp, re);
4367}
4368test "zmath.ifft" {
4369    var unity_table: [512]F32x4 = undefined;
4370    const epsilon = 0.0001;
4371
4372    // 64 samples
4373    {
4374        var re = [_]F32x4{
4375            f32x4(1.0, 2.0, 3.0, 4.0),     f32x4(5.0, 6.0, 7.0, 8.0),
4376            f32x4(9.0, 10.0, 11.0, 12.0),  f32x4(13.0, 14.0, 15.0, 16.0),
4377            f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0),
4378            f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0),
4379            f32x4(1.0, 2.0, 3.0, 4.0),     f32x4(5.0, 6.0, 7.0, 8.0),
4380            f32x4(9.0, 10.0, 11.0, 12.0),  f32x4(13.0, 14.0, 15.0, 16.0),
4381            f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0),
4382            f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0),
4383        };
4384        var im = [_]F32x4{
4385            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4386            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4387            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4388            f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0),
4389        };
4390
4391        fftInitUnityTable(unity_table[0..64]);
4392        fft(re[0..], im[0..], unity_table[0..64]);
4393
4394        try expectVecApproxEqAbs(re[0], f32x4(1056.0, 0.0, -32.0, 0.0), epsilon);
4395        var i: u32 = 1;
4396        while (i < 16) : (i += 1) {
4397            try expectVecApproxEqAbs(re[i], f32x4(-32.0, 0.0, -32.0, 0.0), epsilon);
4398        }
4399
4400        ifft(re[0..], im[0..], unity_table[0..64]);
4401
4402        try expectVecApproxEqAbs(re[0], f32x4(1.0, 2.0, 3.0, 4.0), epsilon);
4403        try expectVecApproxEqAbs(re[1], f32x4(5.0, 6.0, 7.0, 8.0), epsilon);
4404        try expectVecApproxEqAbs(re[2], f32x4(9.0, 10.0, 11.0, 12.0), epsilon);
4405        try expectVecApproxEqAbs(re[3], f32x4(13.0, 14.0, 15.0, 16.0), epsilon);
4406        try expectVecApproxEqAbs(re[4], f32x4(17.0, 18.0, 19.0, 20.0), epsilon);
4407        try expectVecApproxEqAbs(re[5], f32x4(21.0, 22.0, 23.0, 24.0), epsilon);
4408        try expectVecApproxEqAbs(re[6], f32x4(25.0, 26.0, 27.0, 28.0), epsilon);
4409        try expectVecApproxEqAbs(re[7], f32x4(29.0, 30.0, 31.0, 32.0), epsilon);
4410    }
4411
4412    // 512 samples
4413    {
4414        var re: [128]F32x4 = undefined;
4415        var im = [_]F32x4{f32x4s(0.0)} ** 128;
4416
4417        for (&re, 0..) |*v, i| {
4418            const f = @as(f32, @floatFromInt(i * 4));
4419            v.* = f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0);
4420        }
4421
4422        fftInitUnityTable(unity_table[0..512]);
4423        fft(re[0..], im[0..], unity_table[0..512]);
4424
4425        for (re, 0..) |v, i| {
4426            const f = @as(f32, @floatFromInt(i * 4));
4427            try expect(!approxEqAbs(v, f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0), epsilon));
4428        }
4429
4430        ifft(re[0..], im[0..], unity_table[0..512]);
4431
4432        for (re, 0..) |v, i| {
4433            const f = @as(f32, @floatFromInt(i * 4));
4434            try expectVecApproxEqAbs(v, f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0), epsilon);
4435        }
4436    }
4437}
4438// ------------------------------------------------------------------------------
4439//
4440// Private functions and constants
4441//
4442// ------------------------------------------------------------------------------
4443const f32x4_sign_mask1: F32x4 = F32x4{ @as(f32, @bitCast(@as(u32, 0x8000_0000))), 0, 0, 0 };
4444const f32x4_mask2: F32x4 = F32x4{
4445    @as(f32, @bitCast(@as(u32, 0xffff_ffff))),
4446    @as(f32, @bitCast(@as(u32, 0xffff_ffff))),
4447    0,
4448    0,
4449};
4450const f32x4_mask3: F32x4 = F32x4{
4451    @as(f32, @bitCast(@as(u32, 0xffff_ffff))),
4452    @as(f32, @bitCast(@as(u32, 0xffff_ffff))),
4453    @as(f32, @bitCast(@as(u32, 0xffff_ffff))),
4454    0,
4455};
4456
4457inline fn splatNegativeZero(comptime T: type) T {
4458    return @splat(@as(f32, @bitCast(@as(u32, 0x8000_0000))));
4459}
4460inline fn splatNoFraction(comptime T: type) T {
4461    return @splat(@as(f32, 8_388_608.0));
4462}
4463inline fn splatAbsMask(comptime T: type) T {
4464    return @splat(@as(f32, @bitCast(@as(u32, 0x7fff_ffff))));
4465}
4466
4467fn floatToIntAndBack(v: anytype) @TypeOf(v) {
4468    // This routine won't handle nan, inf and numbers greater than 8_388_608.0 (will generate undefined values).
4469    @setRuntimeSafety(false);
4470
4471    const T = @TypeOf(v);
4472    const len = veclen(T);
4473
4474    var vi32: [len]i32 = undefined;
4475    comptime var i: u32 = 0;
4476    // vcvttps2dq
4477    inline while (i < len) : (i += 1) {
4478        vi32[i] = @as(i32, @intFromFloat(v[i]));
4479    }
4480
4481    var vf32: [len]f32 = undefined;
4482    i = 0;
4483    // vcvtdq2ps
4484    inline while (i < len) : (i += 1) {
4485        vf32[i] = @as(f32, @floatFromInt(vi32[i]));
4486    }
4487
4488    return vf32;
4489}
4490test "zmath.floatToIntAndBack" {
4491    {
4492        const v = floatToIntAndBack(f32x4(1.1, 2.9, 3.0, -4.5));
4493        try expectVecEqual(v, f32x4(1.0, 2.0, 3.0, -4.0));
4494    }
4495    {
4496        const v = floatToIntAndBack(f32x8(1.1, 2.9, 3.0, -4.5, 2.5, -2.5, 1.1, -100.2));
4497        try expectVecEqual(v, f32x8(1.0, 2.0, 3.0, -4.0, 2.0, -2.0, 1.0, -100.0));
4498    }
4499    {
4500        const v = floatToIntAndBack(f32x4(math.inf(f32), 2.9, math.nan(f32), math.snan(f32)));
4501        try expect(v[1] == 2.0);
4502    }
4503}
4504
4505pub fn expectVecEqual(expected: anytype, actual: anytype) !void {
4506    const T = @TypeOf(expected, actual);
4507    inline for (0..veclen(T)) |i| {
4508        try std.testing.expectEqual(expected[i], actual[i]);
4509    }
4510}
4511
4512pub fn expectVecApproxEqAbs(expected: anytype, actual: anytype, eps: f32) !void {
4513    const T = @TypeOf(expected, actual);
4514    inline for (0..veclen(T)) |i| {
4515        try std.testing.expectApproxEqAbs(expected[i], actual[i], eps);
4516    }
4517}
4518
4519pub fn approxEqAbs(v0: anytype, v1: anytype, eps: f32) bool {
4520    const T = @TypeOf(v0, v1);
4521    comptime var i: comptime_int = 0;
4522    inline while (i < veclen(T)) : (i += 1) {
4523        if (!math.approxEqAbs(f32, v0[i], v1[i], eps)) {
4524            return false;
4525        }
4526    }
4527    return true;
4528}
4529
4530// ------------------------------------------------------------------------------
4531// This software is available under 2 licenses -- choose whichever you prefer.
4532// ------------------------------------------------------------------------------
4533// ALTERNATIVE A - MIT License
4534// Copyright (c) 2022 Michal Ziulek and Contributors
4535// Permission is hereby granted, free of charge, to any person obtaining a copy of
4536// this software and associated documentation files (the "Software"), to deal in
4537// the Software without restriction, including without limitation the rights to
4538// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
4539// of the Software, and to permit persons to whom the Software is furnished to do
4540// so, subject to the following conditions:
4541// The above copyright notice and this permission notice shall be included in all
4542// copies or substantial portions of the Software.
4543// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4544// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4545// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4546// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
4547// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
4548// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4549// SOFTWARE.
4550// ------------------------------------------------------------------------------
4551// ALTERNATIVE B - Public Domain (www.unlicense.org)
4552// This is free and unencumbered software released into the public domain.
4553// Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
4554// software, either in source code form or as a compiled binary, for any purpose,
4555// commercial or non-commercial, and by any means.
4556// In jurisdictions that recognize copyright laws, the author or authors of this
4557// software dedicate any and all copyright interest in the software to the public
4558// domain. We make this dedication for the benefit of the public at large and to
4559// the detriment of our heirs and successors. We intend this dedication to be an
4560// overt act of relinquishment in perpetuity of all present and future rights to
4561// this software under copyright law.
4562// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
4563// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
4564// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
4565// AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
4566// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
4567// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
4568// ------------------------------------------------------------------------------