this repo has no description
at main 174 kB view raw
1// ============================================================================== 2// 3// SIMD math library for game developers 4// https://github.com/michal-z/zig-gamedev/tree/main/libs/zmath 5// 6// Should work on all OSes supported by Zig. Works on x86_64 and ARM. 7// Provides ~140 optimized routines and ~70 extensive tests. 8// Can be used with any graphics API. 9// 10// zmath uses row-major matrices, row vectors (each row vector is stored in a SIMD register). 11// Handedness is determined by which function version is used (Rh vs. Lh), 12// otherwise the function works with either left-handed or right-handed view coordinates. 13// 14// const va = f32x4(1.0, 2.0, 3.0, 1.0); 15// const vb = f32x4(-1.0, 1.0, -1.0, 1.0); 16// const v0 = va + vb - f32x4(0.0, 1.0, 0.0, 1.0) * f32x4s(3.0); 17// const v1 = cross3(va, vb) + f32x4(1.0, 1.0, 1.0, 1.0); 18// const v2 = va + dot3(va, vb) / v1; // dotN() returns scalar replicated on all vector components 19// 20// const m = rotationX(math.pi * 0.25); 21// const v = f32x4(...); 22// const v0 = mul(v, m); // 'v' treated as a row vector 23// const v1 = mul(m, v); // 'v' treated as a column vector 24// const f = m[row][column]; 25// 26// const b = va < vb; 27// if (all(b, 0)) { ... } // '0' means check all vector components; if all are 'true' 28// if (all(b, 3)) { ... } // '3' means check first three vector components; if all first three are 'true' 29// if (any(b, 0)) { ... } // '0' means check all vector components; if any is 'true' 30// if (any(b, 3)) { ... } // '3' means check first three vector components; if any from first three is 'true' 31// 32// var v4 = load(mem[0..], F32x4, 0); 33// var v8 = load(mem[100..], F32x8, 0); 34// var v16 = load(mem[200..], F32x16, 0); 35// 36// var camera_position = [3]f32{ 1.0, 2.0, 3.0 }; 37// var cam_pos = loadArr3(camera_position); 38// ... 39// storeArr3(&camera_position, cam_pos); 40// 41// v4 = sin(v4); // SIMDx4 42// v8 = cos(v8); // .x86_64 -> 2 x SIMDx4, .x86_64+avx+fma -> SIMDx8 43// v16 = atan(v16); // .x86_64 -> 4 x SIMDx4, .x86_64+avx+fma -> 2 x SIMDx8, .x86_64+avx512f -> SIMDx16 44// 45// store(mem[0..], v4, 0); 46// store(mem[100..], v8, 0); 47// store(mem[200..], v16, 0); 48// 49// ------------------------------------------------------------------------------ 50// 1. Initialization functions 51// ------------------------------------------------------------------------------ 52// 53// f32x4(e0: f32, e1: f32, e2: f32, e3: f32) F32x4 54// f32x8(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32) F32x8 55// f32x16(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32, 56// e8: f32, e9: f32, ea: f32, eb: f32, ec: f32, ed: f32, ee: f32, ef: f32) F32x16 57// 58// f32x4s(e0: f32) F32x4 59// f32x8s(e0: f32) F32x8 60// f32x16s(e0: f32) F32x16 61// 62// boolx4(e0: bool, e1: bool, e2: bool, e3: bool) Boolx4 63// boolx8(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool) Boolx8 64// boolx16(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool, 65// e8: bool, e9: bool, ea: bool, eb: bool, ec: bool, ed: bool, ee: bool, ef: bool) Boolx16 66// 67// load(mem: []const f32, comptime T: type, comptime len: u32) T 68// store(mem: []f32, v: anytype, comptime len: u32) void 69// 70// loadArr2(arr: [2]f32) F32x4 71// loadArr2zw(arr: [2]f32, z: f32, w: f32) F32x4 72// loadArr3(arr: [3]f32) F32x4 73// loadArr3w(arr: [3]f32, w: f32) F32x4 74// loadArr4(arr: [4]f32) F32x4 75// 76// storeArr2(arr: *[2]f32, v: F32x4) void 77// storeArr3(arr: *[3]f32, v: F32x4) void 78// storeArr4(arr: *[4]f32, v: F32x4) void 79// 80// arr3Ptr(ptr: anytype) *const [3]f32 81// arrNPtr(ptr: anytype) [*]const f32 82// 83// splat(comptime T: type, value: f32) T 84// splatInt(comptime T: type, value: u32) T 85// 86// ------------------------------------------------------------------------------ 87// 2. Functions that work on all vector components (F32xN = F32x4 or F32x8 or F32x16) 88// ------------------------------------------------------------------------------ 89// 90// all(vb: anytype, comptime len: u32) bool 91// any(vb: anytype, comptime len: u32) bool 92// 93// isNearEqual(v0: F32xN, v1: F32xN, epsilon: F32xN) BoolxN 94// isNan(v: F32xN) BoolxN 95// isInf(v: F32xN) BoolxN 96// isInBounds(v: F32xN, bounds: F32xN) BoolxN 97// 98// andInt(v0: F32xN, v1: F32xN) F32xN 99// andNotInt(v0: F32xN, v1: F32xN) F32xN 100// orInt(v0: F32xN, v1: F32xN) F32xN 101// norInt(v0: F32xN, v1: F32xN) F32xN 102// xorInt(v0: F32xN, v1: F32xN) F32xN 103// 104// minFast(v0: F32xN, v1: F32xN) F32xN 105// maxFast(v0: F32xN, v1: F32xN) F32xN 106// min(v0: F32xN, v1: F32xN) F32xN 107// max(v0: F32xN, v1: F32xN) F32xN 108// round(v: F32xN) F32xN 109// floor(v: F32xN) F32xN 110// trunc(v: F32xN) F32xN 111// ceil(v: F32xN) F32xN 112// clamp(v0: F32xN, v1: F32xN) F32xN 113// clampFast(v0: F32xN, v1: F32xN) F32xN 114// saturate(v: F32xN) F32xN 115// saturateFast(v: F32xN) F32xN 116// lerp(v0: F32xN, v1: F32xN, t: f32) F32xN 117// lerpV(v0: F32xN, v1: F32xN, t: F32xN) F32xN 118// lerpInverse(v0: F32xN, v1: F32xN, t: f32) F32xN 119// lerpInverseV(v0: F32xN, v1: F32xN, t: F32xN) F32xN 120// mapLinear(v: F32xN, min1: f32, max1: f32, min2: f32, max2: f32) F32xN 121// mapLinearV(v: F32xN, min1: F32xN, max1: F32xN, min2: F32xN, max2: F32xN) F32xN 122// sqrt(v: F32xN) F32xN 123// abs(v: F32xN) F32xN 124// mod(v0: F32xN, v1: F32xN) F32xN 125// modAngle(v: F32xN) F32xN 126// mulAdd(v0: F32xN, v1: F32xN, v2: F32xN) F32xN 127// select(mask: BoolxN, v0: F32xN, v1: F32xN) 128// sin(v: F32xN) F32xN 129// cos(v: F32xN) F32xN 130// sincos(v: F32xN) [2]F32xN 131// asin(v: F32xN) F32xN 132// acos(v: F32xN) F32xN 133// atan(v: F32xN) F32xN 134// atan2(vy: F32xN, vx: F32xN) F32xN 135// cmulSoa(re0: F32xN, im0: F32xN, re1: F32xN, im1: F32xN) [2]F32xN 136// 137// ------------------------------------------------------------------------------ 138// 3. 2D, 3D, 4D vector functions 139// ------------------------------------------------------------------------------ 140// 141// swizzle(v: Vec, c, c, c, c) Vec (comptime c = .x | .y | .z | .w) 142// dot2(v0: Vec, v1: Vec) F32x4 143// dot3(v0: Vec, v1: Vec) F32x4 144// dot4(v0: Vec, v1: Vec) F32x4 145// cross3(v0: Vec, v1: Vec) Vec 146// lengthSq2(v: Vec) F32x4 147// lengthSq3(v: Vec) F32x4 148// lengthSq4(v: Vec) F32x4 149// length2(v: Vec) F32x4 150// length3(v: Vec) F32x4 151// length4(v: Vec) F32x4 152// normalize2(v: Vec) Vec 153// normalize3(v: Vec) Vec 154// normalize4(v: Vec) Vec 155// 156// vecToArr2(v: Vec) [2]f32 157// vecToArr3(v: Vec) [3]f32 158// vecToArr4(v: Vec) [4]f32 159// 160// ------------------------------------------------------------------------------ 161// 4. Matrix functions 162// ------------------------------------------------------------------------------ 163// 164// identity() Mat 165// mul(m0: Mat, m1: Mat) Mat 166// mul(s: f32, m: Mat) Mat 167// mul(m: Mat, s: f32) Mat 168// mul(v: Vec, m: Mat) Vec 169// mul(m: Mat, v: Vec) Vec 170// transpose(m: Mat) Mat 171// rotationX(angle: f32) Mat 172// rotationY(angle: f32) Mat 173// rotationZ(angle: f32) Mat 174// translation(x: f32, y: f32, z: f32) Mat 175// translationV(v: Vec) Mat 176// scaling(x: f32, y: f32, z: f32) Mat 177// scalingV(v: Vec) Mat 178// lookToLh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat 179// lookAtLh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat 180// lookToRh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat 181// lookAtRh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat 182// perspectiveFovLh(fovy: f32, aspect: f32, near: f32, far: f32) Mat 183// perspectiveFovRh(fovy: f32, aspect: f32, near: f32, far: f32) Mat 184// perspectiveFovLhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat 185// perspectiveFovRhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat 186// orthographicLh(w: f32, h: f32, near: f32, far: f32) Mat 187// orthographicRh(w: f32, h: f32, near: f32, far: f32) Mat 188// orthographicLhGl(w: f32, h: f32, near: f32, far: f32) Mat 189// orthographicRhGl(w: f32, h: f32, near: f32, far: f32) Mat 190// orthographicOffCenterLh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat 191// orthographicOffCenterRh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat 192// orthographicOffCenterLhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat 193// orthographicOffCenterRhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat 194// determinant(m: Mat) F32x4 195// inverse(m: Mat) Mat 196// inverseDet(m: Mat, det: ?*F32x4) Mat 197// matToQuat(m: Mat) Quat 198// matFromAxisAngle(axis: Vec, angle: f32) Mat 199// matFromNormAxisAngle(axis: Vec, angle: f32) Mat 200// matFromQuat(quat: Quat) Mat 201// matFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Mat 202// matFromRollPitchYawV(angles: Vec) Mat 203// matFromArr(arr: [16]f32) Mat 204// 205// loadMat(mem: []const f32) Mat 206// loadMat43(mem: []const f32) Mat 207// loadMat34(mem: []const f32) Mat 208// storeMat(mem: []f32, m: Mat) void 209// storeMat43(mem: []f32, m: Mat) void 210// storeMat34(mem: []f32, m: Mat) void 211// 212// matToArr(m: Mat) [16]f32 213// matToArr43(m: Mat) [12]f32 214// matToArr34(m: Mat) [12]f32 215// 216// ------------------------------------------------------------------------------ 217// 5. Quaternion functions 218// ------------------------------------------------------------------------------ 219// 220// qmul(q0: Quat, q1: Quat) Quat 221// qidentity() Quat 222// conjugate(quat: Quat) Quat 223// inverse(q: Quat) Quat 224// rotate(q: Quat, v: Vec) Vec 225// slerp(q0: Quat, q1: Quat, t: f32) Quat 226// slerpV(q0: Quat, q1: Quat, t: F32x4) Quat 227// quatToMat(quat: Quat) Mat 228// quatToAxisAngle(quat: Quat, axis: *Vec, angle: *f32) void 229// quatFromMat(m: Mat) Quat 230// quatFromAxisAngle(axis: Vec, angle: f32) Quat 231// quatFromNormAxisAngle(axis: Vec, angle: f32) Quat 232// quatFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Quat 233// quatFromRollPitchYawV(angles: Vec) Quat 234// 235// ------------------------------------------------------------------------------ 236// 6. Color functions 237// ------------------------------------------------------------------------------ 238// 239// adjustSaturation(color: F32x4, saturation: f32) F32x4 240// adjustContrast(color: F32x4, contrast: f32) F32x4 241// rgbToHsl(rgb: F32x4) F32x4 242// hslToRgb(hsl: F32x4) F32x4 243// rgbToHsv(rgb: F32x4) F32x4 244// hsvToRgb(hsv: F32x4) F32x4 245// rgbToSrgb(rgb: F32x4) F32x4 246// srgbToRgb(srgb: F32x4) F32x4 247// 248// ------------------------------------------------------------------------------ 249// X. Misc functions 250// ------------------------------------------------------------------------------ 251// 252// linePointDistance(linept0: Vec, linept1: Vec, pt: Vec) F32x4 253// sin(v: f32) f32 254// cos(v: f32) f32 255// sincos(v: f32) [2]f32 256// asin(v: f32) f32 257// acos(v: f32) f32 258// 259// fftInitUnityTable(unitytable: []F32x4) void 260// fft(re: []F32x4, im: []F32x4, unitytable: []const F32x4) void 261// ifft(re: []F32x4, im: []const F32x4, unitytable: []const F32x4) void 262// 263// ============================================================================== 264 265// Fundamental types 266pub const F32x4 = @Vector(4, f32); 267pub const F32x8 = @Vector(8, f32); 268pub const F32x16 = @Vector(16, f32); 269pub const Boolx4 = @Vector(4, bool); 270pub const Boolx8 = @Vector(8, bool); 271pub const Boolx16 = @Vector(16, bool); 272 273// "Higher-level" aliases 274pub const Vec = F32x4; 275pub const Mat = [4]F32x4; 276pub const Quat = F32x4; 277 278const builtin = @import("builtin"); 279const std = @import("std"); 280const math = std.math; 281const assert = std.debug.assert; 282const expect = std.testing.expect; 283 284const cpu_arch = builtin.cpu.arch; 285const has_avx = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .avx) else false; 286const has_avx512f = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f) else false; 287const has_fma = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .fma) else false; 288// ------------------------------------------------------------------------------ 289// 290// 1. Initialization functions 291// 292// ------------------------------------------------------------------------------ 293pub inline fn f32x4(e0: f32, e1: f32, e2: f32, e3: f32) F32x4 { 294 return .{ e0, e1, e2, e3 }; 295} 296pub inline fn f32x8(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32) F32x8 { 297 return .{ e0, e1, e2, e3, e4, e5, e6, e7 }; 298} 299// zig fmt: off 300pub inline fn f32x16( 301 e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32, 302 e8: f32, e9: f32, ea: f32, eb: f32, ec: f32, ed: f32, ee: f32, ef: f32) F32x16 { 303 return .{ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, ea, eb, ec, ed, ee, ef }; 304} 305// zig fmt: on 306 307pub inline fn f32x4s(e0: f32) F32x4 { 308 return splat(F32x4, e0); 309} 310pub inline fn f32x8s(e0: f32) F32x8 { 311 return splat(F32x8, e0); 312} 313pub inline fn f32x16s(e0: f32) F32x16 { 314 return splat(F32x16, e0); 315} 316 317pub inline fn boolx4(e0: bool, e1: bool, e2: bool, e3: bool) Boolx4 { 318 return .{ e0, e1, e2, e3 }; 319} 320pub inline fn boolx8(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool) Boolx8 { 321 return .{ e0, e1, e2, e3, e4, e5, e6, e7 }; 322} 323// zig fmt: off 324pub inline fn boolx16( 325 e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool, 326 e8: bool, e9: bool, ea: bool, eb: bool, ec: bool, ed: bool, ee: bool, ef: bool) Boolx16 { 327 return .{ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, ea, eb, ec, ed, ee, ef }; 328} 329// zig fmt: on 330 331pub inline fn veclen(comptime T: type) comptime_int { 332 return @typeInfo(T).vector.len; 333} 334 335pub inline fn splat(comptime T: type, value: f32) T { 336 return @splat(value); 337} 338pub inline fn splatInt(comptime T: type, value: u32) T { 339 return @splat(@bitCast(value)); 340} 341 342pub fn load(mem: []const f32, comptime T: type, comptime len: u32) T { 343 var v = splat(T, 0.0); 344 const loop_len = if (len == 0) veclen(T) else len; 345 comptime var i: u32 = 0; 346 inline while (i < loop_len) : (i += 1) { 347 v[i] = mem[i]; 348 } 349 return v; 350} 351test "zmath.load" { 352 const a = [7]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 }; 353 var ptr = &a; 354 var i: u32 = 0; 355 const v0 = load(a[i..], F32x4, 2); 356 try expectVecEqual(v0, F32x4{ 1.0, 2.0, 0.0, 0.0 }); 357 i += 2; 358 const v1 = load(a[i .. i + 2], F32x4, 2); 359 try expectVecEqual(v1, F32x4{ 3.0, 4.0, 0.0, 0.0 }); 360 const v2 = load(a[5..7], F32x4, 2); 361 try expectVecEqual(v2, F32x4{ 6.0, 7.0, 0.0, 0.0 }); 362 const v3 = load(ptr[1..], F32x4, 2); 363 try expectVecEqual(v3, F32x4{ 2.0, 3.0, 0.0, 0.0 }); 364 i += 1; 365 const v4 = load(ptr[i .. i + 2], F32x4, 2); 366 try expectVecEqual(v4, F32x4{ 4.0, 5.0, 0.0, 0.0 }); 367} 368 369pub fn store(mem: []f32, v: anytype, comptime len: u32) void { 370 const T = @TypeOf(v); 371 const loop_len = if (len == 0) veclen(T) else len; 372 comptime var i: u32 = 0; 373 inline while (i < loop_len) : (i += 1) { 374 mem[i] = v[i]; 375 } 376} 377test "zmath.store" { 378 var a = [7]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 }; 379 const v = load(a[1..], F32x4, 3); 380 store(a[2..], v, 4); 381 try expect(a[0] == 1.0); 382 try expect(a[1] == 2.0); 383 try expect(a[2] == 2.0); 384 try expect(a[3] == 3.0); 385 try expect(a[4] == 4.0); 386 try expect(a[5] == 0.0); 387} 388 389pub inline fn loadArr2(arr: [2]f32) F32x4 { 390 return f32x4(arr[0], arr[1], 0.0, 0.0); 391} 392pub inline fn loadArr2zw(arr: [2]f32, z: f32, w: f32) F32x4 { 393 return f32x4(arr[0], arr[1], z, w); 394} 395pub inline fn loadArr3(arr: [3]f32) F32x4 { 396 return f32x4(arr[0], arr[1], arr[2], 0.0); 397} 398pub inline fn loadArr3w(arr: [3]f32, w: f32) F32x4 { 399 return f32x4(arr[0], arr[1], arr[2], w); 400} 401pub inline fn loadArr4(arr: [4]f32) F32x4 { 402 return f32x4(arr[0], arr[1], arr[2], arr[3]); 403} 404 405pub inline fn storeArr2(arr: *[2]f32, v: F32x4) void { 406 arr.* = .{ v[0], v[1] }; 407} 408pub inline fn storeArr3(arr: *[3]f32, v: F32x4) void { 409 arr.* = .{ v[0], v[1], v[2] }; 410} 411pub inline fn storeArr4(arr: *[4]f32, v: F32x4) void { 412 arr.* = .{ v[0], v[1], v[2], v[3] }; 413} 414 415pub inline fn arr3Ptr(ptr: anytype) *const [3]f32 { 416 comptime assert(@typeInfo(@TypeOf(ptr)) == .pointer); 417 const T = std.meta.Child(@TypeOf(ptr)); 418 comptime assert(T == F32x4); 419 return @as(*const [3]f32, @ptrCast(ptr)); 420} 421 422pub inline fn arrNPtr(ptr: anytype) [*]const f32 { 423 comptime assert(@typeInfo(@TypeOf(ptr)) == .pointer); 424 const T = std.meta.Child(@TypeOf(ptr)); 425 comptime assert(T == Mat or T == F32x4 or T == F32x8 or T == F32x16); 426 return @as([*]const f32, @ptrCast(ptr)); 427} 428test "zmath.arrNPtr" { 429 { 430 const mat = identity(); 431 const f32ptr = arrNPtr(&mat); 432 try expect(f32ptr[0] == 1.0); 433 try expect(f32ptr[5] == 1.0); 434 try expect(f32ptr[10] == 1.0); 435 try expect(f32ptr[15] == 1.0); 436 } 437 { 438 const v8 = f32x8s(1.0); 439 const f32ptr = arrNPtr(&v8); 440 try expect(f32ptr[1] == 1.0); 441 try expect(f32ptr[7] == 1.0); 442 } 443} 444 445test "zmath.loadArr" { 446 { 447 const camera_position = [3]f32{ 1.0, 2.0, 3.0 }; 448 const simd_reg = loadArr3(camera_position); 449 try expectVecEqual(simd_reg, f32x4(1.0, 2.0, 3.0, 0.0)); 450 } 451 { 452 const camera_position = [3]f32{ 1.0, 2.0, 3.0 }; 453 const simd_reg = loadArr3w(camera_position, 1.0); 454 try expectVecEqual(simd_reg, f32x4(1.0, 2.0, 3.0, 1.0)); 455 } 456} 457 458pub inline fn vecToArr2(v: Vec) [2]f32 { 459 return .{ v[0], v[1] }; 460} 461pub inline fn vecToArr3(v: Vec) [3]f32 { 462 return .{ v[0], v[1], v[2] }; 463} 464pub inline fn vecToArr4(v: Vec) [4]f32 { 465 return .{ v[0], v[1], v[2], v[3] }; 466} 467// ------------------------------------------------------------------------------ 468// 469// 2. Functions that work on all vector components (F32xN = F32x4 or F32x8 or F32x16) 470// 471// ------------------------------------------------------------------------------ 472pub fn all(vb: anytype, comptime len: u32) bool { 473 const T = @TypeOf(vb); 474 if (len > veclen(T)) { 475 @compileError("zmath.all(): 'len' is greater than vector len of type " ++ @typeName(T)); 476 } 477 const loop_len = if (len == 0) veclen(T) else len; 478 const ab: [veclen(T)]bool = vb; 479 comptime var i: u32 = 0; 480 var result = true; 481 inline while (i < loop_len) : (i += 1) { 482 result = result and ab[i]; 483 } 484 return result; 485} 486test "zmath.all" { 487 try expect(all(boolx8(true, true, true, true, true, false, true, false), 5) == true); 488 try expect(all(boolx8(true, true, true, true, true, false, true, false), 6) == false); 489 try expect(all(boolx8(true, true, true, true, false, false, false, false), 4) == true); 490 try expect(all(boolx4(true, true, true, false), 3) == true); 491 try expect(all(boolx4(true, true, true, false), 1) == true); 492 try expect(all(boolx4(true, false, false, false), 1) == true); 493 try expect(all(boolx4(false, true, false, false), 1) == false); 494 try expect(all(boolx8(true, true, true, true, true, false, true, false), 0) == false); 495 try expect(all(boolx4(false, true, false, false), 0) == false); 496 try expect(all(boolx4(true, true, true, true), 0) == true); 497} 498 499pub fn any(vb: anytype, comptime len: u32) bool { 500 const T = @TypeOf(vb); 501 if (len > veclen(T)) { 502 @compileError("zmath.any(): 'len' is greater than vector len of type " ++ @typeName(T)); 503 } 504 const loop_len = if (len == 0) veclen(T) else len; 505 const ab: [veclen(T)]bool = vb; 506 comptime var i: u32 = 0; 507 var result = false; 508 inline while (i < loop_len) : (i += 1) { 509 result = result or ab[i]; 510 } 511 return result; 512} 513test "zmath.any" { 514 try expect(any(boolx8(true, true, true, true, true, false, true, false), 0) == true); 515 try expect(any(boolx8(false, false, false, true, true, false, true, false), 3) == false); 516 try expect(any(boolx8(false, false, false, false, false, true, false, false), 4) == false); 517} 518 519pub inline fn isNearEqual( 520 v0: anytype, 521 v1: anytype, 522 epsilon: anytype, 523) @Vector(veclen(@TypeOf(v0)), bool) { 524 const T = @TypeOf(v0, v1, epsilon); 525 const delta = v0 - v1; 526 const temp = maxFast(delta, splat(T, 0.0) - delta); 527 return temp <= epsilon; 528} 529test "zmath.isNearEqual" { 530 { 531 const v0 = f32x4(1.0, 2.0, -3.0, 4.001); 532 const v1 = f32x4(1.0, 2.1, 3.0, 4.0); 533 const b = isNearEqual(v0, v1, splat(F32x4, 0.01)); 534 try expect(@reduce(.And, b == boolx4(true, false, false, true))); 535 } 536 { 537 const v0 = f32x8(1.0, 2.0, -3.0, 4.001, 1.001, 2.3, -0.0, 0.0); 538 const v1 = f32x8(1.0, 2.1, 3.0, 4.0, -1.001, 2.1, 0.0, 0.0); 539 const b = isNearEqual(v0, v1, splat(F32x8, 0.01)); 540 try expect(@reduce(.And, b == boolx8(true, false, false, true, false, false, true, true))); 541 } 542 try expect(all(isNearEqual( 543 splat(F32x4, math.inf(f32)), 544 splat(F32x4, math.inf(f32)), 545 splat(F32x4, 0.0001), 546 ), 0) == false); 547 try expect(all(isNearEqual( 548 splat(F32x4, -math.inf(f32)), 549 splat(F32x4, math.inf(f32)), 550 splat(F32x4, 0.0001), 551 ), 0) == false); 552 try expect(all(isNearEqual( 553 splat(F32x4, -math.inf(f32)), 554 splat(F32x4, -math.inf(f32)), 555 splat(F32x4, 0.0001), 556 ), 0) == false); 557 try expect(all(isNearEqual( 558 splat(F32x4, -math.nan(f32)), 559 splat(F32x4, math.inf(f32)), 560 splat(F32x4, 0.0001), 561 ), 0) == false); 562} 563 564pub inline fn isNan( 565 v: anytype, 566) @Vector(veclen(@TypeOf(v)), bool) { 567 return v != v; 568} 569test "zmath.isNan" { 570 { 571 const v0 = f32x4(math.inf(f32), math.nan(f32), math.nan(f32), 7.0); 572 const b = isNan(v0); 573 try expect(@reduce(.And, b == boolx4(false, true, true, false))); 574 } 575 { 576 const v0 = f32x8(0, math.nan(f32), 0, 0, math.inf(f32), math.nan(f32), math.snan(f32), 7.0); 577 const b = isNan(v0); 578 try expect(@reduce(.And, b == boolx8(false, true, false, false, false, true, true, false))); 579 } 580} 581 582pub inline fn isInf( 583 v: anytype, 584) @Vector(veclen(@TypeOf(v)), bool) { 585 const T = @TypeOf(v); 586 return abs(v) == splat(T, math.inf(f32)); 587} 588test "zmath.isInf" { 589 { 590 const v0 = f32x4(math.inf(f32), math.nan(f32), math.snan(f32), 7.0); 591 const b = isInf(v0); 592 try expect(@reduce(.And, b == boolx4(true, false, false, false))); 593 } 594 { 595 const v0 = f32x8(0, math.inf(f32), 0, 0, math.inf(f32), math.nan(f32), math.snan(f32), 7.0); 596 const b = isInf(v0); 597 try expect(@reduce(.And, b == boolx8(false, true, false, false, true, false, false, false))); 598 } 599} 600 601pub inline fn isInBounds( 602 v: anytype, 603 bounds: anytype, 604) @Vector(veclen(@TypeOf(v)), bool) { 605 const T = @TypeOf(v, bounds); 606 const Tu = @Vector(veclen(T), u1); 607 const Tr = @Vector(veclen(T), bool); 608 609 // 2 x cmpleps, xorps, load, andps 610 const b0 = v <= bounds; 611 const b1 = (bounds * splat(T, -1.0)) <= v; 612 const b0u = @as(Tu, @bitCast(b0)); 613 const b1u = @as(Tu, @bitCast(b1)); 614 return @as(Tr, @bitCast(b0u & b1u)); 615} 616test "zmath.isInBounds" { 617 { 618 const v0 = f32x4(0.5, -2.0, -1.0, 1.9); 619 const v1 = f32x4(-1.6, -2.001, -1.0, 1.9); 620 const bounds = f32x4(1.0, 2.0, 1.0, 2.0); 621 const b0 = isInBounds(v0, bounds); 622 const b1 = isInBounds(v1, bounds); 623 try expect(@reduce(.And, b0 == boolx4(true, true, true, true))); 624 try expect(@reduce(.And, b1 == boolx4(false, false, true, true))); 625 } 626 { 627 const v0 = f32x8(2.0, 1.0, 2.0, 1.0, 0.5, -2.0, -1.0, 1.9); 628 const bounds = f32x8(1.0, 1.0, 1.0, math.inf(f32), 1.0, math.nan(f32), 1.0, 2.0); 629 const b0 = isInBounds(v0, bounds); 630 try expect(@reduce(.And, b0 == boolx8(false, true, false, true, true, false, true, true))); 631 } 632} 633 634pub inline fn andInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 635 const T = @TypeOf(v0, v1); 636 const Tu = @Vector(veclen(T), u32); 637 const v0u = @as(Tu, @bitCast(v0)); 638 const v1u = @as(Tu, @bitCast(v1)); 639 return @as(T, @bitCast(v0u & v1u)); // andps 640} 641test "zmath.andInt" { 642 { 643 const v0 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); 644 const v1 = f32x4(1.0, 2.0, 3.0, math.inf(f32)); 645 const v = andInt(v0, v1); 646 try expect(v[3] == math.inf(f32)); 647 try expectVecEqual(v, f32x4(0.0, 2.0, 0.0, math.inf(f32))); 648 } 649 { 650 const v0 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); 651 const v1 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, math.inf(f32)); 652 const v = andInt(v0, v1); 653 try expect(v[7] == math.inf(f32)); 654 try expectVecEqual(v, f32x8(0, 0, 0, 0, 0.0, 2.0, 0.0, math.inf(f32))); 655 } 656} 657 658pub inline fn andNotInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 659 const T = @TypeOf(v0, v1); 660 const Tu = @Vector(veclen(T), u32); 661 const v0u = @as(Tu, @bitCast(v0)); 662 const v1u = @as(Tu, @bitCast(v1)); 663 return @as(T, @bitCast(~v0u & v1u)); // andnps 664} 665test "zmath.andNotInt" { 666 { 667 const v0 = f32x4(1.0, 2.0, 3.0, 4.0); 668 const v1 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); 669 const v = andNotInt(v1, v0); 670 try expectVecEqual(v, f32x4(1.0, 0.0, 3.0, 0.0)); 671 } 672 { 673 const v0 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0); 674 const v1 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); 675 const v = andNotInt(v1, v0); 676 try expectVecEqual(v, f32x8(0, 0, 0, 0, 1.0, 0.0, 3.0, 0.0)); 677 } 678} 679 680pub inline fn orInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 681 const T = @TypeOf(v0, v1); 682 const Tu = @Vector(veclen(T), u32); 683 const v0u = @as(Tu, @bitCast(v0)); 684 const v1u = @as(Tu, @bitCast(v1)); 685 return @as(T, @bitCast(v0u | v1u)); // orps 686} 687test "zmath.orInt" { 688 { 689 const v0 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); 690 const v1 = f32x4(1.0, 2.0, 3.0, 4.0); 691 const v = orInt(v0, v1); 692 try expect(v[0] == 1.0); 693 try expect(@as(u32, @bitCast(v[1])) == ~@as(u32, 0)); 694 try expect(v[2] == 3.0); 695 try expect(v[3] == 4.0); 696 } 697 { 698 const v0 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); 699 const v1 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0); 700 const v = orInt(v0, v1); 701 try expect(v[4] == 1.0); 702 try expect(@as(u32, @bitCast(v[5])) == ~@as(u32, 0)); 703 try expect(v[6] == 3.0); 704 try expect(v[7] == 4.0); 705 } 706} 707 708pub inline fn norInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 709 const T = @TypeOf(v0, v1); 710 const Tu = @Vector(veclen(T), u32); 711 const v0u = @as(Tu, @bitCast(v0)); 712 const v1u = @as(Tu, @bitCast(v1)); 713 return @as(T, @bitCast(~(v0u | v1u))); // por, pcmpeqd, pxor 714} 715 716pub inline fn xorInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 717 const T = @TypeOf(v0, v1); 718 const Tu = @Vector(veclen(T), u32); 719 const v0u = @as(Tu, @bitCast(v0)); 720 const v1u = @as(Tu, @bitCast(v1)); 721 return @as(T, @bitCast(v0u ^ v1u)); // xorps 722} 723test "zmath.xorInt" { 724 { 725 const v0 = f32x4(1.0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); 726 const v1 = f32x4(1.0, 0, 0, 0); 727 const v = xorInt(v0, v1); 728 try expect(v[0] == 0.0); 729 try expect(@as(u32, @bitCast(v[1])) == ~@as(u32, 0)); 730 try expect(v[2] == 0.0); 731 try expect(v[3] == 0.0); 732 } 733 { 734 const v0 = f32x8(0, 0, 0, 0, 1.0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); 735 const v1 = f32x8(0, 0, 0, 0, 1.0, 0, 0, 0); 736 const v = xorInt(v0, v1); 737 try expect(v[4] == 0.0); 738 try expect(@as(u32, @bitCast(v[5])) == ~@as(u32, 0)); 739 try expect(v[6] == 0.0); 740 try expect(v[7] == 0.0); 741 } 742} 743 744pub inline fn minFast(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 745 return select(v0 < v1, v0, v1); // minps 746} 747test "zmath.minFast" { 748 { 749 const v0 = f32x4(1.0, 3.0, 2.0, 7.0); 750 const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 751 const v = minFast(v0, v1); 752 try expectVecEqual(v, f32x4(1.0, 1.0, 2.0, 7.0)); 753 } 754 { 755 const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); 756 const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 757 const v = minFast(v0, v1); 758 try expect(v[0] == 1.0); 759 try expect(v[1] == 1.0); 760 try expect(!math.isNan(v[1])); 761 try expect(v[2] == 4.0); 762 try expect(v[3] == math.inf(f32)); 763 try expect(!math.isNan(v[3])); 764 } 765} 766 767pub inline fn maxFast(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 768 return select(v0 > v1, v0, v1); // maxps 769} 770test "zmath.maxFast" { 771 { 772 const v0 = f32x4(1.0, 3.0, 2.0, 7.0); 773 const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 774 const v = maxFast(v0, v1); 775 try expectVecEqual(v, f32x4(2.0, 3.0, 4.0, math.inf(f32))); 776 } 777 { 778 const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); 779 const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 780 const v = maxFast(v0, v1); 781 try expect(v[0] == 2.0); 782 try expect(v[1] == 1.0); 783 try expect(v[2] == 5.0); 784 try expect(v[3] == math.inf(f32)); 785 try expect(!math.isNan(v[3])); 786 } 787} 788 789pub inline fn min(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 790 // This will handle inf & nan 791 return @min(v0, v1); // minps, cmpunordps, andps, andnps, orps 792} 793test "zmath.min" { 794 // Calling math.inf causes test to fail! 795 if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; 796 { 797 const v0 = f32x4(1.0, 3.0, 2.0, 7.0); 798 const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 799 const v = min(v0, v1); 800 try expectVecEqual(v, f32x4(1.0, 1.0, 2.0, 7.0)); 801 } 802 { 803 const v0 = f32x8(0, 0, -2.0, 0, 1.0, 3.0, 2.0, 7.0); 804 const v1 = f32x8(0, 1.0, 0, 0, 2.0, 1.0, 4.0, math.inf(f32)); 805 const v = min(v0, v1); 806 try expectVecEqual(v, f32x8(0.0, 0.0, -2.0, 0.0, 1.0, 1.0, 2.0, 7.0)); 807 } 808 { 809 const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); 810 const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 811 const v = min(v0, v1); 812 try expect(v[0] == 1.0); 813 try expect(v[1] == 1.0); 814 try expect(!math.isNan(v[1])); 815 try expect(v[2] == 4.0); 816 try expect(v[3] == math.inf(f32)); 817 try expect(!math.isNan(v[3])); 818 } 819 820 { 821 const v0 = f32x4(-math.inf(f32), math.inf(f32), math.inf(f32), math.snan(f32)); 822 const v1 = f32x4(math.snan(f32), -math.inf(f32), math.snan(f32), math.nan(f32)); 823 const v = min(v0, v1); 824 try expect(v[0] == -math.inf(f32)); 825 try expect(v[1] == -math.inf(f32)); 826 try expect(v[2] == math.inf(f32)); 827 try expect(!math.isNan(v[2])); 828 try expect(math.isNan(v[3])); 829 try expect(!math.isInf(v[3])); 830 } 831} 832 833pub inline fn max(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 834 // This will handle inf & nan 835 return @max(v0, v1); // maxps, cmpunordps, andps, andnps, orps 836} 837test "zmath.max" { 838 // Calling math.inf causes test to fail! 839 if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; 840 { 841 const v0 = f32x4(1.0, 3.0, 2.0, 7.0); 842 const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 843 const v = max(v0, v1); 844 try expectVecEqual(v, f32x4(2.0, 3.0, 4.0, math.inf(f32))); 845 } 846 { 847 const v0 = f32x8(0, 0, -2.0, 0, 1.0, 3.0, 2.0, 7.0); 848 const v1 = f32x8(0, 1.0, 0, 0, 2.0, 1.0, 4.0, math.inf(f32)); 849 const v = max(v0, v1); 850 try expectVecEqual(v, f32x8(0.0, 1.0, 0.0, 0.0, 2.0, 3.0, 4.0, math.inf(f32))); 851 } 852 { 853 const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); 854 const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 855 const v = max(v0, v1); 856 try expect(v[0] == 2.0); 857 try expect(v[1] == 1.0); 858 try expect(v[2] == 5.0); 859 try expect(v[3] == math.inf(f32)); 860 try expect(!math.isNan(v[3])); 861 } 862 { 863 const v0 = f32x4(-math.inf(f32), math.inf(f32), math.inf(f32), math.snan(f32)); 864 const v1 = f32x4(math.snan(f32), -math.inf(f32), math.snan(f32), math.nan(f32)); 865 const v = max(v0, v1); 866 try expect(v[0] == -math.inf(f32)); 867 try expect(v[1] == math.inf(f32)); 868 try expect(v[2] == math.inf(f32)); 869 try expect(!math.isNan(v[2])); 870 try expect(math.isNan(v[3])); 871 try expect(!math.isInf(v[3])); 872 } 873} 874 875pub fn round(v: anytype) @TypeOf(v) { 876 const T = @TypeOf(v); 877 if (cpu_arch == .x86_64 and has_avx) { 878 if (T == F32x4) { 879 return asm ("vroundps $0, %%xmm0, %%xmm0" 880 : [ret] "={xmm0}" (-> T), 881 : [v] "{xmm0}" (v), 882 ); 883 } else if (T == F32x8) { 884 return asm ("vroundps $0, %%ymm0, %%ymm0" 885 : [ret] "={ymm0}" (-> T), 886 : [v] "{ymm0}" (v), 887 ); 888 } else if (T == F32x16 and has_avx512f) { 889 return asm ("vrndscaleps $0, %%zmm0, %%zmm0" 890 : [ret] "={zmm0}" (-> T), 891 : [v] "{zmm0}" (v), 892 ); 893 } else if (T == F32x16 and !has_avx512f) { 894 const arr: [16]f32 = v; 895 var ymm0 = @as(F32x8, arr[0..8].*); 896 var ymm1 = @as(F32x8, arr[8..16].*); 897 ymm0 = asm ("vroundps $0, %%ymm0, %%ymm0" 898 : [ret] "={ymm0}" (-> F32x8), 899 : [v] "{ymm0}" (ymm0), 900 ); 901 ymm1 = asm ("vroundps $0, %%ymm1, %%ymm1" 902 : [ret] "={ymm1}" (-> F32x8), 903 : [v] "{ymm1}" (ymm1), 904 ); 905 return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); 906 } 907 } else { 908 const sign = andInt(v, splatNegativeZero(T)); 909 const magic = orInt(splatNoFraction(T), sign); 910 var r1 = v + magic; 911 r1 = r1 - magic; 912 const r2 = abs(v); 913 const mask = r2 <= splatNoFraction(T); 914 return select(mask, r1, v); 915 } 916} 917test "zmath.round" { 918 { 919 try expect(all(round(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); 920 try expect(all(round(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); 921 try expect(all(isNan(round(splat(F32x4, math.nan(f32)))), 0)); 922 try expect(all(isNan(round(splat(F32x4, -math.nan(f32)))), 0)); 923 try expect(all(isNan(round(splat(F32x4, math.snan(f32)))), 0)); 924 try expect(all(isNan(round(splat(F32x4, -math.snan(f32)))), 0)); 925 } 926 { 927 const v = round(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); 928 try expectVecApproxEqAbs( 929 v, 930 f32x16(1.0, -1.0, -2.0, 2.0, 2.0, 3.0, 3.0, 4.0, 6.0, 6.0, 8.0, 9.0, 10.0, 11.0, 13.0, 13.0), 931 0.0, 932 ); 933 } 934 var v = round(f32x4(1.1, -1.1, -1.5, 1.5)); 935 try expectVecEqual(v, f32x4(1.0, -1.0, -2.0, 2.0)); 936 937 const v1 = f32x4(-10_000_000.1, -math.inf(f32), 10_000_001.5, math.inf(f32)); 938 v = round(v1); 939 try expect(v[3] == math.inf(f32)); 940 try expectVecEqual(v, f32x4(-10_000_000.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); 941 942 const v2 = f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32)); 943 v = round(v2); 944 try expect(math.isNan(v2[0])); 945 try expect(math.isNan(v2[1])); 946 try expect(math.isNan(v2[2])); 947 try expect(v2[3] == -math.inf(f32)); 948 949 const v3 = f32x4(1001.5, -201.499, -10000.99, -101.5); 950 v = round(v3); 951 try expectVecEqual(v, f32x4(1002.0, -201.0, -10001.0, -102.0)); 952 953 const v4 = f32x4(-1_388_609.9, 1_388_609.5, 1_388_109.01, 2_388_609.5); 954 v = round(v4); 955 try expectVecEqual(v, f32x4(-1_388_610.0, 1_388_610.0, 1_388_109.0, 2_388_610.0)); 956 957 var f: f32 = -100.0; 958 var i: u32 = 0; 959 while (i < 100) : (i += 1) { 960 const vr = round(splat(F32x4, f)); 961 const fr = @round(splat(F32x4, f)); 962 const vr8 = round(splat(F32x8, f)); 963 const fr8 = @round(splat(F32x8, f)); 964 const vr16 = round(splat(F32x16, f)); 965 const fr16 = @round(splat(F32x16, f)); 966 try expectVecEqual(vr, fr); 967 try expectVecEqual(vr8, fr8); 968 try expectVecEqual(vr16, fr16); 969 f += 0.12345 * @as(f32, @floatFromInt(i)); 970 } 971} 972 973pub fn trunc(v: anytype) @TypeOf(v) { 974 const T = @TypeOf(v); 975 if (cpu_arch == .x86_64 and has_avx) { 976 if (T == F32x4) { 977 return asm ("vroundps $3, %%xmm0, %%xmm0" 978 : [ret] "={xmm0}" (-> T), 979 : [v] "{xmm0}" (v), 980 ); 981 } else if (T == F32x8) { 982 return asm ("vroundps $3, %%ymm0, %%ymm0" 983 : [ret] "={ymm0}" (-> T), 984 : [v] "{ymm0}" (v), 985 ); 986 } else if (T == F32x16 and has_avx512f) { 987 return asm ("vrndscaleps $3, %%zmm0, %%zmm0" 988 : [ret] "={zmm0}" (-> T), 989 : [v] "{zmm0}" (v), 990 ); 991 } else if (T == F32x16 and !has_avx512f) { 992 const arr: [16]f32 = v; 993 var ymm0 = @as(F32x8, arr[0..8].*); 994 var ymm1 = @as(F32x8, arr[8..16].*); 995 ymm0 = asm ("vroundps $3, %%ymm0, %%ymm0" 996 : [ret] "={ymm0}" (-> F32x8), 997 : [v] "{ymm0}" (ymm0), 998 ); 999 ymm1 = asm ("vroundps $3, %%ymm1, %%ymm1" 1000 : [ret] "={ymm1}" (-> F32x8), 1001 : [v] "{ymm1}" (ymm1), 1002 ); 1003 return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); 1004 } 1005 } else { 1006 const mask = abs(v) < splatNoFraction(T); 1007 const result = floatToIntAndBack(v); 1008 return select(mask, result, v); 1009 } 1010} 1011test "zmath.trunc" { 1012 { 1013 try expect(all(trunc(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); 1014 try expect(all(trunc(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); 1015 try expect(all(isNan(trunc(splat(F32x4, math.nan(f32)))), 0)); 1016 try expect(all(isNan(trunc(splat(F32x4, -math.nan(f32)))), 0)); 1017 try expect(all(isNan(trunc(splat(F32x4, math.snan(f32)))), 0)); 1018 try expect(all(isNan(trunc(splat(F32x4, -math.snan(f32)))), 0)); 1019 } 1020 { 1021 const v = trunc(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); 1022 try expectVecApproxEqAbs( 1023 v, 1024 f32x16(1.0, -1.0, -1.0, 1.0, 2.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 10.0, 11.0, 12.0, 13.0), 1025 0.0, 1026 ); 1027 } 1028 var v = trunc(f32x4(1.1, -1.1, -1.5, 1.5)); 1029 try expectVecEqual(v, f32x4(1.0, -1.0, -1.0, 1.0)); 1030 1031 v = trunc(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); 1032 try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); 1033 1034 v = trunc(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); 1035 try expect(math.isNan(v[0])); 1036 try expect(math.isNan(v[1])); 1037 try expect(math.isNan(v[2])); 1038 try expect(v[3] == -math.inf(f32)); 1039 1040 v = trunc(f32x4(1000.5001, -201.499, -10000.99, 100.750001)); 1041 try expectVecEqual(v, f32x4(1000.0, -201.0, -10000.0, 100.0)); 1042 1043 v = trunc(f32x4(-7_388_609.5, 7_388_609.1, 8_388_109.5, -8_388_509.5)); 1044 try expectVecEqual(v, f32x4(-7_388_609.0, 7_388_609.0, 8_388_109.0, -8_388_509.0)); 1045 1046 var f: f32 = -100.0; 1047 var i: u32 = 0; 1048 while (i < 100) : (i += 1) { 1049 const vr = trunc(splat(F32x4, f)); 1050 const fr = @trunc(splat(F32x4, f)); 1051 const vr8 = trunc(splat(F32x8, f)); 1052 const fr8 = @trunc(splat(F32x8, f)); 1053 const vr16 = trunc(splat(F32x16, f)); 1054 const fr16 = @trunc(splat(F32x16, f)); 1055 try expectVecEqual(vr, fr); 1056 try expectVecEqual(vr8, fr8); 1057 try expectVecEqual(vr16, fr16); 1058 f += 0.12345 * @as(f32, @floatFromInt(i)); 1059 } 1060} 1061 1062pub fn floor(v: anytype) @TypeOf(v) { 1063 const T = @TypeOf(v); 1064 if (cpu_arch == .x86_64 and has_avx) { 1065 if (T == F32x4) { 1066 return asm ("vroundps $1, %%xmm0, %%xmm0" 1067 : [ret] "={xmm0}" (-> T), 1068 : [v] "{xmm0}" (v), 1069 ); 1070 } else if (T == F32x8) { 1071 return asm ("vroundps $1, %%ymm0, %%ymm0" 1072 : [ret] "={ymm0}" (-> T), 1073 : [v] "{ymm0}" (v), 1074 ); 1075 } else if (T == F32x16 and has_avx512f) { 1076 return asm ("vrndscaleps $1, %%zmm0, %%zmm0" 1077 : [ret] "={zmm0}" (-> T), 1078 : [v] "{zmm0}" (v), 1079 ); 1080 } else if (T == F32x16 and !has_avx512f) { 1081 const arr: [16]f32 = v; 1082 var ymm0 = @as(F32x8, arr[0..8].*); 1083 var ymm1 = @as(F32x8, arr[8..16].*); 1084 ymm0 = asm ("vroundps $1, %%ymm0, %%ymm0" 1085 : [ret] "={ymm0}" (-> F32x8), 1086 : [v] "{ymm0}" (ymm0), 1087 ); 1088 ymm1 = asm ("vroundps $1, %%ymm1, %%ymm1" 1089 : [ret] "={ymm1}" (-> F32x8), 1090 : [v] "{ymm1}" (ymm1), 1091 ); 1092 return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); 1093 } 1094 } else { 1095 const mask = abs(v) < splatNoFraction(T); 1096 var result = floatToIntAndBack(v); 1097 const larger_mask = result > v; 1098 const larger = select(larger_mask, splat(T, -1.0), splat(T, 0.0)); 1099 result = result + larger; 1100 return select(mask, result, v); 1101 } 1102} 1103test "zmath.floor" { 1104 { 1105 try expect(all(floor(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); 1106 try expect(all(floor(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); 1107 try expect(all(isNan(floor(splat(F32x4, math.nan(f32)))), 0)); 1108 try expect(all(isNan(floor(splat(F32x4, -math.nan(f32)))), 0)); 1109 try expect(all(isNan(floor(splat(F32x4, math.snan(f32)))), 0)); 1110 try expect(all(isNan(floor(splat(F32x4, -math.snan(f32)))), 0)); 1111 } 1112 { 1113 const v = floor(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); 1114 try expectVecApproxEqAbs( 1115 v, 1116 f32x16(1.0, -2.0, -2.0, 1.0, 2.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 10.0, 11.0, 12.0, 13.0), 1117 0.0, 1118 ); 1119 } 1120 var v = floor(f32x4(1.5, -1.5, -1.7, -2.1)); 1121 try expectVecEqual(v, f32x4(1.0, -2.0, -2.0, -3.0)); 1122 1123 v = floor(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); 1124 try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); 1125 1126 v = floor(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); 1127 try expect(math.isNan(v[0])); 1128 try expect(math.isNan(v[1])); 1129 try expect(math.isNan(v[2])); 1130 try expect(v[3] == -math.inf(f32)); 1131 1132 v = floor(f32x4(1000.5001, -201.499, -10000.99, 100.75001)); 1133 try expectVecEqual(v, f32x4(1000.0, -202.0, -10001.0, 100.0)); 1134 1135 v = floor(f32x4(-7_388_609.5, 7_388_609.1, 8_388_109.5, -8_388_509.5)); 1136 try expectVecEqual(v, f32x4(-7_388_610.0, 7_388_609.0, 8_388_109.0, -8_388_510.0)); 1137 1138 var f: f32 = -100.0; 1139 var i: u32 = 0; 1140 while (i < 100) : (i += 1) { 1141 const vr = floor(splat(F32x4, f)); 1142 const fr = @floor(splat(F32x4, f)); 1143 const vr8 = floor(splat(F32x8, f)); 1144 const fr8 = @floor(splat(F32x8, f)); 1145 const vr16 = floor(splat(F32x16, f)); 1146 const fr16 = @floor(splat(F32x16, f)); 1147 try expectVecEqual(vr, fr); 1148 try expectVecEqual(vr8, fr8); 1149 try expectVecEqual(vr16, fr16); 1150 f += 0.12345 * @as(f32, @floatFromInt(i)); 1151 } 1152} 1153 1154pub fn ceil(v: anytype) @TypeOf(v) { 1155 const T = @TypeOf(v); 1156 if (cpu_arch == .x86_64 and has_avx) { 1157 if (T == F32x4) { 1158 return asm ("vroundps $2, %%xmm0, %%xmm0" 1159 : [ret] "={xmm0}" (-> T), 1160 : [v] "{xmm0}" (v), 1161 ); 1162 } else if (T == F32x8) { 1163 return asm ("vroundps $2, %%ymm0, %%ymm0" 1164 : [ret] "={ymm0}" (-> T), 1165 : [v] "{ymm0}" (v), 1166 ); 1167 } else if (T == F32x16 and has_avx512f) { 1168 return asm ("vrndscaleps $2, %%zmm0, %%zmm0" 1169 : [ret] "={zmm0}" (-> T), 1170 : [v] "{zmm0}" (v), 1171 ); 1172 } else if (T == F32x16 and !has_avx512f) { 1173 const arr: [16]f32 = v; 1174 var ymm0 = @as(F32x8, arr[0..8].*); 1175 var ymm1 = @as(F32x8, arr[8..16].*); 1176 ymm0 = asm ("vroundps $2, %%ymm0, %%ymm0" 1177 : [ret] "={ymm0}" (-> F32x8), 1178 : [v] "{ymm0}" (ymm0), 1179 ); 1180 ymm1 = asm ("vroundps $2, %%ymm1, %%ymm1" 1181 : [ret] "={ymm1}" (-> F32x8), 1182 : [v] "{ymm1}" (ymm1), 1183 ); 1184 return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); 1185 } 1186 } else { 1187 const mask = abs(v) < splatNoFraction(T); 1188 var result = floatToIntAndBack(v); 1189 const smaller_mask = result < v; 1190 const smaller = select(smaller_mask, splat(T, -1.0), splat(T, 0.0)); 1191 result = result - smaller; 1192 return select(mask, result, v); 1193 } 1194} 1195test "zmath.ceil" { 1196 { 1197 try expect(all(ceil(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); 1198 try expect(all(ceil(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); 1199 try expect(all(isNan(ceil(splat(F32x4, math.nan(f32)))), 0)); 1200 try expect(all(isNan(ceil(splat(F32x4, -math.nan(f32)))), 0)); 1201 try expect(all(isNan(ceil(splat(F32x4, math.snan(f32)))), 0)); 1202 try expect(all(isNan(ceil(splat(F32x4, -math.snan(f32)))), 0)); 1203 } 1204 { 1205 const v = ceil(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); 1206 try expectVecApproxEqAbs( 1207 v, 1208 f32x16(2.0, -1.0, -1.0, 2.0, 3.0, 3.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0, 11.0, 12.0, 13.0, 14.0), 1209 0.0, 1210 ); 1211 } 1212 var v = ceil(f32x4(1.5, -1.5, -1.7, -2.1)); 1213 try expectVecEqual(v, f32x4(2.0, -1.0, -1.0, -2.0)); 1214 1215 v = ceil(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); 1216 try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); 1217 1218 v = ceil(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); 1219 try expect(math.isNan(v[0])); 1220 try expect(math.isNan(v[1])); 1221 try expect(math.isNan(v[2])); 1222 try expect(v[3] == -math.inf(f32)); 1223 1224 v = ceil(f32x4(1000.5001, -201.499, -10000.99, 100.75001)); 1225 try expectVecEqual(v, f32x4(1001.0, -201.0, -10000.0, 101.0)); 1226 1227 v = ceil(f32x4(-1_388_609.5, 1_388_609.1, 1_388_109.9, -1_388_509.9)); 1228 try expectVecEqual(v, f32x4(-1_388_609.0, 1_388_610.0, 1_388_110.0, -1_388_509.0)); 1229 1230 var f: f32 = -100.0; 1231 var i: u32 = 0; 1232 while (i < 100) : (i += 1) { 1233 const vr = ceil(splat(F32x4, f)); 1234 const fr = @ceil(splat(F32x4, f)); 1235 const vr8 = ceil(splat(F32x8, f)); 1236 const fr8 = @ceil(splat(F32x8, f)); 1237 const vr16 = ceil(splat(F32x16, f)); 1238 const fr16 = @ceil(splat(F32x16, f)); 1239 try expectVecEqual(vr, fr); 1240 try expectVecEqual(vr8, fr8); 1241 try expectVecEqual(vr16, fr16); 1242 f += 0.12345 * @as(f32, @floatFromInt(i)); 1243 } 1244} 1245 1246pub inline fn clamp(v: anytype, vmin: anytype, vmax: anytype) @TypeOf(v, vmin, vmax) { 1247 var result = max(vmin, v); 1248 result = min(vmax, result); 1249 return result; 1250} 1251test "zmath.clamp" { 1252 // Calling math.inf causes test to fail! 1253 if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; 1254 { 1255 const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); 1256 const v = clamp(v0, splat(F32x4, -0.5), splat(F32x4, 0.5)); 1257 try expectVecApproxEqAbs(v, f32x4(-0.5, 0.2, 0.5, -0.3), 0.0001); 1258 } 1259 { 1260 const v0 = f32x8(-2.0, 0.25, -0.25, 100.0, -1.0, 0.2, 1.1, -0.3); 1261 const v = clamp(v0, splat(F32x8, -0.5), splat(F32x8, 0.5)); 1262 try expectVecApproxEqAbs(v, f32x8(-0.5, 0.25, -0.25, 0.5, -0.5, 0.2, 0.5, -0.3), 0.0001); 1263 } 1264 { 1265 const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); 1266 const v = clamp(v0, f32x4(-100.0, 0.0, -100.0, 0.0), f32x4(0.0, 100.0, 0.0, 100.0)); 1267 try expectVecApproxEqAbs(v, f32x4(-100.0, 100.0, -100.0, 0.0), 0.0001); 1268 } 1269 { 1270 const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); 1271 const v = clamp(v0, splat(F32x4, -1.0), splat(F32x4, 1.0)); 1272 try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, -1.0, -1.0), 0.0001); 1273 } 1274} 1275 1276pub inline fn clampFast(v: anytype, vmin: anytype, vmax: anytype) @TypeOf(v, vmin, vmax) { 1277 var result = maxFast(vmin, v); 1278 result = minFast(vmax, result); 1279 return result; 1280} 1281test "zmath.clampFast" { 1282 { 1283 const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); 1284 const v = clampFast(v0, splat(F32x4, -0.5), splat(F32x4, 0.5)); 1285 try expectVecApproxEqAbs(v, f32x4(-0.5, 0.2, 0.5, -0.3), 0.0001); 1286 } 1287} 1288 1289pub inline fn saturate(v: anytype) @TypeOf(v) { 1290 const T = @TypeOf(v); 1291 var result = max(v, splat(T, 0.0)); 1292 result = min(result, splat(T, 1.0)); 1293 return result; 1294} 1295test "zmath.saturate" { 1296 // Calling math.inf causes test to fail! 1297 if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; 1298 { 1299 const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); 1300 const v = saturate(v0); 1301 try expectVecApproxEqAbs(v, f32x4(0.0, 0.2, 1.0, 0.0), 0.0001); 1302 } 1303 { 1304 const v0 = f32x8(0.0, 0.0, 2.0, -2.0, -1.0, 0.2, 1.1, -0.3); 1305 const v = saturate(v0); 1306 try expectVecApproxEqAbs(v, f32x8(0.0, 0.0, 1.0, 0.0, 0.0, 0.2, 1.0, 0.0), 0.0001); 1307 } 1308 { 1309 const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); 1310 const v = saturate(v0); 1311 try expectVecApproxEqAbs(v, f32x4(0.0, 1.0, 0.0, 0.0), 0.0001); 1312 } 1313 { 1314 const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); 1315 const v = saturate(v0); 1316 try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, 0.0, 0.0), 0.0001); 1317 } 1318} 1319 1320pub inline fn saturateFast(v: anytype) @TypeOf(v) { 1321 const T = @TypeOf(v); 1322 var result = maxFast(v, splat(T, 0.0)); 1323 result = minFast(result, splat(T, 1.0)); 1324 return result; 1325} 1326test "zmath.saturateFast" { 1327 { 1328 const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); 1329 const v = saturateFast(v0); 1330 try expectVecApproxEqAbs(v, f32x4(0.0, 0.2, 1.0, 0.0), 0.0001); 1331 } 1332 { 1333 const v0 = f32x8(0.0, 0.0, 2.0, -2.0, -1.0, 0.2, 1.1, -0.3); 1334 const v = saturateFast(v0); 1335 try expectVecApproxEqAbs(v, f32x8(0.0, 0.0, 1.0, 0.0, 0.0, 0.2, 1.0, 0.0), 0.0001); 1336 } 1337 { 1338 const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); 1339 const v = saturateFast(v0); 1340 try expectVecApproxEqAbs(v, f32x4(0.0, 1.0, 0.0, 0.0), 0.0001); 1341 } 1342 { 1343 const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); 1344 const v = saturateFast(v0); 1345 try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, 0.0, 0.0), 0.0001); 1346 } 1347} 1348 1349pub inline fn sqrt(v: anytype) @TypeOf(v) { 1350 return @sqrt(v); // sqrtps 1351} 1352 1353pub inline fn abs(v: anytype) @TypeOf(v) { 1354 return @abs(v); // load, andps 1355} 1356 1357pub inline fn select(mask: anytype, v0: anytype, v1: anytype) @TypeOf(v0, v1) { 1358 return @select(f32, mask, v0, v1); 1359} 1360 1361pub inline fn lerp(v0: anytype, v1: anytype, t: f32) @TypeOf(v0, v1) { 1362 const T = @TypeOf(v0, v1); 1363 return v0 + (v1 - v0) * splat(T, t); // subps, shufps, addps, mulps 1364} 1365 1366pub inline fn lerpV(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1, t) { 1367 return v0 + (v1 - v0) * t; // subps, addps, mulps 1368} 1369 1370pub inline fn lerpInverse(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1) { 1371 const T = @TypeOf(v0, v1); 1372 return (splat(T, t) - v0) / (v1 - v0); 1373} 1374 1375pub inline fn lerpInverseV(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1, t) { 1376 return (t - v0) / (v1 - v0); 1377} 1378test "zmath.lerpInverse" { 1379 try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 10.0), 0, 0.0005)); 1380 try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 100.0), 1, 0.0005)); 1381 try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 55.0), 0.5, 0.05)); 1382 try expectVecApproxEqAbs(lerpInverse(f32x4(0, 0, 10, 10), f32x4(100, 200, 100, 100), 10.0), f32x4(0.1, 0.05, 0, 0), 0.0005); 1383} 1384 1385// Frame rate independent lerp (or "damp"), for approaching things over time. 1386// Reference: https://www.gamedeveloper.com/programming/improved-lerp-smoothing- 1387pub inline fn lerpOverTime(v0: anytype, v1: anytype, rate: anytype, dt: anytype) @TypeOf(v0, v1) { 1388 const t = std.math.exp2(-rate * dt); 1389 return lerp(v0, v1, t); 1390} 1391 1392pub inline fn lerpVOverTime(v0: anytype, v1: anytype, rate: anytype, dt: anytype) @TypeOf(v0, v1, rate, dt) { 1393 const t = std.math.exp2(-rate * dt); 1394 return lerpV(v0, v1, t); 1395} 1396test "zmath.lerpOverTime" { 1397 try expect(math.approxEqAbs(f32, lerpVOverTime(0.0, 1.0, 1.0, 1.0), 0.5, 0.0005)); 1398 try expect(math.approxEqAbs(f32, lerpVOverTime(0.5, 1.0, 1.0, 1.0), 0.75, 0.0005)); 1399 try expectVecApproxEqAbs(lerpOverTime(f32x4(0, 0, 10, 10), f32x4(100, 200, 100, 100), 1.0, 1.0), f32x4(50, 100, 55, 55), 0.0005); 1400} 1401 1402/// To transform a vector of values from one range to another. 1403pub inline fn mapLinear(v: anytype, min1: anytype, max1: anytype, min2: anytype, max2: anytype) @TypeOf(v) { 1404 const T = @TypeOf(v); 1405 const min1V = splat(T, min1); 1406 const max1V = splat(T, max1); 1407 const min2V = splat(T, min2); 1408 const max2V = splat(T, max2); 1409 const dV = max1V - min1V; 1410 return min2V + (v - min1V) * (max2V - min2V) / dV; 1411} 1412 1413pub inline fn mapLinearV(v: anytype, min1: anytype, max1: anytype, min2: anytype, max2: anytype) @TypeOf(v, min1, max1, min2, max2) { 1414 const d = max1 - min1; 1415 return min2 + (v - min1) * (max2 - min2) / d; 1416} 1417test "zmath.mapLinear" { 1418 try expect(math.approxEqAbs(f32, mapLinearV(0, 0, 1.2, 10, 100), 10, 0.0005)); 1419 try expect(math.approxEqAbs(f32, mapLinearV(1.2, 0, 1.2, 10, 100), 100, 0.0005)); 1420 try expect(math.approxEqAbs(f32, mapLinearV(0.6, 0, 1.2, 10, 100), 55, 0.0005)); 1421 try expectVecApproxEqAbs(mapLinearV(splat(F32x4, 0), splat(F32x4, 0), splat(F32x4, 1.2), splat(F32x4, 10), splat(F32x4, 100)), splat(F32x4, 10), 0.0005); 1422 try expectVecApproxEqAbs(mapLinear(f32x4(0, 0, 0.6, 1.2), 0, 1.2, 10, 100), f32x4(10, 10, 55, 100), 0.0005); 1423} 1424 1425pub const F32x4Component = enum { x, y, z, w }; 1426 1427pub inline fn swizzle( 1428 v: F32x4, 1429 comptime x: F32x4Component, 1430 comptime y: F32x4Component, 1431 comptime z: F32x4Component, 1432 comptime w: F32x4Component, 1433) F32x4 { 1434 return @shuffle(f32, v, undefined, [4]i32{ @intFromEnum(x), @intFromEnum(y), @intFromEnum(z), @intFromEnum(w) }); 1435} 1436 1437pub inline fn mod(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 1438 // vdivps, vroundps, vmulps, vsubps 1439 return v0 - v1 * trunc(v0 / v1); 1440} 1441test "zmath.mod" { 1442 try expectVecApproxEqAbs(mod(splat(F32x4, 3.1), splat(F32x4, 1.7)), splat(F32x4, 1.4), 0.0005); 1443 try expectVecApproxEqAbs(mod(splat(F32x4, -3.0), splat(F32x4, 2.0)), splat(F32x4, -1.0), 0.0005); 1444 try expectVecApproxEqAbs(mod(splat(F32x4, -3.0), splat(F32x4, -2.0)), splat(F32x4, -1.0), 0.0005); 1445 try expectVecApproxEqAbs(mod(splat(F32x4, 3.0), splat(F32x4, -2.0)), splat(F32x4, 1.0), 0.0005); 1446 try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, 1.0))), 0)); 1447 try expect(all(isNan(mod(splat(F32x4, -math.inf(f32)), splat(F32x4, 123.456))), 0)); 1448 try expect(all(isNan(mod(splat(F32x4, math.nan(f32)), splat(F32x4, 123.456))), 0)); 1449 try expect(all(isNan(mod(splat(F32x4, math.snan(f32)), splat(F32x4, 123.456))), 0)); 1450 try expect(all(isNan(mod(splat(F32x4, -math.snan(f32)), splat(F32x4, 123.456))), 0)); 1451 try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, math.inf(f32)))), 0)); 1452 try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, -math.inf(f32)))), 0)); 1453 try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, math.inf(f32)))), 0)); 1454 try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, math.nan(f32)))), 0)); 1455 try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, math.nan(f32)))), 0)); 1456} 1457 1458pub fn modAngle(v: anytype) @TypeOf(v) { 1459 const T = @TypeOf(v); 1460 return switch (T) { 1461 f32 => modAngle32(v), 1462 F32x4, F32x8, F32x16 => modAngle32xN(v), 1463 else => @compileError("zmath.modAngle() not implemented for " ++ @typeName(T)), 1464 }; 1465} 1466 1467pub inline fn modAngle32xN(v: anytype) @TypeOf(v) { 1468 const T = @TypeOf(v); 1469 return v - splat(T, math.tau) * round(v * splat(T, 1.0 / math.tau)); // 2 x vmulps, 2 x load, vroundps, vaddps 1470} 1471test "zmath.modAngle" { 1472 try expectVecApproxEqAbs(modAngle(splat(F32x4, math.tau)), splat(F32x4, 0.0), 0.0005); 1473 try expectVecApproxEqAbs(modAngle(splat(F32x4, 0.0)), splat(F32x4, 0.0), 0.0005); 1474 try expectVecApproxEqAbs(modAngle(splat(F32x4, math.pi)), splat(F32x4, math.pi), 0.0005); 1475 try expectVecApproxEqAbs(modAngle(splat(F32x4, 11 * math.pi)), splat(F32x4, math.pi), 0.0005); 1476 try expectVecApproxEqAbs(modAngle(splat(F32x4, 3.5 * math.pi)), splat(F32x4, -0.5 * math.pi), 0.0005); 1477 try expectVecApproxEqAbs(modAngle(splat(F32x4, 2.5 * math.pi)), splat(F32x4, 0.5 * math.pi), 0.0005); 1478} 1479 1480pub inline fn mulAdd(v0: anytype, v1: anytype, v2: anytype) @TypeOf(v0, v1, v2) { 1481 const T = @TypeOf(v0, v1, v2); 1482 if (@import("zmath_options").enable_cross_platform_determinism) { 1483 return v0 * v1 + v2; // Compiler will generate mul, add sequence (no fma even if the target supports it). 1484 } else { 1485 if (cpu_arch == .x86_64 and has_avx and has_fma) { 1486 return @mulAdd(T, v0, v1, v2); 1487 } else { 1488 // NOTE(mziulek): On .x86_64 without HW fma instructions @mulAdd maps to really slow code! 1489 return v0 * v1 + v2; 1490 } 1491 } 1492} 1493 1494fn sin32xN(v: anytype) @TypeOf(v) { 1495 // 11-degree minimax approximation 1496 const T = @TypeOf(v); 1497 1498 var x = modAngle(v); 1499 const sign = andInt(x, splatNegativeZero(T)); 1500 const c = orInt(sign, splat(T, math.pi)); 1501 const absx = andNotInt(sign, x); 1502 const rflx = c - x; 1503 const comp = absx <= splat(T, 0.5 * math.pi); 1504 x = select(comp, x, rflx); 1505 const x2 = x * x; 1506 1507 var result = mulAdd(splat(T, -2.3889859e-08), x2, splat(T, 2.7525562e-06)); 1508 result = mulAdd(result, x2, splat(T, -0.00019840874)); 1509 result = mulAdd(result, x2, splat(T, 0.0083333310)); 1510 result = mulAdd(result, x2, splat(T, -0.16666667)); 1511 result = mulAdd(result, x2, splat(T, 1.0)); 1512 return x * result; 1513} 1514test "zmath.sin" { 1515 const epsilon = 0.0001; 1516 1517 try expectVecApproxEqAbs(sin(splat(F32x4, 0.5 * math.pi)), splat(F32x4, 1.0), epsilon); 1518 try expectVecApproxEqAbs(sin(splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon); 1519 try expectVecApproxEqAbs(sin(splat(F32x4, -0.0)), splat(F32x4, -0.0), epsilon); 1520 try expectVecApproxEqAbs(sin(splat(F32x4, 89.123)), splat(F32x4, 0.916166), epsilon); 1521 try expectVecApproxEqAbs(sin(splat(F32x8, 89.123)), splat(F32x8, 0.916166), epsilon); 1522 try expectVecApproxEqAbs(sin(splat(F32x16, 89.123)), splat(F32x16, 0.916166), epsilon); 1523 try expect(all(isNan(sin(splat(F32x4, math.inf(f32)))), 0) == true); 1524 try expect(all(isNan(sin(splat(F32x4, -math.inf(f32)))), 0) == true); 1525 try expect(all(isNan(sin(splat(F32x4, math.nan(f32)))), 0) == true); 1526 try expect(all(isNan(sin(splat(F32x4, math.snan(f32)))), 0) == true); 1527 1528 var f: f32 = -100.0; 1529 var i: u32 = 0; 1530 while (i < 100) : (i += 1) { 1531 const vr = sin(splat(F32x4, f)); 1532 const fr = @sin(splat(F32x4, f)); 1533 const vr8 = sin(splat(F32x8, f)); 1534 const fr8 = @sin(splat(F32x8, f)); 1535 const vr16 = sin(splat(F32x16, f)); 1536 const fr16 = @sin(splat(F32x16, f)); 1537 try expectVecApproxEqAbs(vr, fr, epsilon); 1538 try expectVecApproxEqAbs(vr8, fr8, epsilon); 1539 try expectVecApproxEqAbs(vr16, fr16, epsilon); 1540 f += 0.12345 * @as(f32, @floatFromInt(i)); 1541 } 1542} 1543 1544fn cos32xN(v: anytype) @TypeOf(v) { 1545 // 10-degree minimax approximation 1546 const T = @TypeOf(v); 1547 1548 var x = modAngle(v); 1549 var sign = andInt(x, splatNegativeZero(T)); 1550 const c = orInt(sign, splat(T, math.pi)); 1551 const absx = andNotInt(sign, x); 1552 const rflx = c - x; 1553 const comp = absx <= splat(T, 0.5 * math.pi); 1554 x = select(comp, x, rflx); 1555 sign = select(comp, splat(T, 1.0), splat(T, -1.0)); 1556 const x2 = x * x; 1557 1558 var result = mulAdd(splat(T, -2.6051615e-07), x2, splat(T, 2.4760495e-05)); 1559 result = mulAdd(result, x2, splat(T, -0.0013888378)); 1560 result = mulAdd(result, x2, splat(T, 0.041666638)); 1561 result = mulAdd(result, x2, splat(T, -0.5)); 1562 result = mulAdd(result, x2, splat(T, 1.0)); 1563 return sign * result; 1564} 1565test "zmath.cos" { 1566 const epsilon = 0.0001; 1567 1568 try expectVecApproxEqAbs(cos(splat(F32x4, 0.5 * math.pi)), splat(F32x4, 0.0), epsilon); 1569 try expectVecApproxEqAbs(cos(splat(F32x4, 0.0)), splat(F32x4, 1.0), epsilon); 1570 try expectVecApproxEqAbs(cos(splat(F32x4, -0.0)), splat(F32x4, 1.0), epsilon); 1571 try expect(all(isNan(cos(splat(F32x4, math.inf(f32)))), 0) == true); 1572 try expect(all(isNan(cos(splat(F32x4, -math.inf(f32)))), 0) == true); 1573 try expect(all(isNan(cos(splat(F32x4, math.nan(f32)))), 0) == true); 1574 try expect(all(isNan(cos(splat(F32x4, math.snan(f32)))), 0) == true); 1575 1576 var f: f32 = -100.0; 1577 var i: u32 = 0; 1578 while (i < 100) : (i += 1) { 1579 const vr = cos(splat(F32x4, f)); 1580 const fr = @cos(splat(F32x4, f)); 1581 const vr8 = cos(splat(F32x8, f)); 1582 const fr8 = @cos(splat(F32x8, f)); 1583 const vr16 = cos(splat(F32x16, f)); 1584 const fr16 = @cos(splat(F32x16, f)); 1585 try expectVecApproxEqAbs(vr, fr, epsilon); 1586 try expectVecApproxEqAbs(vr8, fr8, epsilon); 1587 try expectVecApproxEqAbs(vr16, fr16, epsilon); 1588 f += 0.12345 * @as(f32, @floatFromInt(i)); 1589 } 1590} 1591 1592pub fn sin(v: anytype) @TypeOf(v) { 1593 const T = @TypeOf(v); 1594 return switch (T) { 1595 f32 => sin32(v), 1596 F32x4, F32x8, F32x16 => sin32xN(v), 1597 else => @compileError("zmath.sin() not implemented for " ++ @typeName(T)), 1598 }; 1599} 1600 1601pub fn cos(v: anytype) @TypeOf(v) { 1602 const T = @TypeOf(v); 1603 return switch (T) { 1604 f32 => cos32(v), 1605 F32x4, F32x8, F32x16 => cos32xN(v), 1606 else => @compileError("zmath.cos() not implemented for " ++ @typeName(T)), 1607 }; 1608} 1609 1610pub fn sincos(v: anytype) [2]@TypeOf(v) { 1611 const T = @TypeOf(v); 1612 return switch (T) { 1613 f32 => sincos32(v), 1614 F32x4, F32x8, F32x16 => sincos32xN(v), 1615 else => @compileError("zmath.sincos() not implemented for " ++ @typeName(T)), 1616 }; 1617} 1618 1619pub fn asin(v: anytype) @TypeOf(v) { 1620 const T = @TypeOf(v); 1621 return switch (T) { 1622 f32 => asin32(v), 1623 F32x4, F32x8, F32x16 => asin32xN(v), 1624 else => @compileError("zmath.asin() not implemented for " ++ @typeName(T)), 1625 }; 1626} 1627 1628pub fn acos(v: anytype) @TypeOf(v) { 1629 const T = @TypeOf(v); 1630 return switch (T) { 1631 f32 => acos32(v), 1632 F32x4, F32x8, F32x16 => acos32xN(v), 1633 else => @compileError("zmath.acos() not implemented for " ++ @typeName(T)), 1634 }; 1635} 1636 1637fn sincos32xN(v: anytype) [2]@TypeOf(v) { 1638 const T = @TypeOf(v); 1639 1640 var x = modAngle(v); 1641 var sign = andInt(x, splatNegativeZero(T)); 1642 const c = orInt(sign, splat(T, math.pi)); 1643 const absx = andNotInt(sign, x); 1644 const rflx = c - x; 1645 const comp = absx <= splat(T, 0.5 * math.pi); 1646 x = select(comp, x, rflx); 1647 sign = select(comp, splat(T, 1.0), splat(T, -1.0)); 1648 const x2 = x * x; 1649 1650 var sresult = mulAdd(splat(T, -2.3889859e-08), x2, splat(T, 2.7525562e-06)); 1651 sresult = mulAdd(sresult, x2, splat(T, -0.00019840874)); 1652 sresult = mulAdd(sresult, x2, splat(T, 0.0083333310)); 1653 sresult = mulAdd(sresult, x2, splat(T, -0.16666667)); 1654 sresult = x * mulAdd(sresult, x2, splat(T, 1.0)); 1655 1656 var cresult = mulAdd(splat(T, -2.6051615e-07), x2, splat(T, 2.4760495e-05)); 1657 cresult = mulAdd(cresult, x2, splat(T, -0.0013888378)); 1658 cresult = mulAdd(cresult, x2, splat(T, 0.041666638)); 1659 cresult = mulAdd(cresult, x2, splat(T, -0.5)); 1660 cresult = sign * mulAdd(cresult, x2, splat(T, 1.0)); 1661 1662 return .{ sresult, cresult }; 1663} 1664test "zmath.sincos32xN" { 1665 const epsilon = 0.0001; 1666 1667 var f: f32 = -100.0; 1668 var i: u32 = 0; 1669 while (i < 100) : (i += 1) { 1670 const sc = sincos(splat(F32x4, f)); 1671 const sc8 = sincos(splat(F32x8, f)); 1672 const sc16 = sincos(splat(F32x16, f)); 1673 const s4 = @sin(splat(F32x4, f)); 1674 const s8 = @sin(splat(F32x8, f)); 1675 const s16 = @sin(splat(F32x16, f)); 1676 const c4 = @cos(splat(F32x4, f)); 1677 const c8 = @cos(splat(F32x8, f)); 1678 const c16 = @cos(splat(F32x16, f)); 1679 try expectVecApproxEqAbs(sc[0], s4, epsilon); 1680 try expectVecApproxEqAbs(sc8[0], s8, epsilon); 1681 try expectVecApproxEqAbs(sc16[0], s16, epsilon); 1682 try expectVecApproxEqAbs(sc[1], c4, epsilon); 1683 try expectVecApproxEqAbs(sc8[1], c8, epsilon); 1684 try expectVecApproxEqAbs(sc16[1], c16, epsilon); 1685 f += 0.12345 * @as(f32, @floatFromInt(i)); 1686 } 1687} 1688 1689fn asin32xN(v: anytype) @TypeOf(v) { 1690 // 7-degree minimax approximation 1691 const T = @TypeOf(v); 1692 1693 const x = abs(v); 1694 const root = sqrt(maxFast(splat(T, 0.0), splat(T, 1.0) - x)); 1695 1696 var t0 = mulAdd(splat(T, -0.0012624911), x, splat(T, 0.0066700901)); 1697 t0 = mulAdd(t0, x, splat(T, -0.0170881256)); 1698 t0 = mulAdd(t0, x, splat(T, 0.0308918810)); 1699 t0 = mulAdd(t0, x, splat(T, -0.0501743046)); 1700 t0 = mulAdd(t0, x, splat(T, 0.0889789874)); 1701 t0 = mulAdd(t0, x, splat(T, -0.2145988016)); 1702 t0 = root * mulAdd(t0, x, splat(T, 1.5707963050)); 1703 1704 const t1 = splat(T, math.pi) - t0; 1705 return splat(T, 0.5 * math.pi) - select(v >= splat(T, 0.0), t0, t1); 1706} 1707 1708fn acos32xN(v: anytype) @TypeOf(v) { 1709 // 7-degree minimax approximation 1710 const T = @TypeOf(v); 1711 1712 const x = abs(v); 1713 const root = sqrt(maxFast(splat(T, 0.0), splat(T, 1.0) - x)); 1714 1715 var t0 = mulAdd(splat(T, -0.0012624911), x, splat(T, 0.0066700901)); 1716 t0 = mulAdd(t0, x, splat(T, -0.0170881256)); 1717 t0 = mulAdd(t0, x, splat(T, 0.0308918810)); 1718 t0 = mulAdd(t0, x, splat(T, -0.0501743046)); 1719 t0 = mulAdd(t0, x, splat(T, 0.0889789874)); 1720 t0 = mulAdd(t0, x, splat(T, -0.2145988016)); 1721 t0 = root * mulAdd(t0, x, splat(T, 1.5707963050)); 1722 1723 const t1 = splat(T, math.pi) - t0; 1724 return select(v >= splat(T, 0.0), t0, t1); 1725} 1726 1727pub fn atan(v: anytype) @TypeOf(v) { 1728 // 17-degree minimax approximation 1729 const T = @TypeOf(v); 1730 1731 const vabs = abs(v); 1732 const vinv = splat(T, 1.0) / v; 1733 var sign = select(v > splat(T, 1.0), splat(T, 1.0), splat(T, -1.0)); 1734 const comp = vabs <= splat(T, 1.0); 1735 sign = select(comp, splat(T, 0.0), sign); 1736 const x = select(comp, v, vinv); 1737 const x2 = x * x; 1738 1739 var result = mulAdd(splat(T, 0.0028662257), x2, splat(T, -0.0161657367)); 1740 result = mulAdd(result, x2, splat(T, 0.0429096138)); 1741 result = mulAdd(result, x2, splat(T, -0.0752896400)); 1742 result = mulAdd(result, x2, splat(T, 0.1065626393)); 1743 result = mulAdd(result, x2, splat(T, -0.1420889944)); 1744 result = mulAdd(result, x2, splat(T, 0.1999355085)); 1745 result = mulAdd(result, x2, splat(T, -0.3333314528)); 1746 result = x * mulAdd(result, x2, splat(T, 1.0)); 1747 1748 const result1 = sign * splat(T, 0.5 * math.pi) - result; 1749 return select(sign == splat(T, 0.0), result, result1); 1750} 1751test "zmath.atan" { 1752 const epsilon = 0.0001; 1753 { 1754 const v = f32x4(0.25, 0.5, 1.0, 1.25); 1755 const e = f32x4(math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3])); 1756 try expectVecApproxEqAbs(e, atan(v), epsilon); 1757 } 1758 { 1759 const v = f32x8(-0.25, 0.5, -1.0, 1.25, 100.0, -200.0, 300.0, 400.0); 1760 // zig fmt: off 1761 const e = f32x8( 1762 math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3]), 1763 math.atan(v[4]), math.atan(v[5]), math.atan(v[6]), math.atan(v[7]), 1764 ); 1765 // zig fmt: on 1766 try expectVecApproxEqAbs(e, atan(v), epsilon); 1767 } 1768 { 1769 // zig fmt: off 1770 const v = f32x16( 1771 -0.25, 0.5, -1.0, 0.0, 0.1, -0.2, 30.0, 400.0, 1772 -0.25, 0.5, -1.0, -0.0, -0.05, -0.125, 0.0625, 4000.0 1773 ); 1774 const e = f32x16( 1775 math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3]), 1776 math.atan(v[4]), math.atan(v[5]), math.atan(v[6]), math.atan(v[7]), 1777 math.atan(v[8]), math.atan(v[9]), math.atan(v[10]), math.atan(v[11]), 1778 math.atan(v[12]), math.atan(v[13]), math.atan(v[14]), math.atan(v[15]), 1779 ); 1780 // zig fmt: on 1781 try expectVecApproxEqAbs(e, atan(v), epsilon); 1782 } 1783 { 1784 try expectVecApproxEqAbs(atan(splat(F32x4, math.inf(f32))), splat(F32x4, 0.5 * math.pi), epsilon); 1785 try expectVecApproxEqAbs(atan(splat(F32x4, -math.inf(f32))), splat(F32x4, -0.5 * math.pi), epsilon); 1786 try expect(all(isNan(atan(splat(F32x4, math.nan(f32)))), 0) == true); 1787 try expect(all(isNan(atan(splat(F32x4, -math.nan(f32)))), 0) == true); 1788 } 1789} 1790 1791pub fn atan2(vy: anytype, vx: anytype) @TypeOf(vx, vy) { 1792 const T = @TypeOf(vx, vy); 1793 const Tu = @Vector(veclen(T), u32); 1794 1795 const vx_is_positive = 1796 (@as(Tu, @bitCast(vx)) & @as(Tu, @splat(0x8000_0000))) == @as(Tu, @splat(0)); 1797 1798 const vy_sign = andInt(vy, splatNegativeZero(T)); 1799 const c0_25pi = orInt(vy_sign, @as(T, @splat(0.25 * math.pi))); 1800 const c0_50pi = orInt(vy_sign, @as(T, @splat(0.50 * math.pi))); 1801 const c0_75pi = orInt(vy_sign, @as(T, @splat(0.75 * math.pi))); 1802 const c1_00pi = orInt(vy_sign, @as(T, @splat(1.00 * math.pi))); 1803 1804 var r1 = select(vx_is_positive, vy_sign, c1_00pi); 1805 var r2 = select(vx == splat(T, 0.0), c0_50pi, splatInt(T, 0xffff_ffff)); 1806 const r3 = select(vy == splat(T, 0.0), r1, r2); 1807 const r4 = select(vx_is_positive, c0_25pi, c0_75pi); 1808 const r5 = select(isInf(vx), r4, c0_50pi); 1809 const result = select(isInf(vy), r5, r3); 1810 const result_valid = @as(Tu, @bitCast(result)) == @as(Tu, @splat(0xffff_ffff)); 1811 1812 const v = vy / vx; 1813 const r0 = atan(v); 1814 1815 r1 = select(vx_is_positive, splatNegativeZero(T), c1_00pi); 1816 r2 = r0 + r1; 1817 1818 return select(result_valid, r2, result); 1819} 1820test "zmath.atan2" { 1821 // From DirectXMath XMVectorATan2(): 1822 // 1823 // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: 1824 1825 // Y == 0 and X is Negative -> Pi with the sign of Y 1826 // y == 0 and x is positive -> 0 with the sign of y 1827 // Y != 0 and X == 0 -> Pi / 2 with the sign of Y 1828 // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) 1829 // X == -Infinity and Finite Y -> Pi with the sign of Y 1830 // X == +Infinity and Finite Y -> 0 with the sign of Y 1831 // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y 1832 // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y 1833 // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y 1834 1835 const epsilon = 0.0001; 1836 try expectVecApproxEqAbs(atan2(splat(F32x4, 0.0), splat(F32x4, -1.0)), splat(F32x4, math.pi), epsilon); 1837 try expectVecApproxEqAbs(atan2(splat(F32x4, -0.0), splat(F32x4, -1.0)), splat(F32x4, -math.pi), epsilon); 1838 try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, 0.0)), splat(F32x4, 0.5 * math.pi), epsilon); 1839 try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, 0.0)), splat(F32x4, -0.5 * math.pi), epsilon); 1840 try expectVecApproxEqAbs( 1841 atan2(splat(F32x4, 1.0), splat(F32x4, -1.0)), 1842 splat(F32x4, math.atan(@as(f32, -1.0)) + math.pi), 1843 epsilon, 1844 ); 1845 try expectVecApproxEqAbs( 1846 atan2(splat(F32x4, -10.0), splat(F32x4, -2.0)), 1847 splat(F32x4, math.atan(@as(f32, 5.0)) - math.pi), 1848 epsilon, 1849 ); 1850 try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, -math.inf(f32))), splat(F32x4, math.pi), epsilon); 1851 try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, -math.inf(f32))), splat(F32x4, -math.pi), epsilon); 1852 try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, math.inf(f32))), splat(F32x4, 0.0), epsilon); 1853 try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, math.inf(f32))), splat(F32x4, -0.0), epsilon); 1854 try expectVecApproxEqAbs( 1855 atan2(splat(F32x4, math.inf(f32)), splat(F32x4, 2.0)), 1856 splat(F32x4, 0.5 * math.pi), 1857 epsilon, 1858 ); 1859 try expectVecApproxEqAbs( 1860 atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, 2.0)), 1861 splat(F32x4, -0.5 * math.pi), 1862 epsilon, 1863 ); 1864 try expectVecApproxEqAbs( 1865 atan2(splat(F32x4, math.inf(f32)), splat(F32x4, -math.inf(f32))), 1866 splat(F32x4, 0.75 * math.pi), 1867 epsilon, 1868 ); 1869 try expectVecApproxEqAbs( 1870 atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, -math.inf(f32))), 1871 splat(F32x4, -0.75 * math.pi), 1872 epsilon, 1873 ); 1874 try expectVecApproxEqAbs( 1875 atan2(splat(F32x4, math.inf(f32)), splat(F32x4, math.inf(f32))), 1876 splat(F32x4, 0.25 * math.pi), 1877 epsilon, 1878 ); 1879 try expectVecApproxEqAbs( 1880 atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, math.inf(f32))), 1881 splat(F32x4, -0.25 * math.pi), 1882 epsilon, 1883 ); 1884 try expectVecApproxEqAbs( 1885 atan2( 1886 f32x8(0.0, -math.inf(f32), -0.0, 2.0, math.inf(f32), math.inf(f32), 1.0, -math.inf(f32)), 1887 f32x8(-2.0, math.inf(f32), 1.0, 0.0, 10.0, -math.inf(f32), 1.0, -math.inf(f32)), 1888 ), 1889 f32x8( 1890 math.pi, 1891 -0.25 * math.pi, 1892 -0.0, 1893 0.5 * math.pi, 1894 0.5 * math.pi, 1895 0.75 * math.pi, 1896 math.atan(@as(f32, 1.0)), 1897 -0.75 * math.pi, 1898 ), 1899 epsilon, 1900 ); 1901 try expectVecApproxEqAbs(atan2(splat(F32x4, 0.0), splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon); 1902 try expectVecApproxEqAbs(atan2(splat(F32x4, -0.0), splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon); 1903 try expect(all(isNan(atan2(splat(F32x4, 1.0), splat(F32x4, math.nan(f32)))), 0) == true); 1904 try expect(all(isNan(atan2(splat(F32x4, -1.0), splat(F32x4, math.nan(f32)))), 0) == true); 1905 try expect(all(isNan(atan2(splat(F32x4, math.nan(f32)), splat(F32x4, -1.0))), 0) == true); 1906 try expect(all(isNan(atan2(splat(F32x4, -math.nan(f32)), splat(F32x4, 1.0))), 0) == true); 1907} 1908// ------------------------------------------------------------------------------ 1909// 1910// 3. 2D, 3D, 4D vector functions 1911// 1912// ------------------------------------------------------------------------------ 1913pub inline fn dot2(v0: Vec, v1: Vec) F32x4 { 1914 var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | -- | -- | 1915 const xmm1 = swizzle(xmm0, .y, .x, .x, .x); // | y0*y1 | -- | -- | -- | 1916 xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[3]); // | x0*x1 + y0*y1 | -- | -- | -- | 1917 return swizzle(xmm0, .x, .x, .x, .x); 1918} 1919test "zmath.dot2" { 1920 const v0 = f32x4(-1.0, 2.0, 300.0, -2.0); 1921 const v1 = f32x4(4.0, 5.0, 600.0, 2.0); 1922 const v = dot2(v0, v1); 1923 try expectVecApproxEqAbs(v, splat(F32x4, 6.0), 0.0001); 1924} 1925 1926pub inline fn dot3(v0: Vec, v1: Vec) F32x4 { 1927 const dot = v0 * v1; 1928 return f32x4s(dot[0] + dot[1] + dot[2]); 1929} 1930test "zmath.dot3" { 1931 const v0 = f32x4(-1.0, 2.0, 3.0, 1.0); 1932 const v1 = f32x4(4.0, 5.0, 6.0, 1.0); 1933 const v = dot3(v0, v1); 1934 try expectVecApproxEqAbs(v, splat(F32x4, 24.0), 0.0001); 1935} 1936 1937pub inline fn dot4(v0: Vec, v1: Vec) F32x4 { 1938 var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | z0*z1 | w0*w1 | 1939 var xmm1 = swizzle(xmm0, .y, .x, .w, .x); // | y0*y1 | -- | w0*w1 | -- | 1940 xmm1 = xmm0 + xmm1; // | x0*x1 + y0*y1 | -- | z0*z1 + w0*w1 | -- | 1941 xmm0 = swizzle(xmm1, .z, .x, .x, .x); // | z0*z1 + w0*w1 | -- | -- | -- | 1942 xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[2]); // addss 1943 return swizzle(xmm0, .x, .x, .x, .x); 1944} 1945test "zmath.dot4" { 1946 const v0 = f32x4(-1.0, 2.0, 3.0, -2.0); 1947 const v1 = f32x4(4.0, 5.0, 6.0, 2.0); 1948 const v = dot4(v0, v1); 1949 try expectVecApproxEqAbs(v, splat(F32x4, 20.0), 0.0001); 1950} 1951 1952pub inline fn cross3(v0: Vec, v1: Vec) Vec { 1953 var xmm0 = swizzle(v0, .y, .z, .x, .w); 1954 var xmm1 = swizzle(v1, .z, .x, .y, .w); 1955 var result = xmm0 * xmm1; 1956 xmm0 = swizzle(xmm0, .y, .z, .x, .w); 1957 xmm1 = swizzle(xmm1, .z, .x, .y, .w); 1958 result = result - xmm0 * xmm1; 1959 return andInt(result, f32x4_mask3); 1960} 1961test "zmath.cross3" { 1962 { 1963 const v0 = f32x4(1.0, 0.0, 0.0, 1.0); 1964 const v1 = f32x4(0.0, 1.0, 0.0, 1.0); 1965 const v = cross3(v0, v1); 1966 try expectVecApproxEqAbs(v, f32x4(0.0, 0.0, 1.0, 0.0), 0.0001); 1967 } 1968 { 1969 const v0 = f32x4(1.0, 0.0, 0.0, 1.0); 1970 const v1 = f32x4(0.0, -1.0, 0.0, 1.0); 1971 const v = cross3(v0, v1); 1972 try expectVecApproxEqAbs(v, f32x4(0.0, 0.0, -1.0, 0.0), 0.0001); 1973 } 1974 { 1975 const v0 = f32x4(-3.0, 0, -2.0, 1.0); 1976 const v1 = f32x4(5.0, -1.0, 2.0, 1.0); 1977 const v = cross3(v0, v1); 1978 try expectVecApproxEqAbs(v, f32x4(-2.0, -4.0, 3.0, 0.0), 0.0001); 1979 } 1980} 1981 1982pub inline fn lengthSq2(v: Vec) F32x4 { 1983 return dot2(v, v); 1984} 1985pub inline fn lengthSq3(v: Vec) F32x4 { 1986 return dot3(v, v); 1987} 1988pub inline fn lengthSq4(v: Vec) F32x4 { 1989 return dot4(v, v); 1990} 1991 1992pub inline fn length2(v: Vec) F32x4 { 1993 return sqrt(dot2(v, v)); 1994} 1995pub inline fn length3(v: Vec) F32x4 { 1996 return sqrt(dot3(v, v)); 1997} 1998pub inline fn length4(v: Vec) F32x4 { 1999 return sqrt(dot4(v, v)); 2000} 2001test "zmath.length3" { 2002 { 2003 const v = length3(f32x4(1.0, -2.0, 3.0, 1000.0)); 2004 try expectVecApproxEqAbs(v, splat(F32x4, math.sqrt(14.0)), 0.001); 2005 } 2006 { 2007 const v = length3(f32x4(1.0, math.nan(f32), math.nan(f32), 1000.0)); 2008 try expect(all(isNan(v), 0)); 2009 } 2010 { 2011 const v = length3(f32x4(1.0, math.inf(f32), 3.0, 1000.0)); 2012 try expect(all(isInf(v), 0)); 2013 } 2014 { 2015 const v = length3(f32x4(3.0, 2.0, 1.0, math.nan(f32))); 2016 try expectVecApproxEqAbs(v, splat(F32x4, math.sqrt(14.0)), 0.001); 2017 } 2018} 2019 2020pub inline fn normalize2(v: Vec) Vec { 2021 return v * splat(F32x4, 1.0) / sqrt(dot2(v, v)); 2022} 2023pub inline fn normalize3(v: Vec) Vec { 2024 return v * splat(F32x4, 1.0) / sqrt(dot3(v, v)); 2025} 2026pub inline fn normalize4(v: Vec) Vec { 2027 return v * splat(F32x4, 1.0) / sqrt(dot4(v, v)); 2028} 2029test "zmath.normalize3" { 2030 { 2031 const v0 = f32x4(1.0, -2.0, 3.0, 1000.0); 2032 const v = normalize3(v0); 2033 try expectVecApproxEqAbs(v, v0 * splat(F32x4, 1.0 / math.sqrt(14.0)), 0.0005); 2034 } 2035 { 2036 try expect(any(isNan(normalize3(f32x4(1.0, math.inf(f32), 1.0, 1.0))), 0)); 2037 try expect(any(isNan(normalize3(f32x4(-math.inf(f32), math.inf(f32), 0.0, 0.0))), 0)); 2038 try expect(any(isNan(normalize3(f32x4(-math.nan(f32), math.snan(f32), 0.0, 0.0))), 0)); 2039 try expect(any(isNan(normalize3(f32x4(0, 0, 0, 0))), 0)); 2040 } 2041} 2042test "zmath.normalize4" { 2043 { 2044 const v0 = f32x4(1.0, -2.0, 3.0, 10.0); 2045 const v = normalize4(v0); 2046 try expectVecApproxEqAbs(v, v0 * splat(F32x4, 1.0 / math.sqrt(114.0)), 0.0005); 2047 } 2048 { 2049 try expect(any(isNan(normalize4(f32x4(1.0, math.inf(f32), 1.0, 1.0))), 0)); 2050 try expect(any(isNan(normalize4(f32x4(-math.inf(f32), math.inf(f32), 0.0, 0.0))), 0)); 2051 try expect(any(isNan(normalize4(f32x4(-math.nan(f32), math.snan(f32), 0.0, 0.0))), 0)); 2052 try expect(any(isNan(normalize4(f32x4(0, 0, 0, 0))), 0)); 2053 } 2054} 2055 2056fn vecMulMat(v: Vec, m: Mat) Vec { 2057 const vx = @shuffle(f32, v, undefined, [4]i32{ 0, 0, 0, 0 }); 2058 const vy = @shuffle(f32, v, undefined, [4]i32{ 1, 1, 1, 1 }); 2059 const vz = @shuffle(f32, v, undefined, [4]i32{ 2, 2, 2, 2 }); 2060 const vw = @shuffle(f32, v, undefined, [4]i32{ 3, 3, 3, 3 }); 2061 return vx * m[0] + vy * m[1] + vz * m[2] + vw * m[3]; 2062} 2063fn matMulVec(m: Mat, v: Vec) Vec { 2064 return .{ dot4(m[0], v)[0], dot4(m[1], v)[0], dot4(m[2], v)[0], dot4(m[3], v)[0] }; 2065} 2066test "zmath.vecMulMat" { 2067 const m = Mat{ 2068 f32x4(1.0, 0.0, 0.0, 0.0), 2069 f32x4(0.0, 1.0, 0.0, 0.0), 2070 f32x4(0.0, 0.0, 1.0, 0.0), 2071 f32x4(2.0, 3.0, 4.0, 1.0), 2072 }; 2073 const vm = mul(f32x4(1.0, 2.0, 3.0, 1.0), m); 2074 const mv = mul(m, f32x4(1.0, 2.0, 3.0, 1.0)); 2075 const v = mul(transpose(m), f32x4(1.0, 2.0, 3.0, 1.0)); 2076 try expectVecApproxEqAbs(vm, f32x4(3.0, 5.0, 7.0, 1.0), 0.0001); 2077 try expectVecApproxEqAbs(mv, f32x4(1.0, 2.0, 3.0, 21.0), 0.0001); 2078 try expectVecApproxEqAbs(v, f32x4(3.0, 5.0, 7.0, 1.0), 0.0001); 2079} 2080// ------------------------------------------------------------------------------ 2081// 2082// 4. Matrix functions 2083// 2084// ------------------------------------------------------------------------------ 2085pub fn identity() Mat { 2086 const static = struct { 2087 const identity = Mat{ 2088 f32x4(1.0, 0.0, 0.0, 0.0), 2089 f32x4(0.0, 1.0, 0.0, 0.0), 2090 f32x4(0.0, 0.0, 1.0, 0.0), 2091 f32x4(0.0, 0.0, 0.0, 1.0), 2092 }; 2093 }; 2094 return static.identity; 2095} 2096 2097pub fn matFromArr(arr: [16]f32) Mat { 2098 return Mat{ 2099 f32x4(arr[0], arr[1], arr[2], arr[3]), 2100 f32x4(arr[4], arr[5], arr[6], arr[7]), 2101 f32x4(arr[8], arr[9], arr[10], arr[11]), 2102 f32x4(arr[12], arr[13], arr[14], arr[15]), 2103 }; 2104} 2105 2106fn mulRetType(comptime Ta: type, comptime Tb: type) type { 2107 if (Ta == Mat and Tb == Mat) { 2108 return Mat; 2109 } else if ((Ta == f32 and Tb == Mat) or (Ta == Mat and Tb == f32)) { 2110 return Mat; 2111 } else if ((Ta == Vec and Tb == Mat) or (Ta == Mat and Tb == Vec)) { 2112 return Vec; 2113 } 2114 @compileError("zmath.mul() not implemented for types: " ++ @typeName(Ta) ++ @typeName(Tb)); 2115} 2116 2117pub fn mul(a: anytype, b: anytype) mulRetType(@TypeOf(a), @TypeOf(b)) { 2118 const Ta = @TypeOf(a); 2119 const Tb = @TypeOf(b); 2120 if (Ta == Mat and Tb == Mat) { 2121 return mulMat(a, b); 2122 } else if (Ta == f32 and Tb == Mat) { 2123 const va = splat(F32x4, a); 2124 return Mat{ va * b[0], va * b[1], va * b[2], va * b[3] }; 2125 } else if (Ta == Mat and Tb == f32) { 2126 const vb = splat(F32x4, b); 2127 return Mat{ a[0] * vb, a[1] * vb, a[2] * vb, a[3] * vb }; 2128 } else if (Ta == Vec and Tb == Mat) { 2129 return vecMulMat(a, b); 2130 } else if (Ta == Mat and Tb == Vec) { 2131 return matMulVec(a, b); 2132 } else { 2133 @compileError("zmath.mul() not implemented for types: " ++ @typeName(Ta) ++ ", " ++ @typeName(Tb)); 2134 } 2135} 2136test "zmath.mul" { 2137 { 2138 const m = Mat{ 2139 f32x4(0.1, 0.2, 0.3, 0.4), 2140 f32x4(0.5, 0.6, 0.7, 0.8), 2141 f32x4(0.9, 1.0, 1.1, 1.2), 2142 f32x4(1.3, 1.4, 1.5, 1.6), 2143 }; 2144 const ms = mul(@as(f32, 2.0), m); 2145 try expectVecApproxEqAbs(ms[0], f32x4(0.2, 0.4, 0.6, 0.8), 0.0001); 2146 try expectVecApproxEqAbs(ms[1], f32x4(1.0, 1.2, 1.4, 1.6), 0.0001); 2147 try expectVecApproxEqAbs(ms[2], f32x4(1.8, 2.0, 2.2, 2.4), 0.0001); 2148 try expectVecApproxEqAbs(ms[3], f32x4(2.6, 2.8, 3.0, 3.2), 0.0001); 2149 } 2150} 2151 2152fn mulMat(m0: Mat, m1: Mat) Mat { 2153 var result: Mat = undefined; 2154 comptime var row: u32 = 0; 2155 inline while (row < 4) : (row += 1) { 2156 const vx = swizzle(m0[row], .x, .x, .x, .x); 2157 const vy = swizzle(m0[row], .y, .y, .y, .y); 2158 const vz = swizzle(m0[row], .z, .z, .z, .z); 2159 const vw = swizzle(m0[row], .w, .w, .w, .w); 2160 result[row] = mulAdd(vx, m1[0], vz * m1[2]) + mulAdd(vy, m1[1], vw * m1[3]); 2161 } 2162 return result; 2163} 2164test "zmath.matrix.mul" { 2165 const a = Mat{ 2166 f32x4(0.1, 0.2, 0.3, 0.4), 2167 f32x4(0.5, 0.6, 0.7, 0.8), 2168 f32x4(0.9, 1.0, 1.1, 1.2), 2169 f32x4(1.3, 1.4, 1.5, 1.6), 2170 }; 2171 const b = Mat{ 2172 f32x4(1.7, 1.8, 1.9, 2.0), 2173 f32x4(2.1, 2.2, 2.3, 2.4), 2174 f32x4(2.5, 2.6, 2.7, 2.8), 2175 f32x4(2.9, 3.0, 3.1, 3.2), 2176 }; 2177 const c = mul(a, b); 2178 try expectVecApproxEqAbs(c[0], f32x4(2.5, 2.6, 2.7, 2.8), 0.0001); 2179 try expectVecApproxEqAbs(c[1], f32x4(6.18, 6.44, 6.7, 6.96), 0.0001); 2180 try expectVecApproxEqAbs(c[2], f32x4(9.86, 10.28, 10.7, 11.12), 0.0001); 2181 try expectVecApproxEqAbs(c[3], f32x4(13.54, 14.12, 14.7, 15.28), 0.0001); 2182} 2183 2184pub fn transpose(m: Mat) Mat { 2185 const temp1 = @shuffle(f32, m[0], m[1], [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 1) }); 2186 const temp3 = @shuffle(f32, m[0], m[1], [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); 2187 const temp2 = @shuffle(f32, m[2], m[3], [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 1) }); 2188 const temp4 = @shuffle(f32, m[2], m[3], [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); 2189 return .{ 2190 @shuffle(f32, temp1, temp2, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }), 2191 @shuffle(f32, temp1, temp2, [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }), 2192 @shuffle(f32, temp3, temp4, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }), 2193 @shuffle(f32, temp3, temp4, [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }), 2194 }; 2195} 2196test "zmath.matrix.transpose" { 2197 const m = Mat{ 2198 f32x4(1.0, 2.0, 3.0, 4.0), 2199 f32x4(5.0, 6.0, 7.0, 8.0), 2200 f32x4(9.0, 10.0, 11.0, 12.0), 2201 f32x4(13.0, 14.0, 15.0, 16.0), 2202 }; 2203 const mt = transpose(m); 2204 try expectVecApproxEqAbs(mt[0], f32x4(1.0, 5.0, 9.0, 13.0), 0.0001); 2205 try expectVecApproxEqAbs(mt[1], f32x4(2.0, 6.0, 10.0, 14.0), 0.0001); 2206 try expectVecApproxEqAbs(mt[2], f32x4(3.0, 7.0, 11.0, 15.0), 0.0001); 2207 try expectVecApproxEqAbs(mt[3], f32x4(4.0, 8.0, 12.0, 16.0), 0.0001); 2208} 2209 2210pub fn rotationX(angle: f32) Mat { 2211 const sc = sincos(angle); 2212 return .{ 2213 f32x4(1.0, 0.0, 0.0, 0.0), 2214 f32x4(0.0, sc[1], sc[0], 0.0), 2215 f32x4(0.0, -sc[0], sc[1], 0.0), 2216 f32x4(0.0, 0.0, 0.0, 1.0), 2217 }; 2218} 2219 2220pub fn rotationY(angle: f32) Mat { 2221 const sc = sincos(angle); 2222 return .{ 2223 f32x4(sc[1], 0.0, -sc[0], 0.0), 2224 f32x4(0.0, 1.0, 0.0, 0.0), 2225 f32x4(sc[0], 0.0, sc[1], 0.0), 2226 f32x4(0.0, 0.0, 0.0, 1.0), 2227 }; 2228} 2229 2230pub fn rotationZ(angle: f32) Mat { 2231 const sc = sincos(angle); 2232 return .{ 2233 f32x4(sc[1], sc[0], 0.0, 0.0), 2234 f32x4(-sc[0], sc[1], 0.0, 0.0), 2235 f32x4(0.0, 0.0, 1.0, 0.0), 2236 f32x4(0.0, 0.0, 0.0, 1.0), 2237 }; 2238} 2239 2240pub fn translation(x: f32, y: f32, z: f32) Mat { 2241 return .{ 2242 f32x4(1.0, 0.0, 0.0, 0.0), 2243 f32x4(0.0, 1.0, 0.0, 0.0), 2244 f32x4(0.0, 0.0, 1.0, 0.0), 2245 f32x4(x, y, z, 1.0), 2246 }; 2247} 2248pub fn translationV(v: Vec) Mat { 2249 return translation(v[0], v[1], v[2]); 2250} 2251 2252pub fn scaling(x: f32, y: f32, z: f32) Mat { 2253 return .{ 2254 f32x4(x, 0.0, 0.0, 0.0), 2255 f32x4(0.0, y, 0.0, 0.0), 2256 f32x4(0.0, 0.0, z, 0.0), 2257 f32x4(0.0, 0.0, 0.0, 1.0), 2258 }; 2259} 2260pub fn scalingV(v: Vec) Mat { 2261 return scaling(v[0], v[1], v[2]); 2262} 2263 2264pub fn lookToLh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat { 2265 const az = normalize3(eyedir); 2266 const ax = normalize3(cross3(updir, az)); 2267 const ay = normalize3(cross3(az, ax)); 2268 return .{ 2269 f32x4(ax[0], ay[0], az[0], 0), 2270 f32x4(ax[1], ay[1], az[1], 0), 2271 f32x4(ax[2], ay[2], az[2], 0), 2272 f32x4(-dot3(ax, eyepos)[0], -dot3(ay, eyepos)[0], -dot3(az, eyepos)[0], 1.0), 2273 }; 2274} 2275pub fn lookToRh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat { 2276 return lookToLh(eyepos, -eyedir, updir); 2277} 2278pub fn lookAtLh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat { 2279 return lookToLh(eyepos, focuspos - eyepos, updir); 2280} 2281pub fn lookAtRh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat { 2282 return lookToLh(eyepos, eyepos - focuspos, updir); 2283} 2284test "zmath.matrix.lookToLh" { 2285 const m = lookToLh(f32x4(0.0, 0.0, -3.0, 1.0), f32x4(0.0, 0.0, 1.0, 0.0), f32x4(0.0, 1.0, 0.0, 0.0)); 2286 try expectVecApproxEqAbs(m[0], f32x4(1.0, 0.0, 0.0, 0.0), 0.001); 2287 try expectVecApproxEqAbs(m[1], f32x4(0.0, 1.0, 0.0, 0.0), 0.001); 2288 try expectVecApproxEqAbs(m[2], f32x4(0.0, 0.0, 1.0, 0.0), 0.001); 2289 try expectVecApproxEqAbs(m[3], f32x4(0.0, 0.0, 3.0, 1.0), 0.001); 2290} 2291 2292pub fn perspectiveFovLh(fovy: f32, aspect: f32, near: f32, far: f32) Mat { 2293 const scfov = sincos(0.5 * fovy); 2294 2295 assert(near > 0.0 and far > 0.0); 2296 assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); 2297 assert(!math.approxEqAbs(f32, far, near, 0.001)); 2298 assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); 2299 2300 const h = scfov[1] / scfov[0]; 2301 const w = h / aspect; 2302 const r = far / (far - near); 2303 return .{ 2304 f32x4(w, 0.0, 0.0, 0.0), 2305 f32x4(0.0, h, 0.0, 0.0), 2306 f32x4(0.0, 0.0, r, 1.0), 2307 f32x4(0.0, 0.0, -r * near, 0.0), 2308 }; 2309} 2310pub fn perspectiveFovRh(fovy: f32, aspect: f32, near: f32, far: f32) Mat { 2311 const scfov = sincos(0.5 * fovy); 2312 2313 assert(near > 0.0 and far > 0.0); 2314 assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); 2315 assert(!math.approxEqAbs(f32, far, near, 0.001)); 2316 assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); 2317 2318 const h = scfov[1] / scfov[0]; 2319 const w = h / aspect; 2320 const r = far / (near - far); 2321 return .{ 2322 f32x4(w, 0.0, 0.0, 0.0), 2323 f32x4(0.0, h, 0.0, 0.0), 2324 f32x4(0.0, 0.0, r, -1.0), 2325 f32x4(0.0, 0.0, r * near, 0.0), 2326 }; 2327} 2328 2329// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) 2330pub fn perspectiveFovLhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat { 2331 const scfov = sincos(0.5 * fovy); 2332 2333 assert(near > 0.0 and far > 0.0); 2334 assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); 2335 assert(!math.approxEqAbs(f32, far, near, 0.001)); 2336 assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); 2337 2338 const h = scfov[1] / scfov[0]; 2339 const w = h / aspect; 2340 const r = far - near; 2341 return .{ 2342 f32x4(w, 0.0, 0.0, 0.0), 2343 f32x4(0.0, h, 0.0, 0.0), 2344 f32x4(0.0, 0.0, (near + far) / r, 1.0), 2345 f32x4(0.0, 0.0, 2.0 * near * far / -r, 0.0), 2346 }; 2347} 2348 2349// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) 2350pub fn perspectiveFovRhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat { 2351 const scfov = sincos(0.5 * fovy); 2352 2353 assert(near > 0.0 and far > 0.0); 2354 assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); 2355 assert(!math.approxEqAbs(f32, far, near, 0.001)); 2356 assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); 2357 2358 const h = scfov[1] / scfov[0]; 2359 const w = h / aspect; 2360 const r = near - far; 2361 return .{ 2362 f32x4(w, 0.0, 0.0, 0.0), 2363 f32x4(0.0, h, 0.0, 0.0), 2364 f32x4(0.0, 0.0, (near + far) / r, -1.0), 2365 f32x4(0.0, 0.0, 2.0 * near * far / r, 0.0), 2366 }; 2367} 2368 2369pub fn orthographicLh(w: f32, h: f32, near: f32, far: f32) Mat { 2370 assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); 2371 assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); 2372 assert(!math.approxEqAbs(f32, far, near, 0.001)); 2373 2374 const r = 1 / (far - near); 2375 return .{ 2376 f32x4(2 / w, 0.0, 0.0, 0.0), 2377 f32x4(0.0, 2 / h, 0.0, 0.0), 2378 f32x4(0.0, 0.0, r, 0.0), 2379 f32x4(0.0, 0.0, -r * near, 1.0), 2380 }; 2381} 2382 2383pub fn orthographicRh(w: f32, h: f32, near: f32, far: f32) Mat { 2384 assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); 2385 assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); 2386 assert(!math.approxEqAbs(f32, far, near, 0.001)); 2387 2388 const r = 1 / (near - far); 2389 return .{ 2390 f32x4(2 / w, 0.0, 0.0, 0.0), 2391 f32x4(0.0, 2 / h, 0.0, 0.0), 2392 f32x4(0.0, 0.0, r, 0.0), 2393 f32x4(0.0, 0.0, r * near, 1.0), 2394 }; 2395} 2396 2397// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) 2398pub fn orthographicLhGl(w: f32, h: f32, near: f32, far: f32) Mat { 2399 assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); 2400 assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); 2401 assert(!math.approxEqAbs(f32, far, near, 0.001)); 2402 2403 const r = far - near; 2404 return .{ 2405 f32x4(2 / w, 0.0, 0.0, 0.0), 2406 f32x4(0.0, 2 / h, 0.0, 0.0), 2407 f32x4(0.0, 0.0, 2 / r, 0.0), 2408 f32x4(0.0, 0.0, (near + far) / -r, 1.0), 2409 }; 2410} 2411 2412// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) 2413pub fn orthographicRhGl(w: f32, h: f32, near: f32, far: f32) Mat { 2414 assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); 2415 assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); 2416 assert(!math.approxEqAbs(f32, far, near, 0.001)); 2417 2418 const r = near - far; 2419 return .{ 2420 f32x4(2 / w, 0.0, 0.0, 0.0), 2421 f32x4(0.0, 2 / h, 0.0, 0.0), 2422 f32x4(0.0, 0.0, 2 / r, 0.0), 2423 f32x4(0.0, 0.0, (near + far) / r, 1.0), 2424 }; 2425} 2426 2427pub fn orthographicOffCenterLh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { 2428 assert(!math.approxEqAbs(f32, far, near, 0.001)); 2429 2430 const r = 1 / (far - near); 2431 return .{ 2432 f32x4(2 / (right - left), 0.0, 0.0, 0.0), 2433 f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), 2434 f32x4(0.0, 0.0, r, 0.0), 2435 f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), -r * near, 1.0), 2436 }; 2437} 2438 2439pub fn orthographicOffCenterRh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { 2440 assert(!math.approxEqAbs(f32, far, near, 0.001)); 2441 2442 const r = 1 / (near - far); 2443 return .{ 2444 f32x4(2 / (right - left), 0.0, 0.0, 0.0), 2445 f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), 2446 f32x4(0.0, 0.0, r, 0.0), 2447 f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), r * near, 1.0), 2448 }; 2449} 2450 2451// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) 2452pub fn orthographicOffCenterLhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { 2453 assert(!math.approxEqAbs(f32, far, near, 0.001)); 2454 2455 const r = far - near; 2456 return .{ 2457 f32x4(2 / (right - left), 0.0, 0.0, 0.0), 2458 f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), 2459 f32x4(0.0, 0.0, 2 / r, 0.0), 2460 f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), (near + far) / -r, 1.0), 2461 }; 2462} 2463 2464// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) 2465pub fn orthographicOffCenterRhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { 2466 assert(!math.approxEqAbs(f32, far, near, 0.001)); 2467 2468 const r = near - far; 2469 return .{ 2470 f32x4(2 / (right - left), 0.0, 0.0, 0.0), 2471 f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), 2472 f32x4(0.0, 0.0, 2 / r, 0.0), 2473 f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), (near + far) / r, 1.0), 2474 }; 2475} 2476 2477pub fn determinant(m: Mat) F32x4 { 2478 var v0 = swizzle(m[2], .y, .x, .x, .x); 2479 var v1 = swizzle(m[3], .z, .z, .y, .y); 2480 var v2 = swizzle(m[2], .y, .x, .x, .x); 2481 var v3 = swizzle(m[3], .w, .w, .w, .z); 2482 var v4 = swizzle(m[2], .z, .z, .y, .y); 2483 var v5 = swizzle(m[3], .w, .w, .w, .z); 2484 2485 var p0 = v0 * v1; 2486 var p1 = v2 * v3; 2487 var p2 = v4 * v5; 2488 2489 v0 = swizzle(m[2], .z, .z, .y, .y); 2490 v1 = swizzle(m[3], .y, .x, .x, .x); 2491 v2 = swizzle(m[2], .w, .w, .w, .z); 2492 v3 = swizzle(m[3], .y, .x, .x, .x); 2493 v4 = swizzle(m[2], .w, .w, .w, .z); 2494 v5 = swizzle(m[3], .z, .z, .y, .y); 2495 2496 p0 = mulAdd(-v0, v1, p0); 2497 p1 = mulAdd(-v2, v3, p1); 2498 p2 = mulAdd(-v4, v5, p2); 2499 2500 v0 = swizzle(m[1], .w, .w, .w, .z); 2501 v1 = swizzle(m[1], .z, .z, .y, .y); 2502 v2 = swizzle(m[1], .y, .x, .x, .x); 2503 2504 const s = m[0] * f32x4(1.0, -1.0, 1.0, -1.0); 2505 var r = v0 * p0; 2506 r = mulAdd(-v1, p1, r); 2507 r = mulAdd(v2, p2, r); 2508 return dot4(s, r); 2509} 2510test "zmath.matrix.determinant" { 2511 const m = Mat{ 2512 f32x4(10.0, -9.0, -12.0, 1.0), 2513 f32x4(7.0, -12.0, 11.0, 1.0), 2514 f32x4(-10.0, 10.0, 3.0, 1.0), 2515 f32x4(1.0, 2.0, 3.0, 4.0), 2516 }; 2517 try expectVecApproxEqAbs(determinant(m), splat(F32x4, 2939.0), 0.0001); 2518} 2519 2520pub fn inverse(a: anytype) @TypeOf(a) { 2521 const T = @TypeOf(a); 2522 return switch (T) { 2523 Mat => inverseMat(a), 2524 Quat => inverseQuat(a), 2525 else => @compileError("zmath.inverse() not implemented for " ++ @typeName(T)), 2526 }; 2527} 2528 2529fn inverseMat(m: Mat) Mat { 2530 return inverseDet(m, null); 2531} 2532 2533pub fn inverseDet(m: Mat, out_det: ?*F32x4) Mat { 2534 const mt = transpose(m); 2535 var v0: [4]F32x4 = undefined; 2536 var v1: [4]F32x4 = undefined; 2537 2538 v0[0] = swizzle(mt[2], .x, .x, .y, .y); 2539 v1[0] = swizzle(mt[3], .z, .w, .z, .w); 2540 v0[1] = swizzle(mt[0], .x, .x, .y, .y); 2541 v1[1] = swizzle(mt[1], .z, .w, .z, .w); 2542 v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); 2543 v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); 2544 2545 var d0 = v0[0] * v1[0]; 2546 var d1 = v0[1] * v1[1]; 2547 var d2 = v0[2] * v1[2]; 2548 2549 v0[0] = swizzle(mt[2], .z, .w, .z, .w); 2550 v1[0] = swizzle(mt[3], .x, .x, .y, .y); 2551 v0[1] = swizzle(mt[0], .z, .w, .z, .w); 2552 v1[1] = swizzle(mt[1], .x, .x, .y, .y); 2553 v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); 2554 v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); 2555 2556 d0 = mulAdd(-v0[0], v1[0], d0); 2557 d1 = mulAdd(-v0[1], v1[1], d1); 2558 d2 = mulAdd(-v0[2], v1[2], d2); 2559 2560 v0[0] = swizzle(mt[1], .y, .z, .x, .y); 2561 v1[0] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 1, 3, 0 }); 2562 v0[1] = swizzle(mt[0], .z, .x, .y, .x); 2563 v1[1] = @shuffle(f32, d0, d2, [4]i32{ 3, ~@as(i32, 1), 1, 2 }); 2564 v0[2] = swizzle(mt[3], .y, .z, .x, .y); 2565 v1[2] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 1, 3, 0 }); 2566 v0[3] = swizzle(mt[2], .z, .x, .y, .x); 2567 v1[3] = @shuffle(f32, d1, d2, [4]i32{ 3, ~@as(i32, 3), 1, 2 }); 2568 2569 var c0 = v0[0] * v1[0]; 2570 var c2 = v0[1] * v1[1]; 2571 var c4 = v0[2] * v1[2]; 2572 var c6 = v0[3] * v1[3]; 2573 2574 v0[0] = swizzle(mt[1], .z, .w, .y, .z); 2575 v1[0] = @shuffle(f32, d0, d2, [4]i32{ 3, 0, 1, ~@as(i32, 0) }); 2576 v0[1] = swizzle(mt[0], .w, .z, .w, .y); 2577 v1[1] = @shuffle(f32, d0, d2, [4]i32{ 2, 1, ~@as(i32, 0), 0 }); 2578 v0[2] = swizzle(mt[3], .z, .w, .y, .z); 2579 v1[2] = @shuffle(f32, d1, d2, [4]i32{ 3, 0, 1, ~@as(i32, 2) }); 2580 v0[3] = swizzle(mt[2], .w, .z, .w, .y); 2581 v1[3] = @shuffle(f32, d1, d2, [4]i32{ 2, 1, ~@as(i32, 2), 0 }); 2582 2583 c0 = mulAdd(-v0[0], v1[0], c0); 2584 c2 = mulAdd(-v0[1], v1[1], c2); 2585 c4 = mulAdd(-v0[2], v1[2], c4); 2586 c6 = mulAdd(-v0[3], v1[3], c6); 2587 2588 v0[0] = swizzle(mt[1], .w, .x, .w, .x); 2589 v1[0] = @shuffle(f32, d0, d2, [4]i32{ 2, ~@as(i32, 1), ~@as(i32, 0), 2 }); 2590 v0[1] = swizzle(mt[0], .y, .w, .x, .z); 2591 v1[1] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 0, 3, ~@as(i32, 0) }); 2592 v0[2] = swizzle(mt[3], .w, .x, .w, .x); 2593 v1[2] = @shuffle(f32, d1, d2, [4]i32{ 2, ~@as(i32, 3), ~@as(i32, 2), 2 }); 2594 v0[3] = swizzle(mt[2], .y, .w, .x, .z); 2595 v1[3] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 0, 3, ~@as(i32, 2) }); 2596 2597 const c1 = mulAdd(-v0[0], v1[0], c0); 2598 const c3 = mulAdd(v0[1], v1[1], c2); 2599 const c5 = mulAdd(-v0[2], v1[2], c4); 2600 const c7 = mulAdd(v0[3], v1[3], c6); 2601 2602 c0 = mulAdd(v0[0], v1[0], c0); 2603 c2 = mulAdd(-v0[1], v1[1], c2); 2604 c4 = mulAdd(v0[2], v1[2], c4); 2605 c6 = mulAdd(-v0[3], v1[3], c6); 2606 2607 var mr = Mat{ 2608 f32x4(c0[0], c1[1], c0[2], c1[3]), 2609 f32x4(c2[0], c3[1], c2[2], c3[3]), 2610 f32x4(c4[0], c5[1], c4[2], c5[3]), 2611 f32x4(c6[0], c7[1], c6[2], c7[3]), 2612 }; 2613 2614 const det = dot4(mr[0], mt[0]); 2615 if (out_det != null) { 2616 out_det.?.* = det; 2617 } 2618 2619 if (math.approxEqAbs(f32, det[0], 0.0, math.floatEps(f32))) { 2620 return .{ 2621 f32x4(0.0, 0.0, 0.0, 0.0), 2622 f32x4(0.0, 0.0, 0.0, 0.0), 2623 f32x4(0.0, 0.0, 0.0, 0.0), 2624 f32x4(0.0, 0.0, 0.0, 0.0), 2625 }; 2626 } 2627 2628 const scale = splat(F32x4, 1.0) / det; 2629 mr[0] *= scale; 2630 mr[1] *= scale; 2631 mr[2] *= scale; 2632 mr[3] *= scale; 2633 return mr; 2634} 2635test "zmath.matrix.inverse" { 2636 const m = Mat{ 2637 f32x4(10.0, -9.0, -12.0, 1.0), 2638 f32x4(7.0, -12.0, 11.0, 1.0), 2639 f32x4(-10.0, 10.0, 3.0, 1.0), 2640 f32x4(1.0, 2.0, 3.0, 4.0), 2641 }; 2642 var det: F32x4 = undefined; 2643 const mi = inverseDet(m, &det); 2644 try expectVecApproxEqAbs(det, splat(F32x4, 2939.0), 0.0001); 2645 2646 try expectVecApproxEqAbs(mi[0], f32x4(-0.170806, -0.13576, -0.349439, 0.164001), 0.0001); 2647 try expectVecApproxEqAbs(mi[1], f32x4(-0.163661, -0.14801, -0.253147, 0.141204), 0.0001); 2648 try expectVecApproxEqAbs(mi[2], f32x4(-0.0871045, 0.00646478, -0.0785982, 0.0398095), 0.0001); 2649 try expectVecApproxEqAbs(mi[3], f32x4(0.18986, 0.103096, 0.272882, 0.10854), 0.0001); 2650} 2651 2652pub fn matFromNormAxisAngle(axis: Vec, angle: f32) Mat { 2653 const sincos_angle = sincos(angle); 2654 2655 const c2 = splat(F32x4, 1.0 - sincos_angle[1]); 2656 const c1 = splat(F32x4, sincos_angle[1]); 2657 const c0 = splat(F32x4, sincos_angle[0]); 2658 2659 const n0 = swizzle(axis, .y, .z, .x, .w); 2660 const n1 = swizzle(axis, .z, .x, .y, .w); 2661 2662 var v0 = c2 * n0 * n1; 2663 const r0 = c2 * axis * axis + c1; 2664 const r1 = c0 * axis + v0; 2665 var r2 = v0 - c0 * axis; 2666 2667 v0 = andInt(r0, f32x4_mask3); 2668 2669 var v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 2, ~@as(i32, 1), ~@as(i32, 2) }); 2670 v1 = swizzle(v1, .y, .z, .w, .x); 2671 2672 var v2 = @shuffle(f32, r1, r2, [4]i32{ 1, 1, ~@as(i32, 0), ~@as(i32, 0) }); 2673 v2 = swizzle(v2, .x, .z, .x, .z); 2674 2675 r2 = @shuffle(f32, v0, v1, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) }); 2676 r2 = swizzle(r2, .x, .z, .w, .y); 2677 2678 var m: Mat = undefined; 2679 m[0] = r2; 2680 2681 r2 = @shuffle(f32, v0, v1, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) }); 2682 r2 = swizzle(r2, .z, .x, .w, .y); 2683 m[1] = r2; 2684 2685 v2 = @shuffle(f32, v2, v0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) }); 2686 m[2] = v2; 2687 m[3] = f32x4(0.0, 0.0, 0.0, 1.0); 2688 return m; 2689} 2690pub fn matFromAxisAngle(axis: Vec, angle: f32) Mat { 2691 assert(!all(axis == splat(F32x4, 0.0), 3)); 2692 assert(!all(isInf(axis), 3)); 2693 const normal = normalize3(axis); 2694 return matFromNormAxisAngle(normal, angle); 2695} 2696test "zmath.matrix.matFromAxisAngle" { 2697 { 2698 const m0 = matFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), math.pi * 0.25); 2699 const m1 = rotationX(math.pi * 0.25); 2700 try expectVecApproxEqAbs(m0[0], m1[0], 0.001); 2701 try expectVecApproxEqAbs(m0[1], m1[1], 0.001); 2702 try expectVecApproxEqAbs(m0[2], m1[2], 0.001); 2703 try expectVecApproxEqAbs(m0[3], m1[3], 0.001); 2704 } 2705 { 2706 const m0 = matFromAxisAngle(f32x4(0.0, 1.0, 0.0, 0.0), math.pi * 0.125); 2707 const m1 = rotationY(math.pi * 0.125); 2708 try expectVecApproxEqAbs(m0[0], m1[0], 0.001); 2709 try expectVecApproxEqAbs(m0[1], m1[1], 0.001); 2710 try expectVecApproxEqAbs(m0[2], m1[2], 0.001); 2711 try expectVecApproxEqAbs(m0[3], m1[3], 0.001); 2712 } 2713 { 2714 const m0 = matFromAxisAngle(f32x4(0.0, 0.0, 1.0, 0.0), math.pi * 0.333); 2715 const m1 = rotationZ(math.pi * 0.333); 2716 try expectVecApproxEqAbs(m0[0], m1[0], 0.001); 2717 try expectVecApproxEqAbs(m0[1], m1[1], 0.001); 2718 try expectVecApproxEqAbs(m0[2], m1[2], 0.001); 2719 try expectVecApproxEqAbs(m0[3], m1[3], 0.001); 2720 } 2721} 2722 2723pub fn matFromQuat(quat: Quat) Mat { 2724 const q0 = quat + quat; 2725 var q1 = quat * q0; 2726 2727 var v0 = swizzle(q1, .y, .x, .x, .w); 2728 v0 = andInt(v0, f32x4_mask3); 2729 2730 var v1 = swizzle(q1, .z, .z, .y, .w); 2731 v1 = andInt(v1, f32x4_mask3); 2732 2733 const r0 = (f32x4(1.0, 1.0, 1.0, 0.0) - v0) - v1; 2734 2735 v0 = swizzle(quat, .x, .x, .y, .w); 2736 v1 = swizzle(q0, .z, .y, .z, .w); 2737 v0 = v0 * v1; 2738 2739 v1 = swizzle(quat, .w, .w, .w, .w); 2740 const v2 = swizzle(q0, .y, .z, .x, .w); 2741 v1 = v1 * v2; 2742 2743 const r1 = v0 + v1; 2744 const r2 = v0 - v1; 2745 2746 v0 = @shuffle(f32, r1, r2, [4]i32{ 1, 2, ~@as(i32, 0), ~@as(i32, 1) }); 2747 v0 = swizzle(v0, .x, .z, .w, .y); 2748 v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 2), ~@as(i32, 2) }); 2749 v1 = swizzle(v1, .x, .z, .x, .z); 2750 2751 q1 = @shuffle(f32, r0, v0, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) }); 2752 q1 = swizzle(q1, .x, .z, .w, .y); 2753 2754 var m: Mat = undefined; 2755 m[0] = q1; 2756 2757 q1 = @shuffle(f32, r0, v0, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) }); 2758 q1 = swizzle(q1, .z, .x, .w, .y); 2759 m[1] = q1; 2760 2761 q1 = @shuffle(f32, v1, r0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) }); 2762 m[2] = q1; 2763 m[3] = f32x4(0.0, 0.0, 0.0, 1.0); 2764 return m; 2765} 2766test "zmath.matrix.matFromQuat" { 2767 { 2768 const m = matFromQuat(f32x4(0.0, 0.0, 0.0, 1.0)); 2769 try expectVecApproxEqAbs(m[0], f32x4(1.0, 0.0, 0.0, 0.0), 0.0001); 2770 try expectVecApproxEqAbs(m[1], f32x4(0.0, 1.0, 0.0, 0.0), 0.0001); 2771 try expectVecApproxEqAbs(m[2], f32x4(0.0, 0.0, 1.0, 0.0), 0.0001); 2772 try expectVecApproxEqAbs(m[3], f32x4(0.0, 0.0, 0.0, 1.0), 0.0001); 2773 } 2774} 2775 2776pub fn matFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Mat { 2777 return matFromRollPitchYawV(f32x4(pitch, yaw, roll, 0.0)); 2778} 2779pub fn matFromRollPitchYawV(angles: Vec) Mat { 2780 return matFromQuat(quatFromRollPitchYawV(angles)); 2781} 2782 2783pub fn matToQuat(m: Mat) Quat { 2784 return quatFromMat(m); 2785} 2786 2787pub inline fn loadMat(mem: []const f32) Mat { 2788 return .{ 2789 load(mem[0..4], F32x4, 0), 2790 load(mem[4..8], F32x4, 0), 2791 load(mem[8..12], F32x4, 0), 2792 load(mem[12..16], F32x4, 0), 2793 }; 2794} 2795test "zmath.loadMat" { 2796 const a = [18]f32{ 2797 1.0, 2.0, 3.0, 4.0, 2798 5.0, 6.0, 7.0, 8.0, 2799 9.0, 10.0, 11.0, 12.0, 2800 13.0, 14.0, 15.0, 16.0, 2801 17.0, 18.0, 2802 }; 2803 const m = loadMat(a[1..]); 2804 try expectVecEqual(m[0], f32x4(2.0, 3.0, 4.0, 5.0)); 2805 try expectVecEqual(m[1], f32x4(6.0, 7.0, 8.0, 9.0)); 2806 try expectVecEqual(m[2], f32x4(10.0, 11.0, 12.0, 13.0)); 2807 try expectVecEqual(m[3], f32x4(14.0, 15.0, 16.0, 17.0)); 2808} 2809 2810pub inline fn storeMat(mem: []f32, m: Mat) void { 2811 store(mem[0..4], m[0], 0); 2812 store(mem[4..8], m[1], 0); 2813 store(mem[8..12], m[2], 0); 2814 store(mem[12..16], m[3], 0); 2815} 2816 2817pub inline fn loadMat43(mem: []const f32) Mat { 2818 return .{ 2819 f32x4(mem[0], mem[1], mem[2], 0.0), 2820 f32x4(mem[3], mem[4], mem[5], 0.0), 2821 f32x4(mem[6], mem[7], mem[8], 0.0), 2822 f32x4(mem[9], mem[10], mem[11], 1.0), 2823 }; 2824} 2825 2826pub inline fn storeMat43(mem: []f32, m: Mat) void { 2827 store(mem[0..3], m[0], 3); 2828 store(mem[3..6], m[1], 3); 2829 store(mem[6..9], m[2], 3); 2830 store(mem[9..12], m[3], 3); 2831} 2832 2833pub inline fn loadMat34(mem: []const f32) Mat { 2834 return .{ 2835 load(mem[0..4], F32x4, 0), 2836 load(mem[4..8], F32x4, 0), 2837 load(mem[8..12], F32x4, 0), 2838 f32x4(0.0, 0.0, 0.0, 1.0), 2839 }; 2840} 2841 2842pub inline fn storeMat34(mem: []f32, m: Mat) void { 2843 store(mem[0..4], m[0], 0); 2844 store(mem[4..8], m[1], 0); 2845 store(mem[8..12], m[2], 0); 2846} 2847 2848pub inline fn matToArr(m: Mat) [16]f32 { 2849 var array: [16]f32 = undefined; 2850 storeMat(array[0..], m); 2851 return array; 2852} 2853 2854pub inline fn matToArr43(m: Mat) [12]f32 { 2855 var array: [12]f32 = undefined; 2856 storeMat43(array[0..], m); 2857 return array; 2858} 2859 2860pub inline fn matToArr34(m: Mat) [12]f32 { 2861 var array: [12]f32 = undefined; 2862 storeMat34(array[0..], m); 2863 return array; 2864} 2865// ------------------------------------------------------------------------------ 2866// 2867// 5. Quaternion functions 2868// 2869// ------------------------------------------------------------------------------ 2870pub fn qmul(q0: Quat, q1: Quat) Quat { 2871 var result = swizzle(q1, .w, .w, .w, .w); 2872 var q1x = swizzle(q1, .x, .x, .x, .x); 2873 var q1y = swizzle(q1, .y, .y, .y, .y); 2874 var q1z = swizzle(q1, .z, .z, .z, .z); 2875 result = result * q0; 2876 var q0_shuf = swizzle(q0, .w, .z, .y, .x); 2877 q1x = q1x * q0_shuf; 2878 q0_shuf = swizzle(q0_shuf, .y, .x, .w, .z); 2879 result = mulAdd(q1x, f32x4(1.0, -1.0, 1.0, -1.0), result); 2880 q1y = q1y * q0_shuf; 2881 q0_shuf = swizzle(q0_shuf, .w, .z, .y, .x); 2882 q1y = q1y * f32x4(1.0, 1.0, -1.0, -1.0); 2883 q1z = q1z * q0_shuf; 2884 q1y = mulAdd(q1z, f32x4(-1.0, 1.0, 1.0, -1.0), q1y); 2885 return result + q1y; 2886} 2887test "zmath.quaternion.mul" { 2888 { 2889 const q0 = f32x4(2.0, 3.0, 4.0, 1.0); 2890 const q1 = f32x4(3.0, 2.0, 1.0, 4.0); 2891 try expectVecApproxEqAbs(qmul(q0, q1), f32x4(16.0, 4.0, 22.0, -12.0), 0.0001); 2892 } 2893} 2894 2895pub fn quatToMat(quat: Quat) Mat { 2896 return matFromQuat(quat); 2897} 2898 2899pub fn quatToAxisAngle(quat: Quat, axis: *Vec, angle: *f32) void { 2900 axis.* = quat; 2901 angle.* = 2.0 * acos(quat[3]); 2902} 2903test "zmath.quaternion.quatToAxisAngle" { 2904 { 2905 const q0 = quatFromNormAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); 2906 var axis: Vec = f32x4(4.0, 3.0, 2.0, 1.0); 2907 var angle: f32 = 10.0; 2908 quatToAxisAngle(q0, &axis, &angle); 2909 try expect(math.approxEqAbs(f32, axis[0], @sin(@as(f32, 0.25) * math.pi * 0.5), 0.0001)); 2910 try expect(axis[1] == 0.0); 2911 try expect(axis[2] == 0.0); 2912 try expect(math.approxEqAbs(f32, angle, 0.25 * math.pi, 0.0001)); 2913 } 2914} 2915 2916pub fn quatFromMat(m: Mat) Quat { 2917 const r0 = m[0]; 2918 const r1 = m[1]; 2919 const r2 = m[2]; 2920 const r00 = swizzle(r0, .x, .x, .x, .x); 2921 const r11 = swizzle(r1, .y, .y, .y, .y); 2922 const r22 = swizzle(r2, .z, .z, .z, .z); 2923 2924 const x2gey2 = (r11 - r00) <= splat(F32x4, 0.0); 2925 const z2gew2 = (r11 + r00) <= splat(F32x4, 0.0); 2926 const x2py2gez2pw2 = r22 <= splat(F32x4, 0.0); 2927 2928 var t0 = mulAdd(r00, f32x4(1.0, -1.0, -1.0, 1.0), splat(F32x4, 1.0)); 2929 var t1 = r11 * f32x4(-1.0, 1.0, -1.0, 1.0); 2930 var t2 = mulAdd(r22, f32x4(-1.0, -1.0, 1.0, 1.0), t0); 2931 const x2y2z2w2 = t1 + t2; 2932 2933 t0 = @shuffle(f32, r0, r1, [4]i32{ 1, 2, ~@as(i32, 2), ~@as(i32, 1) }); 2934 t1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 0), ~@as(i32, 1) }); 2935 t1 = swizzle(t1, .x, .z, .w, .y); 2936 const xyxzyz = t0 + t1; 2937 2938 t0 = @shuffle(f32, r2, r1, [4]i32{ 1, 0, ~@as(i32, 0), ~@as(i32, 0) }); 2939 t1 = @shuffle(f32, r1, r0, [4]i32{ 2, 2, ~@as(i32, 2), ~@as(i32, 1) }); 2940 t1 = swizzle(t1, .x, .z, .w, .y); 2941 const xwywzw = (t0 - t1) * f32x4(-1.0, 1.0, -1.0, 1.0); 2942 2943 t0 = @shuffle(f32, x2y2z2w2, xyxzyz, [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 0) }); 2944 t1 = @shuffle(f32, x2y2z2w2, xwywzw, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 0) }); 2945 t2 = @shuffle(f32, xyxzyz, xwywzw, [4]i32{ 1, 2, ~@as(i32, 0), ~@as(i32, 1) }); 2946 2947 const tensor0 = @shuffle(f32, t0, t2, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); 2948 const tensor1 = @shuffle(f32, t0, t2, [4]i32{ 2, 1, ~@as(i32, 1), ~@as(i32, 3) }); 2949 const tensor2 = @shuffle(f32, t2, t1, [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 2) }); 2950 const tensor3 = @shuffle(f32, t2, t1, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 1) }); 2951 2952 t0 = select(x2gey2, tensor0, tensor1); 2953 t1 = select(z2gew2, tensor2, tensor3); 2954 t2 = select(x2py2gez2pw2, t0, t1); 2955 2956 return t2 / length4(t2); 2957} 2958test "zmath.quatFromMat" { 2959 { 2960 const q0 = quatFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); 2961 const q1 = quatFromMat(rotationX(0.25 * math.pi)); 2962 try expectVecApproxEqAbs(q0, q1, 0.0001); 2963 } 2964 { 2965 const q0 = quatFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi); 2966 const q1 = quatFromMat(matFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi)); 2967 try expectVecApproxEqAbs(q0, q1, 0.0001); 2968 } 2969 { 2970 const q0 = quatFromRollPitchYaw(0.1 * math.pi, -0.2 * math.pi, 0.3 * math.pi); 2971 const q1 = quatFromMat(matFromRollPitchYaw(0.1 * math.pi, -0.2 * math.pi, 0.3 * math.pi)); 2972 try expectVecApproxEqAbs(q0, q1, 0.0001); 2973 } 2974} 2975 2976pub fn quatFromNormAxisAngle(axis: Vec, angle: f32) Quat { 2977 const n = f32x4(axis[0], axis[1], axis[2], 1.0); 2978 const sc = sincos(0.5 * angle); 2979 return n * f32x4(sc[0], sc[0], sc[0], sc[1]); 2980} 2981pub fn quatFromAxisAngle(axis: Vec, angle: f32) Quat { 2982 assert(!all(axis == splat(F32x4, 0.0), 3)); 2983 assert(!all(isInf(axis), 3)); 2984 const normal = normalize3(axis); 2985 return quatFromNormAxisAngle(normal, angle); 2986} 2987test "zmath.quaternion.quatFromNormAxisAngle" { 2988 { 2989 const q0 = quatFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); 2990 const q1 = quatFromAxisAngle(f32x4(0.0, 1.0, 0.0, 0.0), 0.125 * math.pi); 2991 const m0 = rotationX(0.25 * math.pi); 2992 const m1 = rotationY(0.125 * math.pi); 2993 const mr0 = quatToMat(qmul(q0, q1)); 2994 const mr1 = mul(m0, m1); 2995 try expectVecApproxEqAbs(mr0[0], mr1[0], 0.0001); 2996 try expectVecApproxEqAbs(mr0[1], mr1[1], 0.0001); 2997 try expectVecApproxEqAbs(mr0[2], mr1[2], 0.0001); 2998 try expectVecApproxEqAbs(mr0[3], mr1[3], 0.0001); 2999 } 3000 { 3001 const m0 = quatToMat(quatFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi)); 3002 const m1 = matFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi); 3003 try expectVecApproxEqAbs(m0[0], m1[0], 0.0001); 3004 try expectVecApproxEqAbs(m0[1], m1[1], 0.0001); 3005 try expectVecApproxEqAbs(m0[2], m1[2], 0.0001); 3006 try expectVecApproxEqAbs(m0[3], m1[3], 0.0001); 3007 } 3008} 3009 3010pub inline fn qidentity() Quat { 3011 return f32x4(@as(f32, 0.0), @as(f32, 0.0), @as(f32, 0.0), @as(f32, 1.0)); 3012} 3013 3014pub inline fn conjugate(quat: Quat) Quat { 3015 return quat * f32x4(-1.0, -1.0, -1.0, 1.0); 3016} 3017 3018fn inverseQuat(quat: Quat) Quat { 3019 const l = lengthSq4(quat); 3020 const conj = conjugate(quat); 3021 return select(l <= splat(F32x4, math.floatEps(f32)), splat(F32x4, 0.0), conj / l); 3022} 3023test "zmath.quaternion.inverseQuat" { 3024 try expectVecApproxEqAbs( 3025 inverse(f32x4(2.0, 3.0, 4.0, 1.0)), 3026 f32x4(-1.0 / 15.0, -1.0 / 10.0, -2.0 / 15.0, 1.0 / 30.0), 3027 0.0001, 3028 ); 3029 try expectVecApproxEqAbs(inverse(qidentity()), qidentity(), 0.0001); 3030} 3031 3032// Algorithm from: https://github.com/g-truc/glm/blob/master/glm/detail/type_quat.inl 3033pub fn rotate(q: Quat, v: Vec) Vec { 3034 const w = splat(F32x4, q[3]); 3035 const axis = f32x4(q[0], q[1], q[2], 0.0); 3036 const uv = cross3(axis, v); 3037 return v + ((uv * w) + cross3(axis, uv)) * splat(F32x4, 2.0); 3038} 3039test "zmath.quaternion.rotate" { 3040 const quat = quatFromRollPitchYaw(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi); 3041 const mat = matFromQuat(quat); 3042 const forward = f32x4(0.0, 0.0, -1.0, 0.0); 3043 const up = f32x4(0.0, 1.0, 0.0, 0.0); 3044 const right = f32x4(1.0, 0.0, 0.0, 0.0); 3045 try expectVecApproxEqAbs(rotate(quat, forward), mul(forward, mat), 0.0001); 3046 try expectVecApproxEqAbs(rotate(quat, up), mul(up, mat), 0.0001); 3047 try expectVecApproxEqAbs(rotate(quat, right), mul(right, mat), 0.0001); 3048} 3049 3050pub fn slerp(q0: Quat, q1: Quat, t: f32) Quat { 3051 return slerpV(q0, q1, splat(F32x4, t)); 3052} 3053pub fn slerpV(q0: Quat, q1: Quat, t: F32x4) Quat { 3054 var cos_omega = dot4(q0, q1); 3055 const sign = select(cos_omega < splat(F32x4, 0.0), splat(F32x4, -1.0), splat(F32x4, 1.0)); 3056 3057 cos_omega = cos_omega * sign; 3058 const sin_omega = sqrt(splat(F32x4, 1.0) - cos_omega * cos_omega); 3059 3060 const omega = atan2(sin_omega, cos_omega); 3061 3062 var v01 = t; 3063 v01 = xorInt(andInt(v01, f32x4_mask2), f32x4_sign_mask1); 3064 v01 = f32x4(1.0, 0.0, 0.0, 0.0) + v01; 3065 3066 var s0 = sin(v01 * omega) / sin_omega; 3067 s0 = select(cos_omega < splat(F32x4, 1.0 - 0.00001), s0, v01); 3068 3069 const s1 = swizzle(s0, .y, .y, .y, .y); 3070 s0 = swizzle(s0, .x, .x, .x, .x); 3071 3072 return q0 * s0 + sign * q1 * s1; 3073} 3074test "zmath.quaternion.slerp" { 3075 const from = f32x4(0.0, 0.0, 0.0, 1.0); 3076 const to = f32x4(0.5, 0.5, -0.5, 0.5); 3077 const result = slerp(from, to, 0.5); 3078 try expectVecApproxEqAbs(result, f32x4(0.28867513, 0.28867513, -0.28867513, 0.86602540), 0.0001); 3079} 3080 3081// Converts q back to euler angles, assuming a YXZ rotation order. 3082// See: http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler 3083pub fn quatToRollPitchYaw(q: Quat) [3]f32 { 3084 var angles: [3]f32 = undefined; 3085 3086 const p = swizzle(q, .w, .y, .x, .z); 3087 const sign = -1.0; 3088 3089 const singularity = p[0] * p[2] + sign * p[1] * p[3]; 3090 if (singularity > 0.499) { 3091 angles[0] = math.pi * 0.5; 3092 angles[1] = 2.0 * math.atan2(p[1], p[0]); 3093 angles[2] = 0.0; 3094 } else if (singularity < -0.499) { 3095 angles[0] = -math.pi * 0.5; 3096 angles[1] = 2.0 * math.atan2(p[1], p[0]); 3097 angles[2] = 0.0; 3098 } else { 3099 const sq = p * p; 3100 const y = splat(F32x4, 2.0) * f32x4(p[0] * p[1] - sign * p[2] * p[3], p[0] * p[3] - sign * p[1] * p[2], 0.0, 0.0); 3101 const x = splat(F32x4, 1.0) - (splat(F32x4, 2.0) * f32x4(sq[1] + sq[2], sq[2] + sq[3], 0.0, 0.0)); 3102 const res = atan2(y, x); 3103 angles[0] = math.asin(2.0 * singularity); 3104 angles[1] = res[0]; 3105 angles[2] = res[1]; 3106 } 3107 3108 return angles; 3109} 3110 3111test "zmath.quaternion.quatToRollPitchYaw" { 3112 { 3113 const expected = f32x4(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); 3114 const quat = quatFromRollPitchYaw(expected[0], expected[1], expected[2]); 3115 const result = quatToRollPitchYaw(quat); 3116 try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); 3117 } 3118 3119 { 3120 const expected = f32x4(0.3 * math.pi, 0.1 * math.pi, 0.2 * math.pi, 0.0); 3121 const quat = quatFromRollPitchYaw(expected[0], expected[1], expected[2]); 3122 const result = quatToRollPitchYaw(quat); 3123 try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); 3124 } 3125 3126 // North pole singularity 3127 { 3128 const angle = f32x4(0.5 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); 3129 const expected = f32x4(0.5 * math.pi, -0.1 * math.pi, 0.0, 0.0); 3130 const quat = quatFromRollPitchYaw(angle[0], angle[1], angle[2]); 3131 const result = quatToRollPitchYaw(quat); 3132 try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); 3133 } 3134 3135 // South pole singularity 3136 { 3137 const angle = f32x4(-0.5 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); 3138 const expected = f32x4(-0.5 * math.pi, 0.5 * math.pi, 0.0, 0.0); 3139 const quat = quatFromRollPitchYaw(angle[0], angle[1], angle[2]); 3140 const result = quatToRollPitchYaw(quat); 3141 try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); 3142 } 3143} 3144 3145pub fn quatFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Quat { 3146 return quatFromRollPitchYawV(f32x4(pitch, yaw, roll, 0.0)); 3147} 3148pub fn quatFromRollPitchYawV(angles: Vec) Quat { // | pitch | yaw | roll | 0 | 3149 const sc = sincos(splat(Vec, 0.5) * angles); 3150 const p0 = @shuffle(f32, sc[1], sc[0], [4]i32{ ~@as(i32, 0), 0, 0, 0 }); 3151 const p1 = @shuffle(f32, sc[0], sc[1], [4]i32{ ~@as(i32, 0), 0, 0, 0 }); 3152 const y0 = @shuffle(f32, sc[1], sc[0], [4]i32{ 1, ~@as(i32, 1), 1, 1 }); 3153 const y1 = @shuffle(f32, sc[0], sc[1], [4]i32{ 1, ~@as(i32, 1), 1, 1 }); 3154 const r0 = @shuffle(f32, sc[1], sc[0], [4]i32{ 2, 2, ~@as(i32, 2), 2 }); 3155 const r1 = @shuffle(f32, sc[0], sc[1], [4]i32{ 2, 2, ~@as(i32, 2), 2 }); 3156 const q1 = p1 * f32x4(1.0, -1.0, -1.0, 1.0) * y1; 3157 const q0 = p0 * y0 * r0; 3158 return mulAdd(q1, r1, q0); 3159} 3160test "zmath.quaternion.quatFromRollPitchYawV" { 3161 { 3162 const m0 = quatToMat(quatFromRollPitchYawV(f32x4(0.25 * math.pi, 0.0, 0.0, 0.0))); 3163 const m1 = rotationX(0.25 * math.pi); 3164 try expectVecApproxEqAbs(m0[0], m1[0], 0.0001); 3165 try expectVecApproxEqAbs(m0[1], m1[1], 0.0001); 3166 try expectVecApproxEqAbs(m0[2], m1[2], 0.0001); 3167 try expectVecApproxEqAbs(m0[3], m1[3], 0.0001); 3168 } 3169 { 3170 const m0 = quatToMat(quatFromRollPitchYaw(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi)); 3171 const m1 = mul( 3172 rotationZ(0.3 * math.pi), 3173 mul(rotationX(0.1 * math.pi), rotationY(0.2 * math.pi)), 3174 ); 3175 try expectVecApproxEqAbs(m0[0], m1[0], 0.0001); 3176 try expectVecApproxEqAbs(m0[1], m1[1], 0.0001); 3177 try expectVecApproxEqAbs(m0[2], m1[2], 0.0001); 3178 try expectVecApproxEqAbs(m0[3], m1[3], 0.0001); 3179 } 3180} 3181// ------------------------------------------------------------------------------ 3182// 3183// 6. Color functions 3184// 3185// ------------------------------------------------------------------------------ 3186pub fn adjustSaturation(color: F32x4, saturation: f32) F32x4 { 3187 const luminance = dot3(f32x4(0.2125, 0.7154, 0.0721, 0.0), color); 3188 var result = mulAdd(color - luminance, f32x4s(saturation), luminance); 3189 result[3] = color[3]; 3190 return result; 3191} 3192 3193pub fn adjustContrast(color: F32x4, contrast: f32) F32x4 { 3194 var result = mulAdd(color - f32x4s(0.5), f32x4s(contrast), f32x4s(0.5)); 3195 result[3] = color[3]; 3196 return result; 3197} 3198 3199pub fn rgbToHsl(rgb: F32x4) F32x4 { 3200 const r = swizzle(rgb, .x, .x, .x, .x); 3201 const g = swizzle(rgb, .y, .y, .y, .y); 3202 const b = swizzle(rgb, .z, .z, .z, .z); 3203 3204 const minv = min(r, min(g, b)); 3205 const maxv = max(r, max(g, b)); 3206 3207 const l = (minv + maxv) * f32x4s(0.5); 3208 const d = maxv - minv; 3209 const la = select(boolx4(true, true, true, false), l, rgb); 3210 3211 if (all(d < f32x4s(math.floatEps(f32)), 3)) { 3212 return select(boolx4(true, true, false, false), f32x4s(0.0), la); 3213 } else { 3214 var s: F32x4 = undefined; 3215 var h: F32x4 = undefined; 3216 3217 const d2 = minv + maxv; 3218 3219 if (all(l > f32x4s(0.5), 3)) { 3220 s = d / (f32x4s(2.0) - d2); 3221 } else { 3222 s = d / d2; 3223 } 3224 3225 if (all(r == maxv, 3)) { 3226 h = (g - b) / d; 3227 } else if (all(g == maxv, 3)) { 3228 h = f32x4s(2.0) + (b - r) / d; 3229 } else { 3230 h = f32x4s(4.0) + (r - g) / d; 3231 } 3232 3233 h /= f32x4s(6.0); 3234 3235 if (all(h < f32x4s(0.0), 3)) { 3236 h += f32x4s(1.0); 3237 } 3238 3239 const lha = select(boolx4(true, true, false, false), h, la); 3240 return select(boolx4(true, false, true, true), lha, s); 3241 } 3242} 3243test "zmath.color.rgbToHsl" { 3244 try expectVecApproxEqAbs(rgbToHsl(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.6111, 0.6, 0.5, 1.0), 0.0001); 3245 try expectVecApproxEqAbs(rgbToHsl(f32x4(1.0, 0.0, 0.0, 0.5)), f32x4(0.0, 1.0, 0.5, 0.5), 0.0001); 3246 try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 1.0, 0.0, 0.25)), f32x4(0.3333, 1.0, 0.5, 0.25), 0.0001); 3247 try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 0.0, 1.0, 1.0)), f32x4(0.6666, 1.0, 0.5, 1.0), 0.0001); 3248 try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 0.0, 0.0, 1.0)), f32x4(0.0, 0.0, 0.0, 1.0), 0.0001); 3249 try expectVecApproxEqAbs(rgbToHsl(f32x4(1.0, 1.0, 1.0, 1.0)), f32x4(0.0, 0.0, 1.0, 1.0), 0.0001); 3250} 3251 3252fn hueToClr(p: F32x4, q: F32x4, h: F32x4) F32x4 { 3253 var t = h; 3254 3255 if (all(t < f32x4s(0.0), 3)) 3256 t += f32x4s(1.0); 3257 3258 if (all(t > f32x4s(1.0), 3)) 3259 t -= f32x4s(1.0); 3260 3261 if (all(t < f32x4s(1.0 / 6.0), 3)) 3262 return mulAdd(q - p, f32x4s(6.0) * t, p); 3263 3264 if (all(t < f32x4s(0.5), 3)) 3265 return q; 3266 3267 if (all(t < f32x4s(2.0 / 3.0), 3)) 3268 return mulAdd(q - p, f32x4s(6.0) * (f32x4s(2.0 / 3.0) - t), p); 3269 3270 return p; 3271} 3272 3273pub fn hslToRgb(hsl: F32x4) F32x4 { 3274 const s = swizzle(hsl, .y, .y, .y, .y); 3275 const l = swizzle(hsl, .z, .z, .z, .z); 3276 3277 if (all(isNearEqual(s, f32x4s(0.0), f32x4s(math.floatEps(f32))), 3)) { 3278 return select(boolx4(true, true, true, false), l, hsl); 3279 } else { 3280 const h = swizzle(hsl, .x, .x, .x, .x); 3281 var q: F32x4 = undefined; 3282 if (all(l < f32x4s(0.5), 3)) { 3283 q = l * (f32x4s(1.0) + s); 3284 } else { 3285 q = (l + s) - (l * s); 3286 } 3287 3288 const p = f32x4s(2.0) * l - q; 3289 3290 const r = hueToClr(p, q, h + f32x4s(1.0 / 3.0)); 3291 const g = hueToClr(p, q, h); 3292 const b = hueToClr(p, q, h - f32x4s(1.0 / 3.0)); 3293 3294 const rg = select(boolx4(true, false, false, false), r, g); 3295 const ba = select(boolx4(true, true, true, false), b, hsl); 3296 return select(boolx4(true, true, false, false), rg, ba); 3297 } 3298} 3299test "zmath.color.hslToRgb" { 3300 try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), hslToRgb(f32x4(0.6111, 0.6, 0.5, 1.0)), 0.0001); 3301 try expectVecApproxEqAbs(f32x4(1.0, 0.0, 0.0, 0.5), hslToRgb(f32x4(0.0, 1.0, 0.5, 0.5)), 0.0001); 3302 try expectVecApproxEqAbs(f32x4(0.0, 1.0, 0.0, 0.25), hslToRgb(f32x4(0.3333, 1.0, 0.5, 0.25)), 0.0005); 3303 try expectVecApproxEqAbs(f32x4(0.0, 0.0, 1.0, 1.0), hslToRgb(f32x4(0.6666, 1.0, 0.5, 1.0)), 0.0005); 3304 try expectVecApproxEqAbs(f32x4(0.0, 0.0, 0.0, 1.0), hslToRgb(f32x4(0.0, 0.0, 0.0, 1.0)), 0.0001); 3305 try expectVecApproxEqAbs(f32x4(1.0, 1.0, 1.0, 1.0), hslToRgb(f32x4(0.0, 0.0, 1.0, 1.0)), 0.0001); 3306 try expectVecApproxEqAbs(hslToRgb(rgbToHsl(f32x4(1.0, 1.0, 1.0, 1.0))), f32x4(1.0, 1.0, 1.0, 1.0), 0.0005); 3307 try expectVecApproxEqAbs( 3308 hslToRgb(rgbToHsl(f32x4(0.82198, 0.1839, 0.632, 1.0))), 3309 f32x4(0.82198, 0.1839, 0.632, 1.0), 3310 0.0005, 3311 ); 3312 try expectVecApproxEqAbs( 3313 rgbToHsl(hslToRgb(f32x4(0.82198, 0.1839, 0.632, 1.0))), 3314 f32x4(0.82198, 0.1839, 0.632, 1.0), 3315 0.0005, 3316 ); 3317 try expectVecApproxEqAbs( 3318 rgbToHsl(hslToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), 3319 f32x4(0.1839, 0.82198, 0.632, 1.0), 3320 0.0005, 3321 ); 3322 try expectVecApproxEqAbs( 3323 hslToRgb(rgbToHsl(f32x4(0.1839, 0.632, 0.82198, 1.0))), 3324 f32x4(0.1839, 0.632, 0.82198, 1.0), 3325 0.0005, 3326 ); 3327} 3328 3329pub fn rgbToHsv(rgb: F32x4) F32x4 { 3330 const r = swizzle(rgb, .x, .x, .x, .x); 3331 const g = swizzle(rgb, .y, .y, .y, .y); 3332 const b = swizzle(rgb, .z, .z, .z, .z); 3333 3334 const minv = min(r, min(g, b)); 3335 const v = max(r, max(g, b)); 3336 const d = v - minv; 3337 const s = if (all(isNearEqual(v, f32x4s(0.0), f32x4s(math.floatEps(f32))), 3)) f32x4s(0.0) else d / v; 3338 3339 if (all(d < f32x4s(math.floatEps(f32)), 3)) { 3340 const hv = select(boolx4(true, false, false, false), f32x4s(0.0), v); 3341 const hva = select(boolx4(true, true, true, false), hv, rgb); 3342 return select(boolx4(true, false, true, true), hva, s); 3343 } else { 3344 var h: F32x4 = undefined; 3345 if (all(r == v, 3)) { 3346 h = (g - b) / d; 3347 if (all(g < b, 3)) 3348 h += f32x4s(6.0); 3349 } else if (all(g == v, 3)) { 3350 h = f32x4s(2.0) + (b - r) / d; 3351 } else { 3352 h = f32x4s(4.0) + (r - g) / d; 3353 } 3354 3355 h /= f32x4s(6.0); 3356 const hv = select(boolx4(true, false, false, false), h, v); 3357 const hva = select(boolx4(true, true, true, false), hv, rgb); 3358 return select(boolx4(true, false, true, true), hva, s); 3359 } 3360} 3361test "zmath.color.rgbToHsv" { 3362 try expectVecApproxEqAbs(rgbToHsv(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.6111, 0.75, 0.8, 1.0), 0.0001); 3363 try expectVecApproxEqAbs(rgbToHsv(f32x4(0.4, 0.2, 0.8, 1.0)), f32x4(0.7222, 0.75, 0.8, 1.0), 0.0001); 3364 try expectVecApproxEqAbs(rgbToHsv(f32x4(0.4, 0.8, 0.2, 1.0)), f32x4(0.2777, 0.75, 0.8, 1.0), 0.0001); 3365 try expectVecApproxEqAbs(rgbToHsv(f32x4(1.0, 0.0, 0.0, 0.5)), f32x4(0.0, 1.0, 1.0, 0.5), 0.0001); 3366 try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 1.0, 0.0, 0.25)), f32x4(0.3333, 1.0, 1.0, 0.25), 0.0001); 3367 try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 0.0, 1.0, 1.0)), f32x4(0.6666, 1.0, 1.0, 1.0), 0.0001); 3368 try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 0.0, 0.0, 1.0)), f32x4(0.0, 0.0, 0.0, 1.0), 0.0001); 3369 try expectVecApproxEqAbs(rgbToHsv(f32x4(1.0, 1.0, 1.0, 1.0)), f32x4(0.0, 0.0, 1.0, 1.0), 0.0001); 3370} 3371 3372pub fn hsvToRgb(hsv: F32x4) F32x4 { 3373 const h = swizzle(hsv, .x, .x, .x, .x); 3374 const s = swizzle(hsv, .y, .y, .y, .y); 3375 const v = swizzle(hsv, .z, .z, .z, .z); 3376 3377 const h6 = h * f32x4s(6.0); 3378 const i = floor(h6); 3379 const f = h6 - i; 3380 3381 const p = v * (f32x4s(1.0) - s); 3382 const q = v * (f32x4s(1.0) - f * s); 3383 const t = v * (f32x4s(1.0) - (f32x4s(1.0) - f) * s); 3384 3385 const ii = @as(i32, @intFromFloat(mod(i, f32x4s(6.0))[0])); 3386 const rgb = switch (ii) { 3387 0 => blk: { 3388 const vt = select(boolx4(true, false, false, false), v, t); 3389 break :blk select(boolx4(true, true, false, false), vt, p); 3390 }, 3391 1 => blk: { 3392 const qv = select(boolx4(true, false, false, false), q, v); 3393 break :blk select(boolx4(true, true, false, false), qv, p); 3394 }, 3395 2 => blk: { 3396 const pv = select(boolx4(true, false, false, false), p, v); 3397 break :blk select(boolx4(true, true, false, false), pv, t); 3398 }, 3399 3 => blk: { 3400 const pq = select(boolx4(true, false, false, false), p, q); 3401 break :blk select(boolx4(true, true, false, false), pq, v); 3402 }, 3403 4 => blk: { 3404 const tp = select(boolx4(true, false, false, false), t, p); 3405 break :blk select(boolx4(true, true, false, false), tp, v); 3406 }, 3407 5 => blk: { 3408 const vp = select(boolx4(true, false, false, false), v, p); 3409 break :blk select(boolx4(true, true, false, false), vp, q); 3410 }, 3411 else => unreachable, 3412 }; 3413 return select(boolx4(true, true, true, false), rgb, hsv); 3414} 3415test "zmath.color.hsvToRgb" { 3416 const epsilon = 0.0005; 3417 try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), hsvToRgb(f32x4(0.6111, 0.75, 0.8, 1.0)), epsilon); 3418 try expectVecApproxEqAbs(f32x4(0.4, 0.2, 0.8, 1.0), hsvToRgb(f32x4(0.7222, 0.75, 0.8, 1.0)), epsilon); 3419 try expectVecApproxEqAbs(f32x4(0.4, 0.8, 0.2, 1.0), hsvToRgb(f32x4(0.2777, 0.75, 0.8, 1.0)), epsilon); 3420 try expectVecApproxEqAbs(f32x4(1.0, 0.0, 0.0, 0.5), hsvToRgb(f32x4(0.0, 1.0, 1.0, 0.5)), epsilon); 3421 try expectVecApproxEqAbs(f32x4(0.0, 1.0, 0.0, 0.25), hsvToRgb(f32x4(0.3333, 1.0, 1.0, 0.25)), epsilon); 3422 try expectVecApproxEqAbs(f32x4(0.0, 0.0, 1.0, 1.0), hsvToRgb(f32x4(0.6666, 1.0, 1.0, 1.0)), epsilon); 3423 try expectVecApproxEqAbs(f32x4(0.0, 0.0, 0.0, 1.0), hsvToRgb(f32x4(0.0, 0.0, 0.0, 1.0)), epsilon); 3424 try expectVecApproxEqAbs(f32x4(1.0, 1.0, 1.0, 1.0), hsvToRgb(f32x4(0.0, 0.0, 1.0, 1.0)), epsilon); 3425 try expectVecApproxEqAbs( 3426 hsvToRgb(rgbToHsv(f32x4(0.1839, 0.632, 0.82198, 1.0))), 3427 f32x4(0.1839, 0.632, 0.82198, 1.0), 3428 epsilon, 3429 ); 3430 try expectVecApproxEqAbs( 3431 hsvToRgb(rgbToHsv(f32x4(0.82198, 0.1839, 0.632, 1.0))), 3432 f32x4(0.82198, 0.1839, 0.632, 1.0), 3433 epsilon, 3434 ); 3435 try expectVecApproxEqAbs( 3436 rgbToHsv(hsvToRgb(f32x4(0.82198, 0.1839, 0.632, 1.0))), 3437 f32x4(0.82198, 0.1839, 0.632, 1.0), 3438 epsilon, 3439 ); 3440 try expectVecApproxEqAbs( 3441 rgbToHsv(hsvToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), 3442 f32x4(0.1839, 0.82198, 0.632, 1.0), 3443 epsilon, 3444 ); 3445} 3446 3447pub fn rgbToSrgb(rgb: F32x4) F32x4 { 3448 const static = struct { 3449 const cutoff = f32x4(0.0031308, 0.0031308, 0.0031308, 1.0); 3450 const linear = f32x4(12.92, 12.92, 12.92, 1.0); 3451 const scale = f32x4(1.055, 1.055, 1.055, 1.0); 3452 const bias = f32x4(0.055, 0.055, 0.055, 1.0); 3453 const rgamma = 1.0 / 2.4; 3454 }; 3455 var v = saturate(rgb); 3456 const v0 = v * static.linear; 3457 const v1 = static.scale * f32x4( 3458 math.pow(f32, v[0], static.rgamma), 3459 math.pow(f32, v[1], static.rgamma), 3460 math.pow(f32, v[2], static.rgamma), 3461 v[3], 3462 ) - static.bias; 3463 v = select(v < static.cutoff, v0, v1); 3464 return select(boolx4(true, true, true, false), v, rgb); 3465} 3466test "zmath.color.rgbToSrgb" { 3467 const epsilon = 0.001; 3468 try expectVecApproxEqAbs(rgbToSrgb(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.484, 0.665, 0.906, 1.0), epsilon); 3469} 3470 3471pub fn srgbToRgb(srgb: F32x4) F32x4 { 3472 const static = struct { 3473 const cutoff = f32x4(0.04045, 0.04045, 0.04045, 1.0); 3474 const rlinear = f32x4(1.0 / 12.92, 1.0 / 12.92, 1.0 / 12.92, 1.0); 3475 const scale = f32x4(1.0 / 1.055, 1.0 / 1.055, 1.0 / 1.055, 1.0); 3476 const bias = f32x4(0.055, 0.055, 0.055, 1.0); 3477 const gamma = 2.4; 3478 }; 3479 var v = saturate(srgb); 3480 const v0 = v * static.rlinear; 3481 var v1 = static.scale * (v + static.bias); 3482 v1 = f32x4( 3483 math.pow(f32, v1[0], static.gamma), 3484 math.pow(f32, v1[1], static.gamma), 3485 math.pow(f32, v1[2], static.gamma), 3486 v1[3], 3487 ); 3488 v = select(v > static.cutoff, v1, v0); 3489 return select(boolx4(true, true, true, false), v, srgb); 3490} 3491test "zmath.color.srgbToRgb" { 3492 const epsilon = 0.0007; 3493 try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), srgbToRgb(f32x4(0.484, 0.665, 0.906, 1.0)), epsilon); 3494 try expectVecApproxEqAbs( 3495 rgbToSrgb(srgbToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), 3496 f32x4(0.1839, 0.82198, 0.632, 1.0), 3497 epsilon, 3498 ); 3499} 3500// ------------------------------------------------------------------------------ 3501// 3502// X. Misc functions 3503// 3504// ------------------------------------------------------------------------------ 3505pub fn linePointDistance(linept0: Vec, linept1: Vec, pt: Vec) F32x4 { 3506 const ptvec = pt - linept0; 3507 const linevec = linept1 - linept0; 3508 const scale = dot3(ptvec, linevec) / lengthSq3(linevec); 3509 return length3(ptvec - linevec * scale); 3510} 3511test "zmath.linePointDistance" { 3512 { 3513 const linept0 = f32x4(-1.0, -2.0, -3.0, 1.0); 3514 const linept1 = f32x4(1.0, 2.0, 3.0, 1.0); 3515 const pt = f32x4(1.0, 1.0, 1.0, 1.0); 3516 const v = linePointDistance(linept0, linept1, pt); 3517 try expectVecApproxEqAbs(v, splat(F32x4, 0.654), 0.001); 3518 } 3519} 3520 3521fn sin32(v: f32) f32 { 3522 var y = v - math.tau * @round(v * 1.0 / math.tau); 3523 3524 if (y > 0.5 * math.pi) { 3525 y = math.pi - y; 3526 } else if (y < -math.pi * 0.5) { 3527 y = -math.pi - y; 3528 } 3529 const y2 = y * y; 3530 3531 // 11-degree minimax approximation 3532 var sinv = mulAdd(@as(f32, -2.3889859e-08), y2, 2.7525562e-06); 3533 sinv = mulAdd(sinv, y2, -0.00019840874); 3534 sinv = mulAdd(sinv, y2, 0.0083333310); 3535 sinv = mulAdd(sinv, y2, -0.16666667); 3536 return y * mulAdd(sinv, y2, 1.0); 3537} 3538fn cos32(v: f32) f32 { 3539 var y = v - math.tau * @round(v * 1.0 / math.tau); 3540 3541 const sign = blk: { 3542 if (y > 0.5 * math.pi) { 3543 y = math.pi - y; 3544 break :blk @as(f32, -1.0); 3545 } else if (y < -math.pi * 0.5) { 3546 y = -math.pi - y; 3547 break :blk @as(f32, -1.0); 3548 } else { 3549 break :blk @as(f32, 1.0); 3550 } 3551 }; 3552 const y2 = y * y; 3553 3554 // 10-degree minimax approximation 3555 var cosv = mulAdd(@as(f32, -2.6051615e-07), y2, 2.4760495e-05); 3556 cosv = mulAdd(cosv, y2, -0.0013888378); 3557 cosv = mulAdd(cosv, y2, 0.041666638); 3558 cosv = mulAdd(cosv, y2, -0.5); 3559 return sign * mulAdd(cosv, y2, 1.0); 3560} 3561fn sincos32(v: f32) [2]f32 { 3562 var y = v - math.tau * @round(v * 1.0 / math.tau); 3563 3564 const sign = blk: { 3565 if (y > 0.5 * math.pi) { 3566 y = math.pi - y; 3567 break :blk @as(f32, -1.0); 3568 } else if (y < -math.pi * 0.5) { 3569 y = -math.pi - y; 3570 break :blk @as(f32, -1.0); 3571 } else { 3572 break :blk @as(f32, 1.0); 3573 } 3574 }; 3575 const y2 = y * y; 3576 3577 // 11-degree minimax approximation 3578 var sinv = mulAdd(@as(f32, -2.3889859e-08), y2, 2.7525562e-06); 3579 sinv = mulAdd(sinv, y2, -0.00019840874); 3580 sinv = mulAdd(sinv, y2, 0.0083333310); 3581 sinv = mulAdd(sinv, y2, -0.16666667); 3582 sinv = y * mulAdd(sinv, y2, 1.0); 3583 3584 // 10-degree minimax approximation 3585 var cosv = mulAdd(@as(f32, -2.6051615e-07), y2, 2.4760495e-05); 3586 cosv = mulAdd(cosv, y2, -0.0013888378); 3587 cosv = mulAdd(cosv, y2, 0.041666638); 3588 cosv = mulAdd(cosv, y2, -0.5); 3589 cosv = sign * mulAdd(cosv, y2, 1.0); 3590 3591 return .{ sinv, cosv }; 3592} 3593test "zmath.sincos32" { 3594 const epsilon = 0.0001; 3595 3596 try expect(math.isNan(sincos32(math.inf(f32))[0])); 3597 try expect(math.isNan(sincos32(math.inf(f32))[1])); 3598 try expect(math.isNan(sincos32(-math.inf(f32))[0])); 3599 try expect(math.isNan(sincos32(-math.inf(f32))[1])); 3600 try expect(math.isNan(sincos32(math.nan(f32))[0])); 3601 try expect(math.isNan(sincos32(-math.nan(f32))[1])); 3602 3603 try expect(math.isNan(sin32(math.inf(f32)))); 3604 try expect(math.isNan(cos32(math.inf(f32)))); 3605 try expect(math.isNan(sin32(-math.inf(f32)))); 3606 try expect(math.isNan(cos32(-math.inf(f32)))); 3607 try expect(math.isNan(sin32(math.nan(f32)))); 3608 try expect(math.isNan(cos32(-math.nan(f32)))); 3609 3610 var f: f32 = -100.0; 3611 var i: u32 = 0; 3612 while (i < 100) : (i += 1) { 3613 const sc = sincos32(f); 3614 const s0 = sin32(f); 3615 const c0 = cos32(f); 3616 const s = @sin(f); 3617 const c = @cos(f); 3618 try expect(math.approxEqAbs(f32, sc[0], s, epsilon)); 3619 try expect(math.approxEqAbs(f32, sc[1], c, epsilon)); 3620 try expect(math.approxEqAbs(f32, s0, s, epsilon)); 3621 try expect(math.approxEqAbs(f32, c0, c, epsilon)); 3622 f += 0.12345 * @as(f32, @floatFromInt(i)); 3623 } 3624} 3625 3626fn asin32(v: f32) f32 { 3627 const x = @abs(v); 3628 var omx = 1.0 - x; 3629 if (omx < 0.0) { 3630 omx = 0.0; 3631 } 3632 const root = @sqrt(omx); 3633 3634 // 7-degree minimax approximation 3635 var result = mulAdd(@as(f32, -0.0012624911), x, 0.0066700901); 3636 result = mulAdd(result, x, -0.0170881256); 3637 result = mulAdd(result, x, 0.0308918810); 3638 result = mulAdd(result, x, -0.0501743046); 3639 result = mulAdd(result, x, 0.0889789874); 3640 result = mulAdd(result, x, -0.2145988016); 3641 result = root * mulAdd(result, x, 1.5707963050); 3642 3643 return if (v >= 0.0) 0.5 * math.pi - result else result - 0.5 * math.pi; 3644} 3645test "zmath.asin32" { 3646 const epsilon = 0.0001; 3647 3648 try expect(math.approxEqAbs(f32, asin(@as(f32, -1.1)), -0.5 * math.pi, epsilon)); 3649 try expect(math.approxEqAbs(f32, asin(@as(f32, 1.1)), 0.5 * math.pi, epsilon)); 3650 try expect(math.approxEqAbs(f32, asin(@as(f32, -1000.1)), -0.5 * math.pi, epsilon)); 3651 try expect(math.approxEqAbs(f32, asin(@as(f32, 100000.1)), 0.5 * math.pi, epsilon)); 3652 try expect(math.isNan(asin(math.inf(f32)))); 3653 try expect(math.isNan(asin(-math.inf(f32)))); 3654 try expect(math.isNan(asin(math.nan(f32)))); 3655 try expect(math.isNan(asin(-math.nan(f32)))); 3656 3657 try expectVecApproxEqAbs(asin(splat(F32x8, -100.0)), splat(F32x8, -0.5 * math.pi), epsilon); 3658 try expectVecApproxEqAbs(asin(splat(F32x16, 100.0)), splat(F32x16, 0.5 * math.pi), epsilon); 3659 try expect(all(isNan(asin(splat(F32x4, math.inf(f32)))), 0) == true); 3660 try expect(all(isNan(asin(splat(F32x4, -math.inf(f32)))), 0) == true); 3661 try expect(all(isNan(asin(splat(F32x4, math.nan(f32)))), 0) == true); 3662 try expect(all(isNan(asin(splat(F32x4, math.snan(f32)))), 0) == true); 3663 3664 var f: f32 = -1.0; 3665 var i: u32 = 0; 3666 while (i < 8) : (i += 1) { 3667 const r0 = asin32(f); 3668 const r1 = math.asin(f); 3669 const r4 = asin(splat(F32x4, f)); 3670 const r8 = asin(splat(F32x8, f)); 3671 const r16 = asin(splat(F32x16, f)); 3672 try expect(math.approxEqAbs(f32, r0, r1, epsilon)); 3673 try expectVecApproxEqAbs(r4, splat(F32x4, r1), epsilon); 3674 try expectVecApproxEqAbs(r8, splat(F32x8, r1), epsilon); 3675 try expectVecApproxEqAbs(r16, splat(F32x16, r1), epsilon); 3676 f += 0.09 * @as(f32, @floatFromInt(i)); 3677 } 3678} 3679 3680fn acos32(v: f32) f32 { 3681 const x = @abs(v); 3682 var omx = 1.0 - x; 3683 if (omx < 0.0) { 3684 omx = 0.0; 3685 } 3686 const root = @sqrt(omx); 3687 3688 // 7-degree minimax approximation 3689 var result = mulAdd(@as(f32, -0.0012624911), x, 0.0066700901); 3690 result = mulAdd(result, x, -0.0170881256); 3691 result = mulAdd(result, x, 0.0308918810); 3692 result = mulAdd(result, x, -0.0501743046); 3693 result = mulAdd(result, x, 0.0889789874); 3694 result = mulAdd(result, x, -0.2145988016); 3695 result = root * mulAdd(result, x, 1.5707963050); 3696 3697 return if (v >= 0.0) result else math.pi - result; 3698} 3699test "zmath.acos32" { 3700 const epsilon = 0.1; 3701 3702 try expect(math.approxEqAbs(f32, acos(@as(f32, -1.1)), math.pi, epsilon)); 3703 try expect(math.approxEqAbs(f32, acos(@as(f32, -10000.1)), math.pi, epsilon)); 3704 try expect(math.approxEqAbs(f32, acos(@as(f32, 1.1)), 0.0, epsilon)); 3705 try expect(math.approxEqAbs(f32, acos(@as(f32, 1000.1)), 0.0, epsilon)); 3706 try expect(math.isNan(acos(math.inf(f32)))); 3707 try expect(math.isNan(acos(-math.inf(f32)))); 3708 try expect(math.isNan(acos(math.nan(f32)))); 3709 try expect(math.isNan(acos(-math.nan(f32)))); 3710 3711 try expectVecApproxEqAbs(acos(splat(F32x8, -100.0)), splat(F32x8, math.pi), epsilon); 3712 try expectVecApproxEqAbs(acos(splat(F32x16, 100.0)), splat(F32x16, 0.0), epsilon); 3713 try expect(all(isNan(acos(splat(F32x4, math.inf(f32)))), 0) == true); 3714 try expect(all(isNan(acos(splat(F32x4, -math.inf(f32)))), 0) == true); 3715 try expect(all(isNan(acos(splat(F32x4, math.nan(f32)))), 0) == true); 3716 try expect(all(isNan(acos(splat(F32x4, math.snan(f32)))), 0) == true); 3717 3718 var f: f32 = -1.0; 3719 var i: u32 = 0; 3720 while (i < 8) : (i += 1) { 3721 const r0 = acos32(f); 3722 const r1 = math.acos(f); 3723 const r4 = acos(splat(F32x4, f)); 3724 const r8 = acos(splat(F32x8, f)); 3725 const r16 = acos(splat(F32x16, f)); 3726 try expect(math.approxEqAbs(f32, r0, r1, epsilon)); 3727 try expectVecApproxEqAbs(r4, splat(F32x4, r1), epsilon); 3728 try expectVecApproxEqAbs(r8, splat(F32x8, r1), epsilon); 3729 try expectVecApproxEqAbs(r16, splat(F32x16, r1), epsilon); 3730 f += 0.09 * @as(f32, @floatFromInt(i)); 3731 } 3732} 3733 3734pub fn modAngle32(in_angle: f32) f32 { 3735 const angle = in_angle + math.pi; 3736 var temp: f32 = @abs(angle); 3737 temp = temp - (2.0 * math.pi * @as(f32, @floatFromInt(@as(i32, @intFromFloat(temp / math.pi))))); 3738 temp = temp - math.pi; 3739 if (angle < 0.0) { 3740 temp = -temp; 3741 } 3742 return temp; 3743} 3744 3745pub fn cmulSoa(re0: anytype, im0: anytype, re1: anytype, im1: anytype) [2]@TypeOf(re0, im0, re1, im1) { 3746 const re0_re1 = re0 * re1; 3747 const re0_im1 = re0 * im1; 3748 return .{ 3749 mulAdd(-im0, im1, re0_re1), // re 3750 mulAdd(re1, im0, re0_im1), // im 3751 }; 3752} 3753// ------------------------------------------------------------------------------ 3754// 3755// FFT (implementation based on xdsp.h from DirectXMath) 3756// 3757// ------------------------------------------------------------------------------ 3758fn fftButterflyDit4_1(re0: *F32x4, im0: *F32x4) void { 3759 const re0l = swizzle(re0.*, .x, .x, .y, .y); 3760 const re0h = swizzle(re0.*, .z, .z, .w, .w); 3761 3762 const im0l = swizzle(im0.*, .x, .x, .y, .y); 3763 const im0h = swizzle(im0.*, .z, .z, .w, .w); 3764 3765 const re_temp = mulAdd(re0h, f32x4(1.0, -1.0, 1.0, -1.0), re0l); 3766 const im_temp = mulAdd(im0h, f32x4(1.0, -1.0, 1.0, -1.0), im0l); 3767 3768 const re_shuf0 = @shuffle(f32, re_temp, im_temp, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); 3769 const re_shuf = swizzle(re_shuf0, .x, .w, .x, .w); 3770 const im_shuf = swizzle(re_shuf0, .z, .y, .z, .y); 3771 3772 const re_templ = swizzle(re_temp, .x, .y, .x, .y); 3773 const im_templ = swizzle(im_temp, .x, .y, .x, .y); 3774 3775 re0.* = mulAdd(re_shuf, f32x4(1.0, 1.0, -1.0, -1.0), re_templ); 3776 im0.* = mulAdd(im_shuf, f32x4(1.0, -1.0, -1.0, 1.0), im_templ); 3777} 3778 3779fn fftButterflyDit4_4( 3780 re0: *F32x4, 3781 re1: *F32x4, 3782 re2: *F32x4, 3783 re3: *F32x4, 3784 im0: *F32x4, 3785 im1: *F32x4, 3786 im2: *F32x4, 3787 im3: *F32x4, 3788 unity_table_re: []const F32x4, 3789 unity_table_im: []const F32x4, 3790 stride: u32, 3791 last: bool, 3792) void { 3793 const re_temp0 = re0.* + re2.*; 3794 const im_temp0 = im0.* + im2.*; 3795 3796 const re_temp2 = re1.* + re3.*; 3797 const im_temp2 = im1.* + im3.*; 3798 3799 const re_temp1 = re0.* - re2.*; 3800 const im_temp1 = im0.* - im2.*; 3801 3802 const re_temp3 = re1.* - re3.*; 3803 const im_temp3 = im1.* - im3.*; 3804 3805 var re_temp4 = re_temp0 + re_temp2; 3806 var im_temp4 = im_temp0 + im_temp2; 3807 3808 var re_temp5 = re_temp1 + im_temp3; 3809 var im_temp5 = im_temp1 - re_temp3; 3810 3811 var re_temp6 = re_temp0 - re_temp2; 3812 var im_temp6 = im_temp0 - im_temp2; 3813 3814 var re_temp7 = re_temp1 - im_temp3; 3815 var im_temp7 = im_temp1 + re_temp3; 3816 3817 { 3818 const re_im = cmulSoa(re_temp5, im_temp5, unity_table_re[stride], unity_table_im[stride]); 3819 re_temp5 = re_im[0]; 3820 im_temp5 = re_im[1]; 3821 } 3822 { 3823 const re_im = cmulSoa(re_temp6, im_temp6, unity_table_re[stride * 2], unity_table_im[stride * 2]); 3824 re_temp6 = re_im[0]; 3825 im_temp6 = re_im[1]; 3826 } 3827 { 3828 const re_im = cmulSoa(re_temp7, im_temp7, unity_table_re[stride * 3], unity_table_im[stride * 3]); 3829 re_temp7 = re_im[0]; 3830 im_temp7 = re_im[1]; 3831 } 3832 3833 if (last) { 3834 fftButterflyDit4_1(&re_temp4, &im_temp4); 3835 fftButterflyDit4_1(&re_temp5, &im_temp5); 3836 fftButterflyDit4_1(&re_temp6, &im_temp6); 3837 fftButterflyDit4_1(&re_temp7, &im_temp7); 3838 } 3839 3840 re0.* = re_temp4; 3841 im0.* = im_temp4; 3842 3843 re1.* = re_temp5; 3844 im1.* = im_temp5; 3845 3846 re2.* = re_temp6; 3847 im2.* = im_temp6; 3848 3849 re3.* = re_temp7; 3850 im3.* = im_temp7; 3851} 3852 3853fn fft4(re: []F32x4, im: []F32x4, count: u32) void { 3854 assert(std.math.isPowerOfTwo(count)); 3855 assert(re.len >= count); 3856 assert(im.len >= count); 3857 3858 var index: u32 = 0; 3859 while (index < count) : (index += 1) { 3860 fftButterflyDit4_1(&re[index], &im[index]); 3861 } 3862} 3863test "zmath.fft4" { 3864 const epsilon = 0.0001; 3865 var re = [_]F32x4{f32x4(1.0, 2.0, 3.0, 4.0)}; 3866 var im = [_]F32x4{f32x4s(0.0)}; 3867 fft4(re[0..], im[0..], 1); 3868 3869 var re_uns: [1]F32x4 = undefined; 3870 var im_uns: [1]F32x4 = undefined; 3871 fftUnswizzle(re[0..], re_uns[0..]); 3872 fftUnswizzle(im[0..], im_uns[0..]); 3873 3874 try expectVecApproxEqAbs(re_uns[0], f32x4(10.0, -2.0, -2.0, -2.0), epsilon); 3875 try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 2.0, 0.0, -2.0), epsilon); 3876} 3877 3878fn fft8(re: []F32x4, im: []F32x4, count: u32) void { 3879 assert(std.math.isPowerOfTwo(count)); 3880 assert(re.len >= 2 * count); 3881 assert(im.len >= 2 * count); 3882 3883 var index: u32 = 0; 3884 while (index < count) : (index += 1) { 3885 var pre = re[index * 2 ..]; 3886 var pim = im[index * 2 ..]; 3887 3888 var odds_re = @shuffle(f32, pre[0], pre[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); 3889 var evens_re = @shuffle(f32, pre[0], pre[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); 3890 var odds_im = @shuffle(f32, pim[0], pim[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); 3891 var evens_im = @shuffle(f32, pim[0], pim[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); 3892 fftButterflyDit4_1(&odds_re, &odds_im); 3893 fftButterflyDit4_1(&evens_re, &evens_im); 3894 3895 { 3896 const re_im = cmulSoa( 3897 odds_re, 3898 odds_im, 3899 f32x4(1.0, 0.70710677, 0.0, -0.70710677), 3900 f32x4(0.0, -0.70710677, -1.0, -0.70710677), 3901 ); 3902 pre[0] = evens_re + re_im[0]; 3903 pim[0] = evens_im + re_im[1]; 3904 } 3905 { 3906 const re_im = cmulSoa( 3907 odds_re, 3908 odds_im, 3909 f32x4(-1.0, -0.70710677, 0.0, 0.70710677), 3910 f32x4(0.0, 0.70710677, 1.0, 0.70710677), 3911 ); 3912 pre[1] = evens_re + re_im[0]; 3913 pim[1] = evens_im + re_im[1]; 3914 } 3915 } 3916} 3917test "zmath.fft8" { 3918 const epsilon = 0.0001; 3919 var re = [_]F32x4{ f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0) }; 3920 var im = [_]F32x4{ f32x4s(0.0), f32x4s(0.0) }; 3921 fft8(re[0..], im[0..], 1); 3922 3923 var re_uns: [2]F32x4 = undefined; 3924 var im_uns: [2]F32x4 = undefined; 3925 fftUnswizzle(re[0..], re_uns[0..]); 3926 fftUnswizzle(im[0..], im_uns[0..]); 3927 3928 try expectVecApproxEqAbs(re_uns[0], f32x4(36.0, -4.0, -4.0, -4.0), epsilon); 3929 try expectVecApproxEqAbs(re_uns[1], f32x4(-4.0, -4.0, -4.0, -4.0), epsilon); 3930 try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 9.656854, 4.0, 1.656854), epsilon); 3931 try expectVecApproxEqAbs(im_uns[1], f32x4(0.0, -1.656854, -4.0, -9.656854), epsilon); 3932} 3933 3934fn fft16(re: []F32x4, im: []F32x4, count: u32) void { 3935 assert(std.math.isPowerOfTwo(count)); 3936 assert(re.len >= 4 * count); 3937 assert(im.len >= 4 * count); 3938 3939 const static = struct { 3940 const unity_table_re = [4]F32x4{ 3941 f32x4(1.0, 1.0, 1.0, 1.0), 3942 f32x4(1.0, 0.92387950, 0.70710677, 0.38268343), 3943 f32x4(1.0, 0.70710677, -4.3711388e-008, -0.70710677), 3944 f32x4(1.0, 0.38268343, -0.70710677, -0.92387950), 3945 }; 3946 const unity_table_im = [4]F32x4{ 3947 f32x4(-0.0, -0.0, -0.0, -0.0), 3948 f32x4(-0.0, -0.38268343, -0.70710677, -0.92387950), 3949 f32x4(-0.0, -0.70710677, -1.0, -0.70710677), 3950 f32x4(-0.0, -0.92387950, -0.70710677, 0.38268343), 3951 }; 3952 }; 3953 3954 var index: u32 = 0; 3955 while (index < count) : (index += 1) { 3956 fftButterflyDit4_4( 3957 &re[index * 4], 3958 &re[index * 4 + 1], 3959 &re[index * 4 + 2], 3960 &re[index * 4 + 3], 3961 &im[index * 4], 3962 &im[index * 4 + 1], 3963 &im[index * 4 + 2], 3964 &im[index * 4 + 3], 3965 static.unity_table_re[0..], 3966 static.unity_table_im[0..], 3967 1, 3968 true, 3969 ); 3970 } 3971} 3972test "zmath.fft16" { 3973 const epsilon = 0.0001; 3974 var re = [_]F32x4{ 3975 f32x4(1.0, 2.0, 3.0, 4.0), 3976 f32x4(5.0, 6.0, 7.0, 8.0), 3977 f32x4(9.0, 10.0, 11.0, 12.0), 3978 f32x4(13.0, 14.0, 15.0, 16.0), 3979 }; 3980 var im = [_]F32x4{ f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0) }; 3981 fft16(re[0..], im[0..], 1); 3982 3983 var re_uns: [4]F32x4 = undefined; 3984 var im_uns: [4]F32x4 = undefined; 3985 fftUnswizzle(re[0..], re_uns[0..]); 3986 fftUnswizzle(im[0..], im_uns[0..]); 3987 3988 try expectVecApproxEqAbs(re_uns[0], f32x4(136.0, -8.0, -8.0, -8.0), epsilon); 3989 try expectVecApproxEqAbs(re_uns[1], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon); 3990 try expectVecApproxEqAbs(re_uns[2], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon); 3991 try expectVecApproxEqAbs(re_uns[3], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon); 3992 try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 40.218716, 19.313708, 11.972846), epsilon); 3993 try expectVecApproxEqAbs(im_uns[1], f32x4(8.0, 5.345429, 3.313708, 1.591299), epsilon); 3994 try expectVecApproxEqAbs(im_uns[2], f32x4(0.0, -1.591299, -3.313708, -5.345429), epsilon); 3995 try expectVecApproxEqAbs(im_uns[3], f32x4(-8.0, -11.972846, -19.313708, -40.218716), epsilon); 3996} 3997 3998fn fftN(re: []F32x4, im: []F32x4, unity_table: []const F32x4, length: u32, count: u32) void { 3999 assert(length > 16); 4000 assert(std.math.isPowerOfTwo(length)); 4001 assert(std.math.isPowerOfTwo(count)); 4002 assert(re.len >= length * count / 4); 4003 assert(re.len == im.len); 4004 4005 const total = count * length; 4006 const total_vectors = total / 4; 4007 const stage_vectors = length / 4; 4008 const stage_vectors_mask = stage_vectors - 1; 4009 const stride = length / 16; 4010 const stride_mask = stride - 1; 4011 const stride_inv_mask = ~stride_mask; 4012 4013 var unity_table_re = unity_table; 4014 var unity_table_im = unity_table[length / 4 ..]; 4015 4016 var index: u32 = 0; 4017 while (index < total_vectors / 4) : (index += 1) { 4018 const n = (index & stride_inv_mask) * 4 + (index & stride_mask); 4019 fftButterflyDit4_4( 4020 &re[n], 4021 &re[n + stride], 4022 &re[n + stride * 2], 4023 &re[n + stride * 3], 4024 &im[n], 4025 &im[n + stride], 4026 &im[n + stride * 2], 4027 &im[n + stride * 3], 4028 unity_table_re[(n & stage_vectors_mask)..], 4029 unity_table_im[(n & stage_vectors_mask)..], 4030 stride, 4031 false, 4032 ); 4033 } 4034 4035 if (length > 16 * 4) { 4036 fftN(re, im, unity_table[(length / 2)..], length / 4, count * 4); 4037 } else if (length == 16 * 4) { 4038 fft16(re, im, count * 4); 4039 } else if (length == 8 * 4) { 4040 fft8(re, im, count * 4); 4041 } else if (length == 4 * 4) { 4042 fft4(re, im, count * 4); 4043 } 4044} 4045test "zmath.fftN" { 4046 var unity_table: [128]F32x4 = undefined; 4047 const epsilon = 0.0001; 4048 4049 // 32 samples 4050 { 4051 var re = [_]F32x4{ 4052 f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4053 f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4054 f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4055 f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4056 }; 4057 var im = [_]F32x4{ 4058 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4059 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4060 }; 4061 4062 fftInitUnityTable(unity_table[0..32]); 4063 fft(re[0..], im[0..], unity_table[0..32]); 4064 4065 try expectVecApproxEqAbs(re[0], f32x4(528.0, -16.0, -16.0, -16.0), epsilon); 4066 try expectVecApproxEqAbs(re[1], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); 4067 try expectVecApproxEqAbs(re[2], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); 4068 try expectVecApproxEqAbs(re[3], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); 4069 try expectVecApproxEqAbs(re[4], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); 4070 try expectVecApproxEqAbs(re[5], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); 4071 try expectVecApproxEqAbs(re[6], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); 4072 try expectVecApproxEqAbs(re[7], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); 4073 try expectVecApproxEqAbs(im[0], f32x4(0.0, 162.450726, 80.437432, 52.744931), epsilon); 4074 try expectVecApproxEqAbs(im[1], f32x4(38.627417, 29.933895, 23.945692, 19.496056), epsilon); 4075 try expectVecApproxEqAbs(im[2], f32x4(16.0, 13.130861, 10.690858, 8.552178), epsilon); 4076 try expectVecApproxEqAbs(im[3], f32x4(6.627417, 4.853547, 3.182598, 1.575862), epsilon); 4077 try expectVecApproxEqAbs(im[4], f32x4(0.0, -1.575862, -3.182598, -4.853547), epsilon); 4078 try expectVecApproxEqAbs(im[5], f32x4(-6.627417, -8.552178, -10.690858, -13.130861), epsilon); 4079 try expectVecApproxEqAbs(im[6], f32x4(-16.0, -19.496056, -23.945692, -29.933895), epsilon); 4080 try expectVecApproxEqAbs(im[7], f32x4(-38.627417, -52.744931, -80.437432, -162.450726), epsilon); 4081 } 4082 4083 // 64 samples 4084 { 4085 var re = [_]F32x4{ 4086 f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4087 f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4088 f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4089 f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4090 f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4091 f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4092 f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4093 f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4094 }; 4095 var im = [_]F32x4{ 4096 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4097 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4098 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4099 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4100 }; 4101 4102 fftInitUnityTable(unity_table[0..64]); 4103 fft(re[0..], im[0..], unity_table[0..64]); 4104 4105 try expectVecApproxEqAbs(re[0], f32x4(1056.0, 0.0, -32.0, 0.0), epsilon); 4106 var i: u32 = 1; 4107 while (i < 16) : (i += 1) { 4108 try expectVecApproxEqAbs(re[i], f32x4(-32.0, 0.0, -32.0, 0.0), epsilon); 4109 } 4110 4111 const expected = [_]f32{ 4112 0.0, 0.0, 324.901452, 0.000000, 160.874864, 0.0, 105.489863, 0.000000, 4113 77.254834, 0.0, 59.867789, 0.0, 47.891384, 0.0, 38.992113, 0.0, 4114 32.000000, 0.000000, 26.261721, 0.000000, 21.381716, 0.000000, 17.104356, 0.000000, 4115 13.254834, 0.000000, 9.707094, 0.000000, 6.365196, 0.000000, 3.151725, 0.000000, 4116 0.000000, 0.000000, -3.151725, 0.000000, -6.365196, 0.000000, -9.707094, 0.000000, 4117 -13.254834, 0.000000, -17.104356, 0.000000, -21.381716, 0.000000, -26.261721, 0.000000, 4118 -32.000000, 0.000000, -38.992113, 0.000000, -47.891384, 0.000000, -59.867789, 0.000000, 4119 -77.254834, 0.000000, -105.489863, 0.000000, -160.874864, 0.000000, -324.901452, 0.000000, 4120 }; 4121 for (expected, 0..) |e, ie| { 4122 try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon)); 4123 } 4124 } 4125 4126 // 128 samples 4127 { 4128 var re = [_]F32x4{ 4129 f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4130 f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4131 f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4132 f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4133 f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4134 f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4135 f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4136 f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4137 f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4138 f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4139 f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4140 f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4141 f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4142 f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4143 f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4144 f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4145 }; 4146 var im = [_]F32x4{ 4147 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4148 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4149 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4150 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4151 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4152 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4153 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4154 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4155 }; 4156 4157 fftInitUnityTable(unity_table[0..128]); 4158 fft(re[0..], im[0..], unity_table[0..128]); 4159 4160 try expectVecApproxEqAbs(re[0], f32x4(2112.0, 0.0, 0.0, 0.0), epsilon); 4161 var i: u32 = 1; 4162 while (i < 32) : (i += 1) { 4163 try expectVecApproxEqAbs(re[i], f32x4(-64.0, 0.0, 0.0, 0.0), epsilon); 4164 } 4165 4166 const expected = [_]f32{ 4167 0.000000, 0.000000, 0.000000, 0.000000, 649.802905, 0.000000, 0.000000, 0.000000, 4168 321.749727, 0.000000, 0.000000, 0.000000, 210.979725, 0.000000, 0.000000, 0.000000, 4169 154.509668, 0.000000, 0.000000, 0.000000, 119.735578, 0.000000, 0.000000, 0.000000, 4170 95.782769, 0.000000, 0.000000, 0.000000, 77.984226, 0.000000, 0.000000, 0.000000, 4171 64.000000, 0.000000, 0.000000, 0.000000, 52.523443, 0.000000, 0.000000, 0.000000, 4172 42.763433, 0.000000, 0.000000, 0.000000, 34.208713, 0.000000, 0.000000, 0.000000, 4173 26.509668, 0.000000, 0.000000, 0.000000, 19.414188, 0.000000, 0.000000, 0.000000, 4174 12.730392, 0.000000, 0.000000, 0.000000, 6.303450, 0.000000, 0.000000, 0.000000, 4175 0.000000, 0.000000, 0.000000, 0.000000, -6.303450, 0.000000, 0.000000, 0.000000, 4176 -12.730392, 0.000000, 0.000000, 0.000000, -19.414188, 0.000000, 0.000000, 0.000000, 4177 -26.509668, 0.000000, 0.000000, 0.000000, -34.208713, 0.000000, 0.000000, 0.000000, 4178 -42.763433, 0.000000, 0.000000, 0.000000, -52.523443, 0.000000, 0.000000, 0.000000, 4179 -64.000000, 0.000000, 0.000000, 0.000000, -77.984226, 0.000000, 0.000000, 0.000000, 4180 -95.782769, 0.000000, 0.000000, 0.000000, -119.735578, 0.000000, 0.000000, 0.000000, 4181 -154.509668, 0.000000, 0.000000, 0.000000, -210.979725, 0.000000, 0.000000, 0.000000, 4182 -321.749727, 0.000000, 0.000000, 0.000000, -649.802905, 0.000000, 0.000000, 0.000000, 4183 }; 4184 for (expected, 0..) |e, ie| { 4185 try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon)); 4186 } 4187 } 4188} 4189 4190fn fftUnswizzle(input: []const F32x4, output: []F32x4) void { 4191 assert(std.math.isPowerOfTwo(input.len)); 4192 assert(input.len == output.len); 4193 assert(input.ptr != output.ptr); 4194 4195 const log2_length = std.math.log2_int(usize, input.len * 4); 4196 assert(log2_length >= 2); 4197 4198 const length = input.len; 4199 4200 const f32_output = @as([*]f32, @ptrCast(output.ptr))[0 .. output.len * 4]; 4201 4202 const static = struct { 4203 const swizzle_table = [256]u8{ 4204 0x00, 0x40, 0x80, 0xC0, 0x10, 0x50, 0x90, 0xD0, 0x20, 0x60, 0xA0, 0xE0, 0x30, 0x70, 0xB0, 0xF0, 4205 0x04, 0x44, 0x84, 0xC4, 0x14, 0x54, 0x94, 0xD4, 0x24, 0x64, 0xA4, 0xE4, 0x34, 0x74, 0xB4, 0xF4, 4206 0x08, 0x48, 0x88, 0xC8, 0x18, 0x58, 0x98, 0xD8, 0x28, 0x68, 0xA8, 0xE8, 0x38, 0x78, 0xB8, 0xF8, 4207 0x0C, 0x4C, 0x8C, 0xCC, 0x1C, 0x5C, 0x9C, 0xDC, 0x2C, 0x6C, 0xAC, 0xEC, 0x3C, 0x7C, 0xBC, 0xFC, 4208 0x01, 0x41, 0x81, 0xC1, 0x11, 0x51, 0x91, 0xD1, 0x21, 0x61, 0xA1, 0xE1, 0x31, 0x71, 0xB1, 0xF1, 4209 0x05, 0x45, 0x85, 0xC5, 0x15, 0x55, 0x95, 0xD5, 0x25, 0x65, 0xA5, 0xE5, 0x35, 0x75, 0xB5, 0xF5, 4210 0x09, 0x49, 0x89, 0xC9, 0x19, 0x59, 0x99, 0xD9, 0x29, 0x69, 0xA9, 0xE9, 0x39, 0x79, 0xB9, 0xF9, 4211 0x0D, 0x4D, 0x8D, 0xCD, 0x1D, 0x5D, 0x9D, 0xDD, 0x2D, 0x6D, 0xAD, 0xED, 0x3D, 0x7D, 0xBD, 0xFD, 4212 0x02, 0x42, 0x82, 0xC2, 0x12, 0x52, 0x92, 0xD2, 0x22, 0x62, 0xA2, 0xE2, 0x32, 0x72, 0xB2, 0xF2, 4213 0x06, 0x46, 0x86, 0xC6, 0x16, 0x56, 0x96, 0xD6, 0x26, 0x66, 0xA6, 0xE6, 0x36, 0x76, 0xB6, 0xF6, 4214 0x0A, 0x4A, 0x8A, 0xCA, 0x1A, 0x5A, 0x9A, 0xDA, 0x2A, 0x6A, 0xAA, 0xEA, 0x3A, 0x7A, 0xBA, 0xFA, 4215 0x0E, 0x4E, 0x8E, 0xCE, 0x1E, 0x5E, 0x9E, 0xDE, 0x2E, 0x6E, 0xAE, 0xEE, 0x3E, 0x7E, 0xBE, 0xFE, 4216 0x03, 0x43, 0x83, 0xC3, 0x13, 0x53, 0x93, 0xD3, 0x23, 0x63, 0xA3, 0xE3, 0x33, 0x73, 0xB3, 0xF3, 4217 0x07, 0x47, 0x87, 0xC7, 0x17, 0x57, 0x97, 0xD7, 0x27, 0x67, 0xA7, 0xE7, 0x37, 0x77, 0xB7, 0xF7, 4218 0x0B, 0x4B, 0x8B, 0xCB, 0x1B, 0x5B, 0x9B, 0xDB, 0x2B, 0x6B, 0xAB, 0xEB, 0x3B, 0x7B, 0xBB, 0xFB, 4219 0x0F, 0x4F, 0x8F, 0xCF, 0x1F, 0x5F, 0x9F, 0xDF, 0x2F, 0x6F, 0xAF, 0xEF, 0x3F, 0x7F, 0xBF, 0xFF, 4220 }; 4221 }; 4222 4223 if ((log2_length & 1) == 0) { 4224 const rev32 = @as(u6, @intCast(32 - log2_length)); 4225 var index: usize = 0; 4226 while (index < length) : (index += 1) { 4227 const n = index * 4; 4228 const addr = 4229 (@as(usize, @intCast(static.swizzle_table[n & 0xff])) << 24) | 4230 (@as(usize, @intCast(static.swizzle_table[(n >> 8) & 0xff])) << 16) | 4231 (@as(usize, @intCast(static.swizzle_table[(n >> 16) & 0xff])) << 8) | 4232 @as(usize, @intCast(static.swizzle_table[(n >> 24) & 0xff])); 4233 f32_output[addr >> rev32] = input[index][0]; 4234 f32_output[(0x40000000 | addr) >> rev32] = input[index][1]; 4235 f32_output[(0x80000000 | addr) >> rev32] = input[index][2]; 4236 f32_output[(0xC0000000 | addr) >> rev32] = input[index][3]; 4237 } 4238 } else { 4239 const rev7 = @as(usize, 1) << @as(u6, @intCast(log2_length - 3)); 4240 const rev32 = @as(u6, @intCast(32 - (log2_length - 3))); 4241 var index: usize = 0; 4242 while (index < length) : (index += 1) { 4243 const n = index / 2; 4244 var addr = 4245 (((@as(usize, @intCast(static.swizzle_table[n & 0xff])) << 24) | 4246 (@as(usize, @intCast(static.swizzle_table[(n >> 8) & 0xff])) << 16) | 4247 (@as(usize, @intCast(static.swizzle_table[(n >> 16) & 0xff])) << 8) | 4248 (@as(usize, @intCast(static.swizzle_table[(n >> 24) & 0xff])))) >> rev32) | 4249 ((index & 1) * rev7 * 4); 4250 f32_output[addr] = input[index][0]; 4251 addr += rev7; 4252 f32_output[addr] = input[index][1]; 4253 addr += rev7; 4254 f32_output[addr] = input[index][2]; 4255 addr += rev7; 4256 f32_output[addr] = input[index][3]; 4257 } 4258 } 4259} 4260 4261pub fn fftInitUnityTable(out_unity_table: []F32x4) void { 4262 assert(std.math.isPowerOfTwo(out_unity_table.len)); 4263 assert(out_unity_table.len >= 32 and out_unity_table.len <= 512); 4264 4265 var unity_table = out_unity_table; 4266 4267 const v0123 = f32x4(0.0, 1.0, 2.0, 3.0); 4268 var length = out_unity_table.len / 4; 4269 var vlstep = f32x4s(0.5 * math.pi / @as(f32, @floatFromInt(length))); 4270 4271 while (true) { 4272 length /= 4; 4273 var vjp = v0123; 4274 4275 var j: u32 = 0; 4276 while (j < length) : (j += 1) { 4277 unity_table[j] = f32x4s(1.0); 4278 unity_table[j + length * 4] = f32x4s(0.0); 4279 4280 var vls = vjp * vlstep; 4281 var sin_cos = sincos(vls); 4282 unity_table[j + length] = sin_cos[1]; 4283 unity_table[j + length * 5] = sin_cos[0] * f32x4s(-1.0); 4284 4285 var vijp = vjp + vjp; 4286 vls = vijp * vlstep; 4287 sin_cos = sincos(vls); 4288 unity_table[j + length * 2] = sin_cos[1]; 4289 unity_table[j + length * 6] = sin_cos[0] * f32x4s(-1.0); 4290 4291 vijp = vijp + vjp; 4292 vls = vijp * vlstep; 4293 sin_cos = sincos(vls); 4294 unity_table[j + length * 3] = sin_cos[1]; 4295 unity_table[j + length * 7] = sin_cos[0] * f32x4s(-1.0); 4296 4297 vjp += f32x4s(4.0); 4298 } 4299 vlstep *= f32x4s(4.0); 4300 unity_table = unity_table[8 * length ..]; 4301 4302 if (length <= 4) 4303 break; 4304 } 4305} 4306 4307pub fn fft(re: []F32x4, im: []F32x4, unity_table: []const F32x4) void { 4308 const length = @as(u32, @intCast(re.len * 4)); 4309 assert(std.math.isPowerOfTwo(length)); 4310 assert(length >= 4 and length <= 512); 4311 assert(re.len == im.len); 4312 4313 var re_temp_storage: [128]F32x4 = undefined; 4314 var im_temp_storage: [128]F32x4 = undefined; 4315 const re_temp = re_temp_storage[0..re.len]; 4316 const im_temp = im_temp_storage[0..im.len]; 4317 4318 @memcpy(re_temp, re); 4319 @memcpy(im_temp, im); 4320 4321 if (length > 16) { 4322 assert(unity_table.len == length); 4323 fftN(re_temp, im_temp, unity_table, length, 1); 4324 } else if (length == 16) { 4325 fft16(re_temp, im_temp, 1); 4326 } else if (length == 8) { 4327 fft8(re_temp, im_temp, 1); 4328 } else if (length == 4) { 4329 fft4(re_temp, im_temp, 1); 4330 } 4331 4332 fftUnswizzle(re_temp, re); 4333 fftUnswizzle(im_temp, im); 4334} 4335 4336pub fn ifft(re: []F32x4, im: []const F32x4, unity_table: []const F32x4) void { 4337 const length = @as(u32, @intCast(re.len * 4)); 4338 assert(std.math.isPowerOfTwo(length)); 4339 assert(length >= 4 and length <= 512); 4340 assert(re.len == im.len); 4341 4342 var re_temp_storage: [128]F32x4 = undefined; 4343 var im_temp_storage: [128]F32x4 = undefined; 4344 var re_temp = re_temp_storage[0..re.len]; 4345 var im_temp = im_temp_storage[0..im.len]; 4346 4347 const rnp = f32x4s(1.0 / @as(f32, @floatFromInt(length))); 4348 const rnm = f32x4s(-1.0 / @as(f32, @floatFromInt(length))); 4349 4350 for (re, 0..) |_, i| { 4351 re_temp[i] = re[i] * rnp; 4352 im_temp[i] = im[i] * rnm; 4353 } 4354 4355 if (length > 16) { 4356 assert(unity_table.len == length); 4357 fftN(re_temp, im_temp, unity_table, length, 1); 4358 } else if (length == 16) { 4359 fft16(re_temp, im_temp, 1); 4360 } else if (length == 8) { 4361 fft8(re_temp, im_temp, 1); 4362 } else if (length == 4) { 4363 fft4(re_temp, im_temp, 1); 4364 } 4365 4366 fftUnswizzle(re_temp, re); 4367} 4368test "zmath.ifft" { 4369 var unity_table: [512]F32x4 = undefined; 4370 const epsilon = 0.0001; 4371 4372 // 64 samples 4373 { 4374 var re = [_]F32x4{ 4375 f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4376 f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4377 f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4378 f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4379 f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4380 f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4381 f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4382 f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4383 }; 4384 var im = [_]F32x4{ 4385 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4386 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4387 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4388 f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4389 }; 4390 4391 fftInitUnityTable(unity_table[0..64]); 4392 fft(re[0..], im[0..], unity_table[0..64]); 4393 4394 try expectVecApproxEqAbs(re[0], f32x4(1056.0, 0.0, -32.0, 0.0), epsilon); 4395 var i: u32 = 1; 4396 while (i < 16) : (i += 1) { 4397 try expectVecApproxEqAbs(re[i], f32x4(-32.0, 0.0, -32.0, 0.0), epsilon); 4398 } 4399 4400 ifft(re[0..], im[0..], unity_table[0..64]); 4401 4402 try expectVecApproxEqAbs(re[0], f32x4(1.0, 2.0, 3.0, 4.0), epsilon); 4403 try expectVecApproxEqAbs(re[1], f32x4(5.0, 6.0, 7.0, 8.0), epsilon); 4404 try expectVecApproxEqAbs(re[2], f32x4(9.0, 10.0, 11.0, 12.0), epsilon); 4405 try expectVecApproxEqAbs(re[3], f32x4(13.0, 14.0, 15.0, 16.0), epsilon); 4406 try expectVecApproxEqAbs(re[4], f32x4(17.0, 18.0, 19.0, 20.0), epsilon); 4407 try expectVecApproxEqAbs(re[5], f32x4(21.0, 22.0, 23.0, 24.0), epsilon); 4408 try expectVecApproxEqAbs(re[6], f32x4(25.0, 26.0, 27.0, 28.0), epsilon); 4409 try expectVecApproxEqAbs(re[7], f32x4(29.0, 30.0, 31.0, 32.0), epsilon); 4410 } 4411 4412 // 512 samples 4413 { 4414 var re: [128]F32x4 = undefined; 4415 var im = [_]F32x4{f32x4s(0.0)} ** 128; 4416 4417 for (&re, 0..) |*v, i| { 4418 const f = @as(f32, @floatFromInt(i * 4)); 4419 v.* = f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0); 4420 } 4421 4422 fftInitUnityTable(unity_table[0..512]); 4423 fft(re[0..], im[0..], unity_table[0..512]); 4424 4425 for (re, 0..) |v, i| { 4426 const f = @as(f32, @floatFromInt(i * 4)); 4427 try expect(!approxEqAbs(v, f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0), epsilon)); 4428 } 4429 4430 ifft(re[0..], im[0..], unity_table[0..512]); 4431 4432 for (re, 0..) |v, i| { 4433 const f = @as(f32, @floatFromInt(i * 4)); 4434 try expectVecApproxEqAbs(v, f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0), epsilon); 4435 } 4436 } 4437} 4438// ------------------------------------------------------------------------------ 4439// 4440// Private functions and constants 4441// 4442// ------------------------------------------------------------------------------ 4443const f32x4_sign_mask1: F32x4 = F32x4{ @as(f32, @bitCast(@as(u32, 0x8000_0000))), 0, 0, 0 }; 4444const f32x4_mask2: F32x4 = F32x4{ 4445 @as(f32, @bitCast(@as(u32, 0xffff_ffff))), 4446 @as(f32, @bitCast(@as(u32, 0xffff_ffff))), 4447 0, 4448 0, 4449}; 4450const f32x4_mask3: F32x4 = F32x4{ 4451 @as(f32, @bitCast(@as(u32, 0xffff_ffff))), 4452 @as(f32, @bitCast(@as(u32, 0xffff_ffff))), 4453 @as(f32, @bitCast(@as(u32, 0xffff_ffff))), 4454 0, 4455}; 4456 4457inline fn splatNegativeZero(comptime T: type) T { 4458 return @splat(@as(f32, @bitCast(@as(u32, 0x8000_0000)))); 4459} 4460inline fn splatNoFraction(comptime T: type) T { 4461 return @splat(@as(f32, 8_388_608.0)); 4462} 4463inline fn splatAbsMask(comptime T: type) T { 4464 return @splat(@as(f32, @bitCast(@as(u32, 0x7fff_ffff)))); 4465} 4466 4467fn floatToIntAndBack(v: anytype) @TypeOf(v) { 4468 // This routine won't handle nan, inf and numbers greater than 8_388_608.0 (will generate undefined values). 4469 @setRuntimeSafety(false); 4470 4471 const T = @TypeOf(v); 4472 const len = veclen(T); 4473 4474 var vi32: [len]i32 = undefined; 4475 comptime var i: u32 = 0; 4476 // vcvttps2dq 4477 inline while (i < len) : (i += 1) { 4478 vi32[i] = @as(i32, @intFromFloat(v[i])); 4479 } 4480 4481 var vf32: [len]f32 = undefined; 4482 i = 0; 4483 // vcvtdq2ps 4484 inline while (i < len) : (i += 1) { 4485 vf32[i] = @as(f32, @floatFromInt(vi32[i])); 4486 } 4487 4488 return vf32; 4489} 4490test "zmath.floatToIntAndBack" { 4491 { 4492 const v = floatToIntAndBack(f32x4(1.1, 2.9, 3.0, -4.5)); 4493 try expectVecEqual(v, f32x4(1.0, 2.0, 3.0, -4.0)); 4494 } 4495 { 4496 const v = floatToIntAndBack(f32x8(1.1, 2.9, 3.0, -4.5, 2.5, -2.5, 1.1, -100.2)); 4497 try expectVecEqual(v, f32x8(1.0, 2.0, 3.0, -4.0, 2.0, -2.0, 1.0, -100.0)); 4498 } 4499 { 4500 const v = floatToIntAndBack(f32x4(math.inf(f32), 2.9, math.nan(f32), math.snan(f32))); 4501 try expect(v[1] == 2.0); 4502 } 4503} 4504 4505pub fn expectVecEqual(expected: anytype, actual: anytype) !void { 4506 const T = @TypeOf(expected, actual); 4507 inline for (0..veclen(T)) |i| { 4508 try std.testing.expectEqual(expected[i], actual[i]); 4509 } 4510} 4511 4512pub fn expectVecApproxEqAbs(expected: anytype, actual: anytype, eps: f32) !void { 4513 const T = @TypeOf(expected, actual); 4514 inline for (0..veclen(T)) |i| { 4515 try std.testing.expectApproxEqAbs(expected[i], actual[i], eps); 4516 } 4517} 4518 4519pub fn approxEqAbs(v0: anytype, v1: anytype, eps: f32) bool { 4520 const T = @TypeOf(v0, v1); 4521 comptime var i: comptime_int = 0; 4522 inline while (i < veclen(T)) : (i += 1) { 4523 if (!math.approxEqAbs(f32, v0[i], v1[i], eps)) { 4524 return false; 4525 } 4526 } 4527 return true; 4528} 4529 4530// ------------------------------------------------------------------------------ 4531// This software is available under 2 licenses -- choose whichever you prefer. 4532// ------------------------------------------------------------------------------ 4533// ALTERNATIVE A - MIT License 4534// Copyright (c) 2022 Michal Ziulek and Contributors 4535// Permission is hereby granted, free of charge, to any person obtaining a copy of 4536// this software and associated documentation files (the "Software"), to deal in 4537// the Software without restriction, including without limitation the rights to 4538// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 4539// of the Software, and to permit persons to whom the Software is furnished to do 4540// so, subject to the following conditions: 4541// The above copyright notice and this permission notice shall be included in all 4542// copies or substantial portions of the Software. 4543// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 4544// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 4545// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 4546// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 4547// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 4548// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 4549// SOFTWARE. 4550// ------------------------------------------------------------------------------ 4551// ALTERNATIVE B - Public Domain (www.unlicense.org) 4552// This is free and unencumbered software released into the public domain. 4553// Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 4554// software, either in source code form or as a compiled binary, for any purpose, 4555// commercial or non-commercial, and by any means. 4556// In jurisdictions that recognize copyright laws, the author or authors of this 4557// software dedicate any and all copyright interest in the software to the public 4558// domain. We make this dedication for the benefit of the public at large and to 4559// the detriment of our heirs and successors. We intend this dedication to be an 4560// overt act of relinquishment in perpetuity of all present and future rights to 4561// this software under copyright law. 4562// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 4563// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 4564// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 4565// AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 4566// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 4567// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 4568// ------------------------------------------------------------------------------