1#include "TracyDxt1.hpp"
2#include "../common/TracyForceInline.hpp"
3
4#include <assert.h>
5#include <stdint.h>
6#include <string.h>
7
8#ifdef __ARM_NEON
9# include <arm_neon.h>
10#endif
11
12#if defined __AVX__ && !defined __SSE4_1__
13# define __SSE4_1__
14#endif
15
16#if defined __SSE4_1__ || defined __AVX2__
17# ifdef _MSC_VER
18# include <intrin.h>
19# else
20# include <x86intrin.h>
21# ifndef _mm256_cvtsi256_si32
22# define _mm256_cvtsi256_si32( v ) ( _mm_cvtsi128_si32( _mm256_castsi256_si128( v ) ) )
23# endif
24# endif
25#endif
26
27namespace tracy
28{
29
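// Pack a color into the 16-bit 5:6:5 format used for DXT1 endpoints; the second
// overload takes a 32-bit pixel word with R in the lowest byte.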
static inline uint16_t to565( uint8_t r, uint8_t g, uint8_t b )
{
    return ( ( r & 0xF8 ) << 8 ) | ( ( g & 0xFC ) << 3 ) | ( b >> 3 );
}

static inline uint16_t to565( uint32_t c )
{
    return
        ( ( c & 0xF80000 ) >> 19 ) |
        ( ( c & 0x00FC00 ) >> 5 ) |
        ( ( c & 0x0000F8 ) << 8 );
}

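// Scale table used to turn a block's summed channel range into 2-bit indices:
// DivTable[i] is roughly (4 << 16) / (i + 1), clamped to 0xFFFF, so that
// ( sum * DivTable[range] ) >> 16 lands in 0..3.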
static const uint16_t DivTable[255*3+1] = {
    0xffff, 0xffff, 0xffff, 0xffff, 0xcccc, 0xaaaa, 0x9249, 0x8000, 0x71c7, 0x6666, 0x5d17, 0x5555, 0x4ec4, 0x4924, 0x4444, 0x4000,
    0x3c3c, 0x38e3, 0x35e5, 0x3333, 0x30c3, 0x2e8b, 0x2c85, 0x2aaa, 0x28f5, 0x2762, 0x25ed, 0x2492, 0x234f, 0x2222, 0x2108, 0x2000,
    0x1f07, 0x1e1e, 0x1d41, 0x1c71, 0x1bac, 0x1af2, 0x1a41, 0x1999, 0x18f9, 0x1861, 0x17d0, 0x1745, 0x16c1, 0x1642, 0x15c9, 0x1555,
    0x14e5, 0x147a, 0x1414, 0x13b1, 0x1352, 0x12f6, 0x129e, 0x1249, 0x11f7, 0x11a7, 0x115b, 0x1111, 0x10c9, 0x1084, 0x1041, 0x1000,
    0x0fc0, 0x0f83, 0x0f48, 0x0f0f, 0x0ed7, 0x0ea0, 0x0e6c, 0x0e38, 0x0e07, 0x0dd6, 0x0da7, 0x0d79, 0x0d4c, 0x0d20, 0x0cf6, 0x0ccc,
    0x0ca4, 0x0c7c, 0x0c56, 0x0c30, 0x0c0c, 0x0be8, 0x0bc5, 0x0ba2, 0x0b81, 0x0b60, 0x0b40, 0x0b21, 0x0b02, 0x0ae4, 0x0ac7, 0x0aaa,
    0x0a8e, 0x0a72, 0x0a57, 0x0a3d, 0x0a23, 0x0a0a, 0x09f1, 0x09d8, 0x09c0, 0x09a9, 0x0991, 0x097b, 0x0964, 0x094f, 0x0939, 0x0924,
    0x090f, 0x08fb, 0x08e7, 0x08d3, 0x08c0, 0x08ad, 0x089a, 0x0888, 0x0876, 0x0864, 0x0853, 0x0842, 0x0831, 0x0820, 0x0810, 0x0800,
    0x07f0, 0x07e0, 0x07d1, 0x07c1, 0x07b3, 0x07a4, 0x0795, 0x0787, 0x0779, 0x076b, 0x075d, 0x0750, 0x0743, 0x0736, 0x0729, 0x071c,
    0x070f, 0x0703, 0x06f7, 0x06eb, 0x06df, 0x06d3, 0x06c8, 0x06bc, 0x06b1, 0x06a6, 0x069b, 0x0690, 0x0685, 0x067b, 0x0670, 0x0666,
    0x065c, 0x0652, 0x0648, 0x063e, 0x0634, 0x062b, 0x0621, 0x0618, 0x060f, 0x0606, 0x05fd, 0x05f4, 0x05eb, 0x05e2, 0x05d9, 0x05d1,
    0x05c9, 0x05c0, 0x05b8, 0x05b0, 0x05a8, 0x05a0, 0x0598, 0x0590, 0x0588, 0x0581, 0x0579, 0x0572, 0x056b, 0x0563, 0x055c, 0x0555,
    0x054e, 0x0547, 0x0540, 0x0539, 0x0532, 0x052b, 0x0525, 0x051e, 0x0518, 0x0511, 0x050b, 0x0505, 0x04fe, 0x04f8, 0x04f2, 0x04ec,
    0x04e6, 0x04e0, 0x04da, 0x04d4, 0x04ce, 0x04c8, 0x04c3, 0x04bd, 0x04b8, 0x04b2, 0x04ad, 0x04a7, 0x04a2, 0x049c, 0x0497, 0x0492,
    0x048d, 0x0487, 0x0482, 0x047d, 0x0478, 0x0473, 0x046e, 0x0469, 0x0465, 0x0460, 0x045b, 0x0456, 0x0452, 0x044d, 0x0448, 0x0444,
    0x043f, 0x043b, 0x0436, 0x0432, 0x042d, 0x0429, 0x0425, 0x0421, 0x041c, 0x0418, 0x0414, 0x0410, 0x040c, 0x0408, 0x0404, 0x0400,
    0x03fc, 0x03f8, 0x03f4, 0x03f0, 0x03ec, 0x03e8, 0x03e4, 0x03e0, 0x03dd, 0x03d9, 0x03d5, 0x03d2, 0x03ce, 0x03ca, 0x03c7, 0x03c3,
    0x03c0, 0x03bc, 0x03b9, 0x03b5, 0x03b2, 0x03ae, 0x03ab, 0x03a8, 0x03a4, 0x03a1, 0x039e, 0x039b, 0x0397, 0x0394, 0x0391, 0x038e,
    0x038b, 0x0387, 0x0384, 0x0381, 0x037e, 0x037b, 0x0378, 0x0375, 0x0372, 0x036f, 0x036c, 0x0369, 0x0366, 0x0364, 0x0361, 0x035e,
    0x035b, 0x0358, 0x0355, 0x0353, 0x0350, 0x034d, 0x034a, 0x0348, 0x0345, 0x0342, 0x0340, 0x033d, 0x033a, 0x0338, 0x0335, 0x0333,
    0x0330, 0x032e, 0x032b, 0x0329, 0x0326, 0x0324, 0x0321, 0x031f, 0x031c, 0x031a, 0x0317, 0x0315, 0x0313, 0x0310, 0x030e, 0x030c,
    0x0309, 0x0307, 0x0305, 0x0303, 0x0300, 0x02fe, 0x02fc, 0x02fa, 0x02f7, 0x02f5, 0x02f3, 0x02f1, 0x02ef, 0x02ec, 0x02ea, 0x02e8,
    0x02e6, 0x02e4, 0x02e2, 0x02e0, 0x02de, 0x02dc, 0x02da, 0x02d8, 0x02d6, 0x02d4, 0x02d2, 0x02d0, 0x02ce, 0x02cc, 0x02ca, 0x02c8,
    0x02c6, 0x02c4, 0x02c2, 0x02c0, 0x02be, 0x02bc, 0x02bb, 0x02b9, 0x02b7, 0x02b5, 0x02b3, 0x02b1, 0x02b0, 0x02ae, 0x02ac, 0x02aa,
    0x02a8, 0x02a7, 0x02a5, 0x02a3, 0x02a1, 0x02a0, 0x029e, 0x029c, 0x029b, 0x0299, 0x0297, 0x0295, 0x0294, 0x0292, 0x0291, 0x028f,
    0x028d, 0x028c, 0x028a, 0x0288, 0x0287, 0x0285, 0x0284, 0x0282, 0x0280, 0x027f, 0x027d, 0x027c, 0x027a, 0x0279, 0x0277, 0x0276,
    0x0274, 0x0273, 0x0271, 0x0270, 0x026e, 0x026d, 0x026b, 0x026a, 0x0268, 0x0267, 0x0265, 0x0264, 0x0263, 0x0261, 0x0260, 0x025e,
    0x025d, 0x025c, 0x025a, 0x0259, 0x0257, 0x0256, 0x0255, 0x0253, 0x0252, 0x0251, 0x024f, 0x024e, 0x024d, 0x024b, 0x024a, 0x0249,
    0x0247, 0x0246, 0x0245, 0x0243, 0x0242, 0x0241, 0x0240, 0x023e, 0x023d, 0x023c, 0x023b, 0x0239, 0x0238, 0x0237, 0x0236, 0x0234,
    0x0233, 0x0232, 0x0231, 0x0230, 0x022e, 0x022d, 0x022c, 0x022b, 0x022a, 0x0229, 0x0227, 0x0226, 0x0225, 0x0224, 0x0223, 0x0222,
    0x0220, 0x021f, 0x021e, 0x021d, 0x021c, 0x021b, 0x021a, 0x0219, 0x0218, 0x0216, 0x0215, 0x0214, 0x0213, 0x0212, 0x0211, 0x0210,
    0x020f, 0x020e, 0x020d, 0x020c, 0x020b, 0x020a, 0x0209, 0x0208, 0x0207, 0x0206, 0x0205, 0x0204, 0x0203, 0x0202, 0x0201, 0x0200,
    0x01ff, 0x01fe, 0x01fd, 0x01fc, 0x01fb, 0x01fa, 0x01f9, 0x01f8, 0x01f7, 0x01f6, 0x01f5, 0x01f4, 0x01f3, 0x01f2, 0x01f1, 0x01f0,
    0x01ef, 0x01ee, 0x01ed, 0x01ec, 0x01eb, 0x01ea, 0x01e9, 0x01e9, 0x01e8, 0x01e7, 0x01e6, 0x01e5, 0x01e4, 0x01e3, 0x01e2, 0x01e1,
    0x01e0, 0x01e0, 0x01df, 0x01de, 0x01dd, 0x01dc, 0x01db, 0x01da, 0x01da, 0x01d9, 0x01d8, 0x01d7, 0x01d6, 0x01d5, 0x01d4, 0x01d4,
    0x01d3, 0x01d2, 0x01d1, 0x01d0, 0x01cf, 0x01cf, 0x01ce, 0x01cd, 0x01cc, 0x01cb, 0x01cb, 0x01ca, 0x01c9, 0x01c8, 0x01c7, 0x01c7,
    0x01c6, 0x01c5, 0x01c4, 0x01c3, 0x01c3, 0x01c2, 0x01c1, 0x01c0, 0x01c0, 0x01bf, 0x01be, 0x01bd, 0x01bd, 0x01bc, 0x01bb, 0x01ba,
    0x01ba, 0x01b9, 0x01b8, 0x01b7, 0x01b7, 0x01b6, 0x01b5, 0x01b4, 0x01b4, 0x01b3, 0x01b2, 0x01b2, 0x01b1, 0x01b0, 0x01af, 0x01af,
    0x01ae, 0x01ad, 0x01ad, 0x01ac, 0x01ab, 0x01aa, 0x01aa, 0x01a9, 0x01a8, 0x01a8, 0x01a7, 0x01a6, 0x01a6, 0x01a5, 0x01a4, 0x01a4,
    0x01a3, 0x01a2, 0x01a2, 0x01a1, 0x01a0, 0x01a0, 0x019f, 0x019e, 0x019e, 0x019d, 0x019c, 0x019c, 0x019b, 0x019a, 0x019a, 0x0199,
    0x0198, 0x0198, 0x0197, 0x0197, 0x0196, 0x0195, 0x0195, 0x0194, 0x0193, 0x0193, 0x0192, 0x0192, 0x0191, 0x0190, 0x0190, 0x018f,
    0x018f, 0x018e, 0x018d, 0x018d, 0x018c, 0x018b, 0x018b, 0x018a, 0x018a, 0x0189, 0x0189, 0x0188, 0x0187, 0x0187, 0x0186, 0x0186,
    0x0185, 0x0184, 0x0184, 0x0183, 0x0183, 0x0182, 0x0182, 0x0181, 0x0180, 0x0180, 0x017f, 0x017f, 0x017e, 0x017e, 0x017d, 0x017d,
    0x017c, 0x017b, 0x017b, 0x017a, 0x017a, 0x0179, 0x0179, 0x0178, 0x0178, 0x0177, 0x0177, 0x0176, 0x0175, 0x0175, 0x0174, 0x0174,
    0x0173, 0x0173, 0x0172, 0x0172, 0x0171, 0x0171, 0x0170, 0x0170, 0x016f, 0x016f, 0x016e, 0x016e, 0x016d, 0x016d, 0x016c, 0x016c,
    0x016b, 0x016b, 0x016a, 0x016a, 0x0169, 0x0169, 0x0168, 0x0168, 0x0167, 0x0167, 0x0166, 0x0166, 0x0165, 0x0165, 0x0164, 0x0164,
    0x0163, 0x0163, 0x0162, 0x0162, 0x0161, 0x0161, 0x0160, 0x0160, 0x015f, 0x015f, 0x015e, 0x015e, 0x015d, 0x015d, 0x015d, 0x015c,
    0x015c, 0x015b, 0x015b, 0x015a, 0x015a, 0x0159, 0x0159, 0x0158, 0x0158, 0x0158, 0x0157, 0x0157, 0x0156, 0x0156
};

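// Variant of DivTable for the AArch64 path below: values are roughly
// (2 << 16) / (i + 1), pre-halved because vqdmulhq_s16 doubles the product
// before taking the high half.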
#if defined __ARM_NEON && defined __aarch64__
static const uint16_t DivTableNEON[255*3+1] = {
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x1c71, 0x1af2, 0x1999, 0x1861, 0x1745, 0x1642, 0x1555, 0x147a, 0x13b1, 0x12f6, 0x1249, 0x11a7, 0x1111, 0x1084, 0x1000,
    0x0f83, 0x0f0f, 0x0ea0, 0x0e38, 0x0dd6, 0x0d79, 0x0d20, 0x0ccc, 0x0c7c, 0x0c30, 0x0be8, 0x0ba2, 0x0b60, 0x0b21, 0x0ae4, 0x0aaa,
    0x0a72, 0x0a3d, 0x0a0a, 0x09d8, 0x09a9, 0x097b, 0x094f, 0x0924, 0x08fb, 0x08d3, 0x08ad, 0x0888, 0x0864, 0x0842, 0x0820, 0x0800,
    0x07e0, 0x07c1, 0x07a4, 0x0787, 0x076b, 0x0750, 0x0736, 0x071c, 0x0703, 0x06eb, 0x06d3, 0x06bc, 0x06a6, 0x0690, 0x067b, 0x0666,
    0x0652, 0x063e, 0x062b, 0x0618, 0x0606, 0x05f4, 0x05e2, 0x05d1, 0x05c0, 0x05b0, 0x05a0, 0x0590, 0x0581, 0x0572, 0x0563, 0x0555,
    0x0547, 0x0539, 0x052b, 0x051e, 0x0511, 0x0505, 0x04f8, 0x04ec, 0x04e0, 0x04d4, 0x04c8, 0x04bd, 0x04b2, 0x04a7, 0x049c, 0x0492,
    0x0487, 0x047d, 0x0473, 0x0469, 0x0460, 0x0456, 0x044d, 0x0444, 0x043b, 0x0432, 0x0429, 0x0421, 0x0418, 0x0410, 0x0408, 0x0400,
    0x03f8, 0x03f0, 0x03e8, 0x03e0, 0x03d9, 0x03d2, 0x03ca, 0x03c3, 0x03bc, 0x03b5, 0x03ae, 0x03a8, 0x03a1, 0x039b, 0x0394, 0x038e,
    0x0387, 0x0381, 0x037b, 0x0375, 0x036f, 0x0369, 0x0364, 0x035e, 0x0358, 0x0353, 0x034d, 0x0348, 0x0342, 0x033d, 0x0338, 0x0333,
    0x032e, 0x0329, 0x0324, 0x031f, 0x031a, 0x0315, 0x0310, 0x030c, 0x0307, 0x0303, 0x02fe, 0x02fa, 0x02f5, 0x02f1, 0x02ec, 0x02e8,
    0x02e4, 0x02e0, 0x02dc, 0x02d8, 0x02d4, 0x02d0, 0x02cc, 0x02c8, 0x02c4, 0x02c0, 0x02bc, 0x02b9, 0x02b5, 0x02b1, 0x02ae, 0x02aa,
    0x02a7, 0x02a3, 0x02a0, 0x029c, 0x0299, 0x0295, 0x0292, 0x028f, 0x028c, 0x0288, 0x0285, 0x0282, 0x027f, 0x027c, 0x0279, 0x0276,
    0x0273, 0x0270, 0x026d, 0x026a, 0x0267, 0x0264, 0x0261, 0x025e, 0x025c, 0x0259, 0x0256, 0x0253, 0x0251, 0x024e, 0x024b, 0x0249,
    0x0246, 0x0243, 0x0241, 0x023e, 0x023c, 0x0239, 0x0237, 0x0234, 0x0232, 0x0230, 0x022d, 0x022b, 0x0229, 0x0226, 0x0224, 0x0222,
    0x021f, 0x021d, 0x021b, 0x0219, 0x0216, 0x0214, 0x0212, 0x0210, 0x020e, 0x020c, 0x020a, 0x0208, 0x0206, 0x0204, 0x0202, 0x0200,
    0x01fe, 0x01fc, 0x01fa, 0x01f8, 0x01f6, 0x01f4, 0x01f2, 0x01f0, 0x01ee, 0x01ec, 0x01ea, 0x01e9, 0x01e7, 0x01e5, 0x01e3, 0x01e1,
    0x01e0, 0x01de, 0x01dc, 0x01da, 0x01d9, 0x01d7, 0x01d5, 0x01d4, 0x01d2, 0x01d0, 0x01cf, 0x01cd, 0x01cb, 0x01ca, 0x01c8, 0x01c7,
    0x01c5, 0x01c3, 0x01c2, 0x01c0, 0x01bf, 0x01bd, 0x01bc, 0x01ba, 0x01b9, 0x01b7, 0x01b6, 0x01b4, 0x01b3, 0x01b2, 0x01b0, 0x01af,
    0x01ad, 0x01ac, 0x01aa, 0x01a9, 0x01a8, 0x01a6, 0x01a5, 0x01a4, 0x01a2, 0x01a1, 0x01a0, 0x019e, 0x019d, 0x019c, 0x019a, 0x0199,
    0x0198, 0x0197, 0x0195, 0x0194, 0x0193, 0x0192, 0x0190, 0x018f, 0x018e, 0x018d, 0x018b, 0x018a, 0x0189, 0x0188, 0x0187, 0x0186,
    0x0184, 0x0183, 0x0182, 0x0181, 0x0180, 0x017f, 0x017e, 0x017d, 0x017b, 0x017a, 0x0179, 0x0178, 0x0177, 0x0176, 0x0175, 0x0174,
    0x0173, 0x0172, 0x0171, 0x0170, 0x016f, 0x016e, 0x016d, 0x016c, 0x016b, 0x016a, 0x0169, 0x0168, 0x0167, 0x0166, 0x0165, 0x0164,
    0x0163, 0x0162, 0x0161, 0x0160, 0x015f, 0x015e, 0x015d, 0x015c, 0x015b, 0x015a, 0x0159, 0x0158, 0x0158, 0x0157, 0x0156, 0x0155,
    0x0154, 0x0153, 0x0152, 0x0151, 0x0150, 0x0150, 0x014f, 0x014e, 0x014d, 0x014c, 0x014b, 0x014a, 0x014a, 0x0149, 0x0148, 0x0147,
    0x0146, 0x0146, 0x0145, 0x0144, 0x0143, 0x0142, 0x0142, 0x0141, 0x0140, 0x013f, 0x013e, 0x013e, 0x013d, 0x013c, 0x013b, 0x013b,
    0x013a, 0x0139, 0x0138, 0x0138, 0x0137, 0x0136, 0x0135, 0x0135, 0x0134, 0x0133, 0x0132, 0x0132, 0x0131, 0x0130, 0x0130, 0x012f,
    0x012e, 0x012e, 0x012d, 0x012c, 0x012b, 0x012b, 0x012a, 0x0129, 0x0129, 0x0128, 0x0127, 0x0127, 0x0126, 0x0125, 0x0125, 0x0124,
    0x0123, 0x0123, 0x0122, 0x0121, 0x0121, 0x0120, 0x0120, 0x011f, 0x011e, 0x011e, 0x011d, 0x011c, 0x011c, 0x011b, 0x011b, 0x011a,
    0x0119, 0x0119, 0x0118, 0x0118, 0x0117, 0x0116, 0x0116, 0x0115, 0x0115, 0x0114, 0x0113, 0x0113, 0x0112, 0x0112, 0x0111, 0x0111,
    0x0110, 0x010f, 0x010f, 0x010e, 0x010e, 0x010d, 0x010d, 0x010c, 0x010c, 0x010b, 0x010a, 0x010a, 0x0109, 0x0109, 0x0108, 0x0108,
    0x0107, 0x0107, 0x0106, 0x0106, 0x0105, 0x0105, 0x0104, 0x0104, 0x0103, 0x0103, 0x0102, 0x0102, 0x0101, 0x0101, 0x0100, 0x0100,
    0x00ff, 0x00ff, 0x00fe, 0x00fe, 0x00fd, 0x00fd, 0x00fc, 0x00fc, 0x00fb, 0x00fb, 0x00fa, 0x00fa, 0x00f9, 0x00f9, 0x00f8, 0x00f8,
    0x00f7, 0x00f7, 0x00f6, 0x00f6, 0x00f5, 0x00f5, 0x00f4, 0x00f4, 0x00f4, 0x00f3, 0x00f3, 0x00f2, 0x00f2, 0x00f1, 0x00f1, 0x00f0,
    0x00f0, 0x00f0, 0x00ef, 0x00ef, 0x00ee, 0x00ee, 0x00ed, 0x00ed, 0x00ed, 0x00ec, 0x00ec, 0x00eb, 0x00eb, 0x00ea, 0x00ea, 0x00ea,
    0x00e9, 0x00e9, 0x00e8, 0x00e8, 0x00e7, 0x00e7, 0x00e7, 0x00e6, 0x00e6, 0x00e5, 0x00e5, 0x00e5, 0x00e4, 0x00e4, 0x00e3, 0x00e3,
    0x00e3, 0x00e2, 0x00e2, 0x00e1, 0x00e1, 0x00e1, 0x00e0, 0x00e0, 0x00e0, 0x00df, 0x00df, 0x00de, 0x00de, 0x00de, 0x00dd, 0x00dd,
    0x00dd, 0x00dc, 0x00dc, 0x00db, 0x00db, 0x00db, 0x00da, 0x00da, 0x00da, 0x00d9, 0x00d9, 0x00d9, 0x00d8, 0x00d8, 0x00d7, 0x00d7,
    0x00d7, 0x00d6, 0x00d6, 0x00d6, 0x00d5, 0x00d5, 0x00d5, 0x00d4, 0x00d4, 0x00d4, 0x00d3, 0x00d3, 0x00d3, 0x00d2, 0x00d2, 0x00d2,
    0x00d1, 0x00d1, 0x00d1, 0x00d0, 0x00d0, 0x00d0, 0x00cf, 0x00cf, 0x00cf, 0x00ce, 0x00ce, 0x00ce, 0x00cd, 0x00cd, 0x00cd, 0x00cc,
    0x00cc, 0x00cc, 0x00cb, 0x00cb, 0x00cb, 0x00ca, 0x00ca, 0x00ca, 0x00c9, 0x00c9, 0x00c9, 0x00c9, 0x00c8, 0x00c8, 0x00c8, 0x00c7,
    0x00c7, 0x00c7, 0x00c6, 0x00c6, 0x00c6, 0x00c5, 0x00c5, 0x00c5, 0x00c5, 0x00c4, 0x00c4, 0x00c4, 0x00c3, 0x00c3, 0x00c3, 0x00c3,
    0x00c2, 0x00c2, 0x00c2, 0x00c1, 0x00c1, 0x00c1, 0x00c1, 0x00c0, 0x00c0, 0x00c0, 0x00bf, 0x00bf, 0x00bf, 0x00bf, 0x00be, 0x00be,
    0x00be, 0x00bd, 0x00bd, 0x00bd, 0x00bd, 0x00bc, 0x00bc, 0x00bc, 0x00bc, 0x00bb, 0x00bb, 0x00bb, 0x00ba, 0x00ba, 0x00ba, 0x00ba,
    0x00b9, 0x00b9, 0x00b9, 0x00b9, 0x00b8, 0x00b8, 0x00b8, 0x00b8, 0x00b7, 0x00b7, 0x00b7, 0x00b7, 0x00b6, 0x00b6, 0x00b6, 0x00b6,
    0x00b5, 0x00b5, 0x00b5, 0x00b5, 0x00b4, 0x00b4, 0x00b4, 0x00b4, 0x00b3, 0x00b3, 0x00b3, 0x00b3, 0x00b2, 0x00b2, 0x00b2, 0x00b2,
    0x00b1, 0x00b1, 0x00b1, 0x00b1, 0x00b0, 0x00b0, 0x00b0, 0x00b0, 0x00af, 0x00af, 0x00af, 0x00af, 0x00ae, 0x00ae, 0x00ae, 0x00ae,
    0x00ae, 0x00ad, 0x00ad, 0x00ad, 0x00ad, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ab, 0x00ab, 0x00ab, 0x00ab,
};
#endif


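// Compress one 4x4 block of RGBA pixels (64 bytes, alpha ignored) into a single
// 64-bit DXT1 block: the low 16 bits hold the max endpoint converted to 5:6:5,
// the next 16 bits the min endpoint, and the high 32 bits hold sixteen 2-bit
// indices derived from each pixel's distance from the block minimum. The
// endpoints are the per-channel min/max of the block, inset by 1/16th of the range.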
static tracy_force_inline uint64_t ProcessRGB( const uint8_t* src )
{
#ifdef __SSE4_1__
    // Load the 16 RGBA pixels of the block.
    __m128i px0 = _mm_loadu_si128(((__m128i*)src) + 0);
    __m128i px1 = _mm_loadu_si128(((__m128i*)src) + 1);
    __m128i px2 = _mm_loadu_si128(((__m128i*)src) + 2);
    __m128i px3 = _mm_loadu_si128(((__m128i*)src) + 3);

    // Solid-color check: compare every pixel against the first one, keeping
    // only the bits that survive the 5:6:5 conversion.
    __m128i smask = _mm_set1_epi32( 0xF8FCF8 );
    __m128i sd0 = _mm_and_si128( px0, smask );
    __m128i sd1 = _mm_and_si128( px1, smask );
    __m128i sd2 = _mm_and_si128( px2, smask );
    __m128i sd3 = _mm_and_si128( px3, smask );

    __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));

    __m128i sc0 = _mm_cmpeq_epi8(sd0, sc);
    __m128i sc1 = _mm_cmpeq_epi8(sd1, sc);
    __m128i sc2 = _mm_cmpeq_epi8(sd2, sc);
    __m128i sc3 = _mm_cmpeq_epi8(sd3, sc);

    __m128i sm0 = _mm_and_si128(sc0, sc1);
    __m128i sm1 = _mm_and_si128(sc2, sc3);
    __m128i sm = _mm_and_si128(sm0, sm1);

    if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) )
    {
        return uint64_t( to565( src[0], src[1], src[2] ) ) << 16;
    }

    // Strip alpha and reduce to the per-channel min/max of the block.
    __m128i amask = _mm_set1_epi32( 0xFFFFFF );
    px0 = _mm_and_si128( px0, amask );
    px1 = _mm_and_si128( px1, amask );
    px2 = _mm_and_si128( px2, amask );
    px3 = _mm_and_si128( px3, amask );

    __m128i min0 = _mm_min_epu8( px0, px1 );
    __m128i min1 = _mm_min_epu8( px2, px3 );
    __m128i min2 = _mm_min_epu8( min0, min1 );

    __m128i max0 = _mm_max_epu8( px0, px1 );
    __m128i max1 = _mm_max_epu8( px2, px3 );
    __m128i max2 = _mm_max_epu8( max0, max1 );

    __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i min4 = _mm_min_epu8( min2, min3 );
    __m128i max4 = _mm_max_epu8( max2, max3 );

    __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i rmin = _mm_min_epu8( min4, min5 );
    __m128i rmax = _mm_max_epu8( max4, max5 );

    // Summed channel range of the block and the matching scale factor.
    __m128i range1 = _mm_subs_epu8( rmax, rmin );
    __m128i range2 = _mm_sad_epu8( rmax, rmin );

    uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1;
    __m128i range = _mm_set1_epi16( DivTable[vrange] );

    // Inset the endpoints by 1/16th of the per-channel range.
    __m128i inset1 = _mm_srli_epi16( range1, 4 );
    __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) );
    __m128i min = _mm_adds_epu8( rmin, inset );
    __m128i max = _mm_subs_epu8( rmax, inset );

    // Sum each pixel's distance from the minimum over R, G and B and scale it
    // into a 2-bit index.
    __m128i c0 = _mm_subs_epu8( px0, rmin );
    __m128i c1 = _mm_subs_epu8( px1, rmin );
    __m128i c2 = _mm_subs_epu8( px2, rmin );
    __m128i c3 = _mm_subs_epu8( px3, rmin );

    __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) );
    __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) );
    __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) );
    __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) );

    __m128i s0 = _mm_hadd_epi16( is0, is1 );
    __m128i s1 = _mm_hadd_epi16( is2, is3 );

    __m128i m0 = _mm_mulhi_epu16( s0, range );
    __m128i m1 = _mm_mulhi_epu16( s1, range );

    __m128i p0 = _mm_packus_epi16( m0, m1 );

    // Pack four 2-bit indices into each byte and gather the four bytes.
    __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) );
    __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 );
    __m128i p3 = _mm_or_si128( p1, p2 );
    __m128i p = _mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) );

    uint32_t vmin = _mm_cvtsi128_si32( min );
    uint32_t vmax = _mm_cvtsi128_si32( max );
    uint32_t vp = _mm_cvtsi128_si32( p );

    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
#elif defined __ARM_NEON
#  ifdef __aarch64__
    // AArch64 NEON path, mirroring the SSE4.1 code: deinterleave the channels,
    // reduce to per-channel min/max, then quantize each pixel's distance from
    // the minimum into a 2-bit index.
    uint8x16x4_t px = vld4q_u8( src );

    uint8x16_t lr = px.val[0];
    uint8x16_t lg = px.val[1];
    uint8x16_t lb = px.val[2];

    uint8_t rmaxr = vmaxvq_u8( lr );
    uint8_t rmaxg = vmaxvq_u8( lg );
    uint8_t rmaxb = vmaxvq_u8( lb );

    uint8_t rminr = vminvq_u8( lr );
    uint8_t rming = vminvq_u8( lg );
    uint8_t rminb = vminvq_u8( lb );

    int rr = rmaxr - rminr;
    int rg = rmaxg - rming;
    int rb = rmaxb - rminb;

    int vrange1 = rr + rg + rb;
    uint16_t vrange2 = DivTableNEON[vrange1];

    uint8_t insetr = rr >> 4;
    uint8_t insetg = rg >> 4;
    uint8_t insetb = rb >> 4;

    uint8_t minr = rminr + insetr;
    uint8_t ming = rming + insetg;
    uint8_t minb = rminb + insetb;

    uint8_t maxr = rmaxr - insetr;
    uint8_t maxg = rmaxg - insetg;
    uint8_t maxb = rmaxb - insetb;

    uint8x16_t cr = vsubq_u8( lr, vdupq_n_u8( rminr ) );
    uint8x16_t cg = vsubq_u8( lg, vdupq_n_u8( rming ) );
    uint8x16_t cb = vsubq_u8( lb, vdupq_n_u8( rminb ) );

    uint16x8_t is0l = vaddl_u8( vget_low_u8( cr ), vget_low_u8( cg ) );
    uint16x8_t is0h = vaddl_u8( vget_high_u8( cr ), vget_high_u8( cg ) );
    uint16x8_t is1l = vaddw_u8( is0l, vget_low_u8( cb ) );
    uint16x8_t is1h = vaddw_u8( is0h, vget_high_u8( cb ) );

    int16x8_t range = vdupq_n_s16( vrange2 );
    uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1l ), range ) );
    uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1h ), range ) );

    uint8x8_t p00 = vmovn_u16( m0 );
    uint8x8_t p01 = vmovn_u16( m1 );
    uint8x16_t p0 = vcombine_u8( p00, p01 );

    uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) );
    uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) );
    uint32x4_t p3 = vaddq_u32( p1, p2 );

    uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) );
    uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) );

    uint32_t vp;
    vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 );

    return uint64_t( ( uint64_t( to565( minr, ming, minb ) ) << 16 ) | to565( maxr, maxg, maxb ) | ( uint64_t( vp ) << 32 ) );
#  else
    // 32-bit NEON path, following the SSE4.1 code; the across-vector min/max
    // reductions used on AArch64 are not available here.
    uint32x4_t px0 = vld1q_u32( (uint32_t*)src );
    uint32x4_t px1 = vld1q_u32( (uint32_t*)src + 4 );
    uint32x4_t px2 = vld1q_u32( (uint32_t*)src + 8 );
    uint32x4_t px3 = vld1q_u32( (uint32_t*)src + 12 );

    uint32x4_t smask = vdupq_n_u32( 0xF8FCF8 );
    uint32x4_t sd0 = vandq_u32( smask, px0 );
    uint32x4_t sd1 = vandq_u32( smask, px1 );
    uint32x4_t sd2 = vandq_u32( smask, px2 );
    uint32x4_t sd3 = vandq_u32( smask, px3 );

    uint32x4_t sc = vdupq_n_u32( sd0[0] );

    uint32x4_t sc0 = vceqq_u32( sd0, sc );
    uint32x4_t sc1 = vceqq_u32( sd1, sc );
    uint32x4_t sc2 = vceqq_u32( sd2, sc );
    uint32x4_t sc3 = vceqq_u32( sd3, sc );

    uint32x4_t sm0 = vandq_u32( sc0, sc1 );
    uint32x4_t sm1 = vandq_u32( sc2, sc3 );
    int64x2_t sm = vreinterpretq_s64_u32( vandq_u32( sm0, sm1 ) );

    if( sm[0] == -1 && sm[1] == -1 )
    {
        return uint64_t( to565( src[0], src[1], src[2] ) ) << 16;
    }

    uint32x4_t mask = vdupq_n_u32( 0xFFFFFF );
    uint8x16_t l0 = vreinterpretq_u8_u32( vandq_u32( mask, px0 ) );
    uint8x16_t l1 = vreinterpretq_u8_u32( vandq_u32( mask, px1 ) );
    uint8x16_t l2 = vreinterpretq_u8_u32( vandq_u32( mask, px2 ) );
    uint8x16_t l3 = vreinterpretq_u8_u32( vandq_u32( mask, px3 ) );

    uint8x16_t min0 = vminq_u8( l0, l1 );
    uint8x16_t min1 = vminq_u8( l2, l3 );
    uint8x16_t min2 = vminq_u8( min0, min1 );

    uint8x16_t max0 = vmaxq_u8( l0, l1 );
    uint8x16_t max1 = vmaxq_u8( l2, l3 );
    uint8x16_t max2 = vmaxq_u8( max0, max1 );

    uint8x16_t min3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( min2 ) ) );
    uint8x16_t max3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( max2 ) ) );

    uint8x16_t min4 = vminq_u8( min2, min3 );
    uint8x16_t max4 = vmaxq_u8( max2, max3 );

    uint8x16_t min5 = vcombine_u8( vget_high_u8( min4 ), vget_low_u8( min4 ) );
    uint8x16_t max5 = vcombine_u8( vget_high_u8( max4 ), vget_low_u8( max4 ) );

    uint8x16_t rmin = vminq_u8( min4, min5 );
    uint8x16_t rmax = vmaxq_u8( max4, max5 );

    uint8x16_t range1 = vsubq_u8( rmax, rmin );
    uint8x8_t range2 = vget_low_u8( range1 );
    uint8x8x2_t range3 = vzip_u8( range2, vdup_n_u8( 0 ) );
    uint16x4_t range4 = vreinterpret_u16_u8( range3.val[0] );

    uint16_t vrange1;
    uint16x4_t range5 = vpadd_u16( range4, range4 );
    uint16x4_t range6 = vpadd_u16( range5, range5 );
    vst1_lane_u16( &vrange1, range6, 0 );

    uint32_t vrange2 = ( 2 << 16 ) / uint32_t( vrange1 + 1 );
    uint16x8_t range = vdupq_n_u16( vrange2 );

    uint8x16_t inset = vshrq_n_u8( range1, 4 );
    uint8x16_t min = vaddq_u8( rmin, inset );
    uint8x16_t max = vsubq_u8( rmax, inset );

    uint8x16_t c0 = vsubq_u8( l0, rmin );
    uint8x16_t c1 = vsubq_u8( l1, rmin );
    uint8x16_t c2 = vsubq_u8( l2, rmin );
    uint8x16_t c3 = vsubq_u8( l3, rmin );

    uint16x8_t is0 = vpaddlq_u8( c0 );
    uint16x8_t is1 = vpaddlq_u8( c1 );
    uint16x8_t is2 = vpaddlq_u8( c2 );
    uint16x8_t is3 = vpaddlq_u8( c3 );

    uint16x4_t is4 = vpadd_u16( vget_low_u16( is0 ), vget_high_u16( is0 ) );
    uint16x4_t is5 = vpadd_u16( vget_low_u16( is1 ), vget_high_u16( is1 ) );
    uint16x4_t is6 = vpadd_u16( vget_low_u16( is2 ), vget_high_u16( is2 ) );
    uint16x4_t is7 = vpadd_u16( vget_low_u16( is3 ), vget_high_u16( is3 ) );

    uint16x8_t s0 = vcombine_u16( is4, is5 );
    uint16x8_t s1 = vcombine_u16( is6, is7 );

    uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s0 ), vreinterpretq_s16_u16( range ) ) );
    uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s1 ), vreinterpretq_s16_u16( range ) ) );

    uint8x8_t p00 = vmovn_u16( m0 );
    uint8x8_t p01 = vmovn_u16( m1 );
    uint8x16_t p0 = vcombine_u8( p00, p01 );

    uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) );
    uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) );
    uint32x4_t p3 = vaddq_u32( p1, p2 );

    uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) );
    uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) );

    uint32_t vmin, vmax, vp;
    vst1q_lane_u32( &vmin, vreinterpretq_u32_u8( min ), 0 );
    vst1q_lane_u32( &vmax, vreinterpretq_u32_u8( max ), 0 );
    vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 );

    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
#  endif
#else
    // Portable scalar fallback.
    uint32_t ref;
    memcpy( &ref, src, 4 );
    uint32_t refMask = ref & 0xF8FCF8;
    auto stmp = src + 4;
    for( int i=1; i<16; i++ )
    {
        uint32_t px;
        memcpy( &px, stmp, 4 );
        if( ( px & 0xF8FCF8 ) != refMask ) break;
        stmp += 4;
    }
    // Solid-color block: all pixels equal at 5:6:5 precision.
    if( stmp == src + 64 )
    {
        return uint64_t( to565( ref ) ) << 16;
    }

    // Per-channel min/max of the block.
    uint8_t min[3] = { src[0], src[1], src[2] };
    uint8_t max[3] = { src[0], src[1], src[2] };
    auto tmp = src + 4;
    for( int i=1; i<16; i++ )
    {
        for( int j=0; j<3; j++ )
        {
            if( tmp[j] < min[j] ) min[j] = tmp[j];
            else if( tmp[j] > max[j] ) max[j] = tmp[j];
        }
        tmp += 4;
    }

    // Scale factor for the summed range, then inset the endpoints by 1/16th.
    const uint32_t range = DivTable[max[0] - min[0] + max[1] - min[1] + max[2] - min[2]];
    const uint32_t rmin = min[0] + min[1] + min[2];
    for( int i=0; i<3; i++ )
    {
        const uint8_t inset = ( max[i] - min[i] ) >> 4;
        min[i] += inset;
        max[i] -= inset;
    }

    // Map each pixel's distance from the minimum to a 2-bit index.
    uint32_t data = 0;
    for( int i=0; i<16; i++ )
    {
        const uint32_t c = src[0] + src[1] + src[2] - rmin;
        const uint8_t idx = ( c * range ) >> 16;
        data |= idx << (i*2);
        src += 4;
    }

    return uint64_t( ( uint64_t( to565( min[0], min[1], min[2] ) ) << 16 ) | to565( max[0], max[1], max[2] ) | ( uint64_t( data ) << 32 ) );
#endif
}

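// AVX2 variant that compresses two horizontally adjacent 4x4 blocks per call
// (the input buffer holds 8x4 pixels) and writes the two resulting 8-byte DXT1
// blocks straight to the output pointer, advancing it by 16 bytes.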
#ifdef __AVX2__
static tracy_force_inline void ProcessRGB_AVX( const uint8_t* src, char*& dst )
{
    __m256i px0 = _mm256_loadu_si256(((__m256i*)src) + 0);
    __m256i px1 = _mm256_loadu_si256(((__m256i*)src) + 1);
    __m256i px2 = _mm256_loadu_si256(((__m256i*)src) + 2);
    __m256i px3 = _mm256_loadu_si256(((__m256i*)src) + 3);

    __m256i smask = _mm256_set1_epi32( 0xF8FCF8 );
    __m256i sd0 = _mm256_and_si256( px0, smask );
    __m256i sd1 = _mm256_and_si256( px1, smask );
    __m256i sd2 = _mm256_and_si256( px2, smask );
    __m256i sd3 = _mm256_and_si256( px3, smask );

    __m256i sc = _mm256_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));

    __m256i sc0 = _mm256_cmpeq_epi8( sd0, sc );
    __m256i sc1 = _mm256_cmpeq_epi8( sd1, sc );
    __m256i sc2 = _mm256_cmpeq_epi8( sd2, sc );
    __m256i sc3 = _mm256_cmpeq_epi8( sd3, sc );

    __m256i sm0 = _mm256_and_si256( sc0, sc1 );
    __m256i sm1 = _mm256_and_si256( sc2, sc3 );
    __m256i sm = _mm256_and_si256( sm0, sm1 );

    // Per-lane solid-color detection; each 128-bit lane holds one 4x4 block.
    const int64_t solid0 = 1 - _mm_testc_si128( _mm256_castsi256_si128( sm ), _mm_set1_epi32( -1 ) );
    const int64_t solid1 = 1 - _mm_testc_si128( _mm256_extracti128_si256( sm, 1 ), _mm_set1_epi32( -1 ) );

    if( solid0 + solid1 == 0 )
    {
        const auto c0 = uint64_t( to565( src[0], src[1], src[2] ) ) << 16;
        const auto c1 = uint64_t( to565( src[16], src[17], src[18] ) ) << 16;
        memcpy( dst, &c0, 8 );
        memcpy( dst+8, &c1, 8 );
        dst += 16;
        return;
    }

    __m256i amask = _mm256_set1_epi32( 0xFFFFFF );
    px0 = _mm256_and_si256( px0, amask );
    px1 = _mm256_and_si256( px1, amask );
    px2 = _mm256_and_si256( px2, amask );
    px3 = _mm256_and_si256( px3, amask );

    __m256i min0 = _mm256_min_epu8( px0, px1 );
    __m256i min1 = _mm256_min_epu8( px2, px3 );
    __m256i min2 = _mm256_min_epu8( min0, min1 );

    __m256i max0 = _mm256_max_epu8( px0, px1 );
    __m256i max1 = _mm256_max_epu8( px2, px3 );
    __m256i max2 = _mm256_max_epu8( max0, max1 );

    __m256i min3 = _mm256_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m256i max3 = _mm256_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m256i min4 = _mm256_min_epu8( min2, min3 );
    __m256i max4 = _mm256_max_epu8( max2, max3 );

    __m256i min5 = _mm256_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m256i max5 = _mm256_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m256i rmin = _mm256_min_epu8( min4, min5 );
    __m256i rmax = _mm256_max_epu8( max4, max5 );

    __m256i range1 = _mm256_subs_epu8( rmax, rmin );
    __m256i range2 = _mm256_sad_epu8( rmax, rmin );

    // Separate scale factor for each of the two blocks.
    uint16_t vrange0 = DivTable[_mm256_cvtsi256_si32( range2 ) >> 1];
    uint16_t vrange1 = DivTable[_mm256_extract_epi16( range2, 8 ) >> 1];
    __m256i range00 = _mm256_set1_epi16( vrange0 );
    __m256i range = _mm256_inserti128_si256( range00, _mm_set1_epi16( vrange1 ), 1 );

    __m256i inset1 = _mm256_srli_epi16( range1, 4 );
    __m256i inset = _mm256_and_si256( inset1, _mm256_set1_epi8( 0xF ) );
    __m256i min = _mm256_adds_epu8( rmin, inset );
    __m256i max = _mm256_subs_epu8( rmax, inset );

    __m256i c0 = _mm256_subs_epu8( px0, rmin );
    __m256i c1 = _mm256_subs_epu8( px1, rmin );
    __m256i c2 = _mm256_subs_epu8( px2, rmin );
    __m256i c3 = _mm256_subs_epu8( px3, rmin );

    __m256i is0 = _mm256_maddubs_epi16( c0, _mm256_set1_epi8( 1 ) );
    __m256i is1 = _mm256_maddubs_epi16( c1, _mm256_set1_epi8( 1 ) );
    __m256i is2 = _mm256_maddubs_epi16( c2, _mm256_set1_epi8( 1 ) );
    __m256i is3 = _mm256_maddubs_epi16( c3, _mm256_set1_epi8( 1 ) );

    __m256i s0 = _mm256_hadd_epi16( is0, is1 );
    __m256i s1 = _mm256_hadd_epi16( is2, is3 );

    __m256i m0 = _mm256_mulhi_epu16( s0, range );
    __m256i m1 = _mm256_mulhi_epu16( s1, range );

    __m256i p0 = _mm256_packus_epi16( m0, m1 );

    __m256i p1 = _mm256_or_si256( _mm256_srai_epi32( p0, 6 ), _mm256_srai_epi32( p0, 12 ) );
    __m256i p2 = _mm256_or_si256( _mm256_srai_epi32( p0, 18 ), p0 );
    __m256i p3 = _mm256_or_si256( p1, p2 );
    __m256i p = _mm256_shuffle_epi8( p3, _mm256_set1_epi32( 0x0C080400 ) );

    // Convert the endpoints to 5:6:5 in-register, interleave them with the
    // index words, then blank out any block that was flagged as solid color.
    __m256i mm0 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), min );
    __m256i mm1 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), max );
    __m256i mm2 = _mm256_unpacklo_epi64( mm1, mm0 );
    __m256i mmr = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 11 ), 11 );
    __m256i mmg = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 26 ), 5 );
    __m256i mmb = _mm256_srli_epi64( _mm256_slli_epi64( mm2, 16 ), 59 );
    __m256i mm3 = _mm256_or_si256( mmr, mmg );
    __m256i mm4 = _mm256_or_si256( mm3, mmb );
    __m256i mm5 = _mm256_shuffle_epi8( mm4, _mm256_set1_epi32( 0x09080100 ) );

    __m256i d0 = _mm256_unpacklo_epi32( mm5, p );
    __m256i d1 = _mm256_permute4x64_epi64( d0, _MM_SHUFFLE( 3, 2, 2, 0 ) );
    __m128i d2 = _mm256_castsi256_si128( d1 );

    __m128i mask = _mm_set_epi64x( 0xFFFF0000 | -solid1, 0xFFFF0000 | -solid0 );
    __m128i d3 = _mm_and_si128( d2, mask );
    _mm_storeu_si128( (__m128i*)dst, d3 );
    dst += 16;
}
#endif

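// Compress a w x h RGBA image (w and h must be multiples of 4) into DXT1.
// The destination receives 8 bytes per 4x4 block, i.e. w*h/2 bytes in total.
// Each block's rows are gathered into a small contiguous buffer so the block
// compressors above can read them with simple linear loads.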
void CompressImageDxt1( const char* src, char* dst, int w, int h )
{
    assert( (w % 4) == 0 && (h % 4) == 0 );

#ifdef __AVX2__
    // Wide path: process two blocks (8x4 pixels) at a time.
    if( w%8 == 0 )
    {
        uint32_t buf[8*4];
        int i = 0;

        auto blocks = w * h / 32;
        do
        {
            auto tmp = (char*)buf;
            memcpy( tmp, src, 8*4 );
            memcpy( tmp + 8*4, src + w * 4, 8*4 );
            memcpy( tmp + 16*4, src + w * 8, 8*4 );
            memcpy( tmp + 24*4, src + w * 12, 8*4 );
            src += 8*4;
            if( ++i == w/8 )
            {
                // End of a block row: skip the three pixel rows already consumed.
                src += w * 3 * 4;
                i = 0;
            }

            ProcessRGB_AVX( (uint8_t*)buf, dst );
        }
        while( --blocks );
    }
    else
#endif
    {
        uint32_t buf[4*4];
        int i = 0;

        auto ptr = dst;
        auto blocks = w * h / 16;
        do
        {
            auto tmp = (char*)buf;
            memcpy( tmp, src, 4*4 );
            memcpy( tmp + 4*4, src + w * 4, 4*4 );
            memcpy( tmp + 8*4, src + w * 8, 4*4 );
            memcpy( tmp + 12*4, src + w * 12, 4*4 );
            src += 4*4;
            if( ++i == w/4 )
            {
                src += w * 3 * 4;
                i = 0;
            }

            const auto c = ProcessRGB( (uint8_t*)buf );
            memcpy( ptr, &c, sizeof( uint64_t ) );
            ptr += sizeof( uint64_t );
        }
        while( --blocks );
    }
}

}