Reactos
at listview 358 lines 10 kB view raw
1/* 2 * PROJECT: ReactOS SDK 3 * LICENSE: MIT (https://spdx.org/licenses/MIT) 4 * PURPOSE: Intrinsics for the SSE2 instruction set 5 * COPYRIGHT: Copyright 2024 Timo Kreuzer (timo.kreuzer@reactos.org) 6 */ 7 8#pragma once 9 10#define _INCLUDED_IMM 11 12//#include <wmmintrin.h> 13#include <emmintrin.h> 14 15#if defined(_MSC_VER) && !defined(__clang__) 16 17typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(32) __m256i 18{ 19 __int8 m256i_i8[32]; 20 __int16 m256i_i16[16]; 21 __int32 m256i_i32[8]; 22 __int64 m256i_i64[4]; 23 unsigned __int8 m256i_u8[32]; 24 unsigned __int16 m256i_u16[16]; 25 unsigned __int32 m256i_u32[8]; 26 unsigned __int64 m256i_u64[4]; 27} __m256i; 28 29#else /* _MSC_VER */ 30 31typedef char __v32qi __attribute__ ((__vector_size__ (32))); 32typedef short __v16hi __attribute__ ((__vector_size__ (32))); 33typedef long long __v4di __attribute__ ((__vector_size__ (32))); 34 35typedef long long __m256i __attribute__((__vector_size__(32), __may_alias__)); 36 37#endif /* _MSC_VER */ 38 39#ifdef __cplusplus 40extern "C" { 41#endif 42 43extern __m256i __cdecl _mm256_cmpeq_epi8(__m256i, __m256i); 44extern __m256i __cdecl _mm256_cmpeq_epi16(__m256i, __m256i); 45extern int __cdecl _mm256_movemask_epi8(__m256i); 46extern __m256i __cdecl _mm256_setzero_si256(void); 47extern void __cdecl _mm256_zeroupper(void); 48 49extern int __cdecl _rdrand16_step(unsigned short *random_val); 50extern int __cdecl _rdrand32_step(unsigned int *random_val); 51#if defined(_M_X64) 52extern int __cdecl _rdrand64_step(unsigned __int64 *random_val); 53#endif 54 55extern int __cdecl _rdseed16_step(unsigned short *random_val); 56extern int __cdecl _rdseed32_step(unsigned int *random_val); 57#if defined(_M_X64) 58extern int __cdecl _rdseed64_step(unsigned __int64 *random_val); 59#endif 60 61void __cdecl _fxsave(void *); 62void __cdecl _fxrstor(void const *); 63void __cdecl _xsave(void *, unsigned __int64); 64void __cdecl _xsavec(void *, unsigned __int64); 65void __cdecl _xsaveopt(void *, unsigned __int64); 66void __cdecl _xsaves(void *, unsigned __int64); 67void __cdecl _xrstor(void const *, unsigned __int64); 68void __cdecl _xrstors(void const *, unsigned __int64); 69#if defined (_M_X64) 70void __cdecl _fxsave64(void *); 71void __cdecl _fxrstor64(void const *); 72void __cdecl _xsave64(void *, unsigned __int64); 73void __cdecl _xsavec64(void *, unsigned __int64); 74void __cdecl _xsaveopt64(void *, unsigned __int64); 75void __cdecl _xsaves64(void *, unsigned __int64); 76void __cdecl _xrstor64(void const *, unsigned __int64); 77void __cdecl _xrstors64(void const *, unsigned __int64); 78#endif 79 80unsigned __int64 __cdecl _xgetbv(unsigned int); 81void __cdecl _xsetbv(unsigned int, unsigned __int64); 82 83 84#if defined(_MSC_VER) && !defined(__clang__) 85 86#pragma intrinsic(_mm256_cmpeq_epi8) 87#pragma intrinsic(_mm256_cmpeq_epi16) 88#pragma intrinsic(_mm256_movemask_epi8) 89#pragma intrinsic(_mm256_setzero_si256) 90#pragma intrinsic(_mm256_zeroupper) 91 92#pragma intrinsic(_rdrand16_step) 93#pragma intrinsic(_rdrand32_step) 94#if defined(_M_X64) 95#pragma intrinsic(_rdrand64_step) 96#endif 97#pragma intrinsic(_rdseed16_step) 98#pragma intrinsic(_rdseed32_step) 99#if defined(_M_X64) 100#pragma intrinsic(_rdseed64_step) 101#endif 102 103#pragma intrinsic(_fxsave) 104#pragma intrinsic(_fxrstor) 105#pragma intrinsic(_xsave) 106#pragma intrinsic(_xsaveopt) 107#pragma intrinsic(_xsavec) 108#pragma intrinsic(_xsaves) 109#pragma intrinsic(_xrstor) 110#pragma intrinsic(_xrstors) 111#if defined (_M_X64) 112#pragma intrinsic(_fxsave64) 113#pragma intrinsic(_fxrstor64) 114#pragma intrinsic(_xsave64) 115#pragma intrinsic(_xsaveopt64) 116#pragma intrinsic(_xsavec64) 117#pragma intrinsic(_xsaves64) 118#pragma intrinsic(_xrstor64) 119#pragma intrinsic(_xrstors64) 120#endif 121 122#pragma intrinsic(_xgetbv) 123#pragma intrinsic(_xsetbv) 124 125#else /* _MSC_VER */ 126 127#ifdef __clang__ 128#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2"),__min_vector_width__(128))) 129#define __ATTRIBUTE_AVX__ __attribute__((__target__("avx"),__min_vector_width__(256))) 130#define __ATTRIBUTE_AVX2__ __attribute__((__target__("avx2"),__min_vector_width__(256))) 131#else 132#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2"))) 133#define __ATTRIBUTE_AVX__ __attribute__((__target__("avx"))) 134#define __ATTRIBUTE_AVX2__ __attribute__((__target__("avx2"))) 135#endif 136#define __INTRIN_INLINE_SSE2 __INTRIN_INLINE __ATTRIBUTE_SSE2__ 137#define __INTRIN_INLINE_AVX __INTRIN_INLINE __ATTRIBUTE_AVX__ 138#define __INTRIN_INLINE_AVX2 __INTRIN_INLINE __ATTRIBUTE_AVX2__ 139 140__INTRIN_INLINE_AVX __m256i __cdecl _mm256_cmpeq_epi8(__m256i __A, __m256i __B) 141{ 142 return (__m256i)((__v32qi)__A == (__v32qi)__B); 143} 144 145__INTRIN_INLINE_AVX __m256i __cdecl _mm256_cmpeq_epi16(__m256i __A, __m256i __B) 146{ 147 return (__m256i)((__v16hi)__A == (__v16hi)__B); 148} 149 150__INTRIN_INLINE_AVX2 int __cdecl _mm256_movemask_epi8(__m256i __A) 151{ 152 return __builtin_ia32_pmovmskb256((__v32qi)__A); 153} 154 155__INTRIN_INLINE_AVX __m256i __cdecl _mm256_setzero_si256(void) 156{ 157 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 }; 158} 159 160__INTRIN_INLINE void __cdecl _mm256_zeroupper(void) 161{ 162 __asm__ __volatile__("vzeroupper"); 163} 164 165__INTRIN_INLINE int _rdrand16_step(unsigned short* random_val) 166{ 167 unsigned char ok; 168 __asm__ __volatile__("rdrand %0; setc %1" : "=r"(*random_val), "=qm"(ok)); 169 return (int)ok; 170} 171 172__INTRIN_INLINE int _rdrand32_step(unsigned int* random_val) 173{ 174 unsigned char ok; 175 __asm__ __volatile__("rdrand %0; setc %1" : "=r"(*random_val), "=qm"(ok)); 176 return (int)ok; 177} 178 179#if defined(__x86_64__) 180__INTRIN_INLINE int _rdrand64_step(unsigned __int64* random_val) 181{ 182 unsigned char ok; 183 __asm__ __volatile__("rdrand %0; setc %1" : "=r"(*random_val), "=qm"(ok)); 184 return (int)ok; 185} 186#endif // __x86_64__ 187 188__INTRIN_INLINE int _rdseed16_step(unsigned short* random_val) 189{ 190 unsigned char ok; 191 __asm__ __volatile__("rdseed %0; setc %1" : "=r"(*random_val), "=qm"(ok)); 192 return (int)ok; 193} 194 195__INTRIN_INLINE int _rdseed32_step(unsigned int* random_val) 196{ 197 unsigned char ok; 198 __asm__ __volatile__("rdseed %0; setc %1" : "=r"(*random_val), "=qm"(ok)); 199 return (int)ok; 200} 201 202#if defined(__x86_64__) 203__INTRIN_INLINE int _rdseed64_step(unsigned __int64* random_val) 204{ 205 unsigned char ok; 206 __asm__ __volatile__("rdseed %0; setc %1" : "=r"(*random_val), "=qm"(ok)); 207 return (int)ok; 208} 209#endif // __x86_64__ 210 211__INTRIN_INLINE void _fxsave(void *__P) 212{ 213#if 0 // Needs newer GCC 214 __builtin_ia32_fxsave(__P); 215#else 216 __asm__ __volatile__("fxsave (%0)" : : "r"(__P)); 217#endif 218} 219 220__INTRIN_INLINE void _fxrstor(void const *__P) 221{ 222#if 0 // Needs newer GCC 223 __builtin_ia32_fxrstor((void*)__P); 224#else 225 __asm__ __volatile__("fxrstor (%0)" : : "r"(__P)); 226#endif 227} 228 229#if defined(__x86_64__) 230__INTRIN_INLINE void _fxsave64(void *__P) 231{ 232 __builtin_ia32_fxsave64(__P); 233} 234 235__INTRIN_INLINE void _fxrstor64(void const *__P) 236{ 237 __builtin_ia32_fxrstor64((void*)__P); 238} 239#endif // __x86_64__ 240 241#ifdef __clang__ 242#define __ATTRIBUTE_XSAVE__ __attribute__((__target__("xsave"))) 243#else 244#define __ATTRIBUTE_XSAVE__ 245#endif 246#define __INTRIN_INLINE_XSAVE __INTRIN_INLINE __ATTRIBUTE_XSAVE__ 247 248__INTRIN_INLINE_XSAVE void _xsave(void *__P, unsigned __int64 __M) 249{ 250 __builtin_ia32_xsave(__P, __M); 251} 252 253__INTRIN_INLINE_XSAVE void _xsavec(void *__P, unsigned __int64 __M) 254{ 255#if 0 // Needs newer GCC 256 __builtin_ia32_xsavec(__P, __M); 257#else 258 __asm__ __volatile__("xsavec %0" : "=m" (*(char*)__P) : "a" ((unsigned int)__M), "d" ((unsigned int)(__M >> 32)) :"memory"); 259#endif 260} 261 262__INTRIN_INLINE_XSAVE void _xsaveopt(void *__P, unsigned __int64 __M) 263{ 264#if 0 // Needs newer GCC 265 __builtin_ia32_xsaveopt(__P, __M); 266#else 267 __asm__ __volatile__("xsaveopt %0" : "=m" (*(char*)__P) : "a" ((unsigned int)__M), "d" ((unsigned int)(__M >> 32)) :"memory"); 268#endif 269} 270 271__INTRIN_INLINE_XSAVE void _xsaves(void *__P, unsigned __int64 __M) 272{ 273#if 0 // Needs newer GCC 274 __builtin_ia32_xsaves(__P, __M); 275#else 276 __asm__ __volatile__("xsaves %0" : "=m" (*(char*)__P) : "a" ((unsigned int)__M), "d" ((unsigned int)(__M >> 32)) :"memory"); 277#endif 278} 279 280__INTRIN_INLINE_XSAVE void _xrstor(void const *__P, unsigned __int64 __M) 281{ 282 __builtin_ia32_xrstor((void*)__P, __M); 283} 284 285__INTRIN_INLINE_XSAVE void _xrstors(void const *__P, unsigned __int64 __M) 286{ 287#if 0 // Needs newer GCC 288 __builtin_ia32_xrstors((void*)__P, __M); 289#else 290 __asm__ __volatile__("xrstors %0" : "=m" (*(char*)__P) : "a" ((unsigned int)__M), "d" ((unsigned int)(__M >> 32)) :"memory"); 291#endif 292} 293 294#if defined(__x86_64__) 295__INTRIN_INLINE_XSAVE void _xsave64(void *__P, unsigned __int64 __M) 296{ 297 __builtin_ia32_xsave64(__P, __M); 298} 299 300__INTRIN_INLINE_XSAVE void _xsavec64(void *__P, unsigned __int64 __M) 301{ 302#if 0 // Needs newer GCC 303 __builtin_ia32_xsavec64(__P, __M); 304#else 305 __asm__ __volatile__("xsavec %0" : "=m" (*(char*)__P) : "a" ((unsigned int)__M), "d" ((unsigned int)(__M >> 32)) : "memory"); 306#endif 307} 308 309__INTRIN_INLINE_XSAVE void _xsaveopt64(void *__P, unsigned __int64 __M) 310{ 311#if 0 // Needs newer GCC 312 __builtin_ia32_xsaveopt64(__P, __M); 313#else 314 __asm__ __volatile__("xsaveopt %0" : "=m" (*(char*)__P) : "a" ((unsigned int)__M), "d" ((unsigned int)(__M >> 32)) : "memory"); 315#endif 316} 317 318__INTRIN_INLINE_XSAVE void _xsaves64(void *__P, unsigned __int64 __M) 319{ 320#if 0 // Needs newer GCC 321 __builtin_ia32_xsaves64(__P, __M); 322#else 323 __asm__ __volatile__("xsaves %0" : "=m" (*(char*)__P) : "a" ((unsigned int)__M), "d" ((unsigned int)(__M >> 32)) : "memory"); 324#endif 325} 326 327__INTRIN_INLINE_XSAVE void _xrstor64(void const *__P, unsigned __int64 __M) 328{ 329 __builtin_ia32_xrstor64((void*)__P, __M); 330} 331 332__INTRIN_INLINE_XSAVE void _xrstors64(void const *__P, unsigned __int64 __M) 333{ 334#if 0 // Needs newer GCC 335 __builtin_ia32_xrstors64((void*)__P, __M); 336#else 337 __asm__ __volatile__("xrstors %0" : "=m" (*(char*)__P) : "a" ((unsigned int)__M), "d" ((unsigned int)(__M >> 32)) : "memory"); 338#endif 339} 340#endif // __x86_64__ 341 342#ifndef __clang__ 343__INTRIN_INLINE unsigned __int64 _xgetbv(unsigned int __A) 344{ 345 return __builtin_ia32_xgetbv(__A); 346} 347 348__INTRIN_INLINE void _xsetbv(unsigned int __A, unsigned __int64 __V) 349{ 350 __builtin_ia32_xsetbv(__A, __V); 351} 352#endif // !__clang__ 353 354#endif /* _MSC_VER */ 355 356#ifdef __cplusplus 357} // extern "C" 358#endif