/* ReactOS SDK header: emmintrin.h (SSE2 intrinsics). */
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10#pragma once 11#ifndef _INCLUDED_EMM 12#define _INCLUDED_EMM 13 14#include <vcruntime.h> 15#include <xmmintrin.h> 16 17#if defined(_MSC_VER) && !defined(__clang__) 18 19typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128i 20{ 21 __int8 m128i_i8[16]; 22 __int16 m128i_i16[8]; 23 __int32 m128i_i32[4]; 24 __int64 m128i_i64[2]; 25 unsigned __int8 m128i_u8[16]; 26 unsigned __int16 m128i_u16[8]; 27 unsigned __int32 m128i_u32[4]; 28 unsigned __int64 m128i_u64[2]; 29} __m128i; 30#ifdef _STATIC_ASSERT 31_STATIC_ASSERT(sizeof(__m128i) == 16); 32#endif 33 34typedef struct _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128d 35{ 36 double m128d_f64[2]; 37} __m128d; 38 39typedef __declspec(align(1)) __m128i __m128i_u; 40 41#define __ATTRIBUTE_SSE2__ 42 43#else /* _MSC_VER */ 44 45typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16))); 46typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); 47 48typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1))); 49typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1))); 50 51/* Type defines. 
*/ 52typedef double __v2df __attribute__((__vector_size__(16))); 53typedef long long __v2di __attribute__((__vector_size__(16))); 54typedef short __v8hi __attribute__((__vector_size__(16))); 55typedef char __v16qi __attribute__((__vector_size__(16))); 56 57/* Unsigned types */ 58typedef unsigned long long __v2du __attribute__((__vector_size__(16))); 59typedef unsigned short __v8hu __attribute__((__vector_size__(16))); 60typedef unsigned char __v16qu __attribute__((__vector_size__(16))); 61 62/* We need an explicitly signed variant for char. Note that this shouldn't 63 * appear in the interface though. */ 64typedef signed char __v16qs __attribute__((__vector_size__(16))); 65 66#ifdef __clang__ 67#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2"),__min_vector_width__(128))) 68#define __ATTRIBUTE_MMXSSE2__ __attribute__((__target__("mmx,sse2"),__min_vector_width__(128))) 69#else 70#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2"))) 71#define __ATTRIBUTE_MMXSSE2__ __attribute__((__target__("mmx,sse2"))) 72#endif 73#define __INTRIN_INLINE_SSE2 __INTRIN_INLINE __ATTRIBUTE_SSE2__ 74#define __INTRIN_INLINE_MMXSSE2 __INTRIN_INLINE __ATTRIBUTE_MMXSSE2__ 75 76#endif /* _MSC_VER */ 77 78#ifdef __cplusplus 79extern "C" { 80#endif 81 82extern __m128d _mm_add_sd(__m128d a, __m128d b); 83extern __m128d _mm_add_pd(__m128d a, __m128d b); 84extern __m128d _mm_sub_sd(__m128d a, __m128d b); 85extern __m128d _mm_sub_pd(__m128d a, __m128d b); 86extern __m128d _mm_mul_sd(__m128d a, __m128d b); 87extern __m128d _mm_mul_pd(__m128d a, __m128d b); 88extern __m128d _mm_div_sd(__m128d a, __m128d b); 89extern __m128d _mm_div_pd(__m128d a, __m128d b); 90extern __m128d _mm_sqrt_sd(__m128d a, __m128d b); 91extern __m128d _mm_sqrt_pd(__m128d a); 92extern __m128d _mm_min_sd(__m128d a, __m128d b); 93extern __m128d _mm_min_pd(__m128d a, __m128d b); 94extern __m128d _mm_max_sd(__m128d a, __m128d b); 95extern __m128d _mm_max_pd(__m128d a, __m128d b); 96extern __m128d 
_mm_and_pd(__m128d a, __m128d b); 97extern __m128d _mm_andnot_pd(__m128d a, __m128d b); 98extern __m128d _mm_or_pd(__m128d a, __m128d b); 99extern __m128d _mm_xor_pd(__m128d a, __m128d b); 100extern __m128d _mm_cmpeq_pd(__m128d a, __m128d b); 101extern __m128d _mm_cmplt_pd(__m128d a, __m128d b); 102extern __m128d _mm_cmple_pd(__m128d a, __m128d b); 103extern __m128d _mm_cmpgt_pd(__m128d a, __m128d b); 104extern __m128d _mm_cmpge_pd(__m128d a, __m128d b); 105extern __m128d _mm_cmpord_pd(__m128d a, __m128d b); 106extern __m128d _mm_cmpunord_pd(__m128d a, __m128d b); 107extern __m128d _mm_cmpneq_pd(__m128d a, __m128d b); 108extern __m128d _mm_cmpnlt_pd(__m128d a, __m128d b); 109extern __m128d _mm_cmpnle_pd(__m128d a, __m128d b); 110extern __m128d _mm_cmpngt_pd(__m128d a, __m128d b); 111extern __m128d _mm_cmpnge_pd(__m128d a, __m128d b); 112extern __m128d _mm_cmpeq_sd(__m128d a, __m128d b); 113extern __m128d _mm_cmplt_sd(__m128d a, __m128d b); 114extern __m128d _mm_cmple_sd(__m128d a, __m128d b); 115extern __m128d _mm_cmpgt_sd(__m128d a, __m128d b); 116extern __m128d _mm_cmpge_sd(__m128d a, __m128d b); 117extern __m128d _mm_cmpord_sd(__m128d a, __m128d b); 118extern __m128d _mm_cmpunord_sd(__m128d a, __m128d b); 119extern __m128d _mm_cmpneq_sd(__m128d a, __m128d b); 120extern __m128d _mm_cmpnlt_sd(__m128d a, __m128d b); 121extern __m128d _mm_cmpnle_sd(__m128d a, __m128d b); 122extern __m128d _mm_cmpngt_sd(__m128d a, __m128d b); 123extern __m128d _mm_cmpnge_sd(__m128d a, __m128d b); 124extern int _mm_comieq_sd(__m128d a, __m128d b); 125extern int _mm_comilt_sd(__m128d a, __m128d b); 126extern int _mm_comile_sd(__m128d a, __m128d b); 127extern int _mm_comigt_sd(__m128d a, __m128d b); 128extern int _mm_comige_sd(__m128d a, __m128d b); 129extern int _mm_comineq_sd(__m128d a, __m128d b); 130extern int _mm_ucomieq_sd(__m128d a, __m128d b); 131extern int _mm_ucomilt_sd(__m128d a, __m128d b); 132extern int _mm_ucomile_sd(__m128d a, __m128d b); 133extern int 
_mm_ucomigt_sd(__m128d a, __m128d b); 134extern int _mm_ucomige_sd(__m128d a, __m128d b); 135extern int _mm_ucomineq_sd(__m128d a, __m128d b); 136extern __m128 _mm_cvtpd_ps(__m128d a); 137extern __m128d _mm_cvtps_pd(__m128 a); 138extern __m128d _mm_cvtepi32_pd(__m128i a); 139extern __m128i _mm_cvtpd_epi32(__m128d a); 140extern int _mm_cvtsd_si32(__m128d a); 141extern __m128 _mm_cvtsd_ss(__m128 a, __m128d b); 142extern __m128d _mm_cvtsi32_sd(__m128d a, int b); 143extern __m128d _mm_cvtss_sd(__m128d a, __m128 b); 144extern __m128i _mm_cvttpd_epi32(__m128d a); 145extern int _mm_cvttsd_si32(__m128d a); 146extern __m64 _mm_cvtpd_pi32(__m128d a); 147extern __m64 _mm_cvttpd_pi32(__m128d a); 148extern __m128d _mm_cvtpi32_pd(__m64 a); 149extern double _mm_cvtsd_f64(__m128d a); 150extern __m128d _mm_load_pd(double const *dp); 151extern __m128d _mm_load1_pd(double const *dp); 152extern __m128d _mm_loadr_pd(double const *dp); 153extern __m128d _mm_loadu_pd(double const *dp); 154//extern __m128i _mm_loadu_si64(void const *a); 155//extern __m128i _mm_loadu_si32(void const *a); 156//extern __m128i _mm_loadu_si16(void const *a); 157extern __m128d _mm_load_sd(double const *dp); 158extern __m128d _mm_loadh_pd(__m128d a, double const *dp); 159extern __m128d _mm_loadl_pd(__m128d a, double const *dp); 160//extern __m128d _mm_undefined_pd(void); 161extern __m128d _mm_set_sd(double w); 162extern __m128d _mm_set1_pd(double w); 163extern __m128d _mm_set_pd(double w, double x); 164extern __m128d _mm_setr_pd(double w, double x); 165extern __m128d _mm_setzero_pd(void); 166extern __m128d _mm_move_sd(__m128d a, __m128d b); 167extern void _mm_store_sd(double *dp, __m128d a); 168extern void _mm_store_pd(double *dp, __m128d a); 169extern void _mm_store1_pd(double *dp, __m128d a); 170extern void _mm_storeu_pd(double *dp, __m128d a); 171extern void _mm_storer_pd(double *dp, __m128d a); 172extern void _mm_storeh_pd(double *dp, __m128d a); 173extern void _mm_storel_pd(double *dp, __m128d a); 174extern 
__m128i _mm_add_epi8(__m128i a, __m128i b); 175extern __m128i _mm_add_epi16(__m128i a, __m128i b); 176extern __m128i _mm_add_epi32(__m128i a, __m128i b); 177extern __m64 _mm_add_si64(__m64 a, __m64 b); 178extern __m128i _mm_add_epi64(__m128i a, __m128i b); 179extern __m128i _mm_adds_epi8(__m128i a, __m128i b); 180extern __m128i _mm_adds_epi16(__m128i a, __m128i b); 181extern __m128i _mm_adds_epu8(__m128i a, __m128i b); 182extern __m128i _mm_adds_epu16(__m128i a, __m128i b); 183extern __m128i _mm_avg_epu8(__m128i a, __m128i b); 184extern __m128i _mm_avg_epu16(__m128i a, __m128i b); 185extern __m128i _mm_madd_epi16(__m128i a, __m128i b); 186extern __m128i _mm_max_epi16(__m128i a, __m128i b); 187extern __m128i _mm_max_epu8(__m128i a, __m128i b); 188extern __m128i _mm_min_epi16(__m128i a, __m128i b); 189extern __m128i _mm_min_epu8(__m128i a, __m128i b); 190extern __m128i _mm_mulhi_epi16(__m128i a, __m128i b); 191extern __m128i _mm_mulhi_epu16(__m128i a, __m128i b); 192extern __m128i _mm_mullo_epi16(__m128i a, __m128i b); 193extern __m64 _mm_mul_su32(__m64 a, __m64 b); 194extern __m128i _mm_mul_epu32(__m128i a, __m128i b); 195extern __m128i _mm_sad_epu8(__m128i a, __m128i b); 196extern __m128i _mm_sub_epi8(__m128i a, __m128i b); 197extern __m128i _mm_sub_epi16(__m128i a, __m128i b); 198extern __m128i _mm_sub_epi32(__m128i a, __m128i b); 199extern __m64 _mm_sub_si64(__m64 a, __m64 b); 200extern __m128i _mm_sub_epi64(__m128i a, __m128i b); 201extern __m128i _mm_subs_epi8(__m128i a, __m128i b); 202extern __m128i _mm_subs_epi16(__m128i a, __m128i b); 203extern __m128i _mm_subs_epu8(__m128i a, __m128i b); 204extern __m128i _mm_subs_epu16(__m128i a, __m128i b); 205extern __m128i _mm_and_si128(__m128i a, __m128i b); 206extern __m128i _mm_andnot_si128(__m128i a, __m128i b); 207extern __m128i _mm_or_si128(__m128i a, __m128i b); 208extern __m128i _mm_xor_si128(__m128i a, __m128i b); 209extern __m128i _mm_slli_si128(__m128i a, int i); 210extern __m128i _mm_slli_epi16(__m128i a, 
int count); 211extern __m128i _mm_sll_epi16(__m128i a, __m128i count); 212extern __m128i _mm_slli_epi32(__m128i a, int count); 213extern __m128i _mm_sll_epi32(__m128i a, __m128i count); 214extern __m128i _mm_slli_epi64(__m128i a, int count); 215extern __m128i _mm_sll_epi64(__m128i a, __m128i count); 216extern __m128i _mm_srai_epi16(__m128i a, int count); 217extern __m128i _mm_sra_epi16(__m128i a, __m128i count); 218extern __m128i _mm_srai_epi32(__m128i a, int count); 219extern __m128i _mm_sra_epi32(__m128i a, __m128i count); 220extern __m128i _mm_srli_si128(__m128i a, int imm); 221extern __m128i _mm_srli_epi16(__m128i a, int count); 222extern __m128i _mm_srl_epi16(__m128i a, __m128i count); 223extern __m128i _mm_srli_epi32(__m128i a, int count); 224extern __m128i _mm_srl_epi32(__m128i a, __m128i count); 225extern __m128i _mm_srli_epi64(__m128i a, int count); 226extern __m128i _mm_srl_epi64(__m128i a, __m128i count); 227extern __m128i _mm_cmpeq_epi8(__m128i a, __m128i b); 228extern __m128i _mm_cmpeq_epi16(__m128i a, __m128i b); 229extern __m128i _mm_cmpeq_epi32(__m128i a, __m128i b); 230extern __m128i _mm_cmpgt_epi8(__m128i a, __m128i b); 231extern __m128i _mm_cmpgt_epi16(__m128i a, __m128i b); 232extern __m128i _mm_cmpgt_epi32(__m128i a, __m128i b); 233extern __m128i _mm_cmplt_epi8(__m128i a, __m128i b); 234extern __m128i _mm_cmplt_epi16(__m128i a, __m128i b); 235extern __m128i _mm_cmplt_epi32(__m128i a, __m128i b); 236#ifdef _M_AMD64 237extern __m128d _mm_cvtsi64_sd(__m128d a, long long b); 238extern long long _mm_cvtsd_si64(__m128d a); 239extern long long _mm_cvttsd_si64(__m128d a); 240#endif 241extern __m128 _mm_cvtepi32_ps(__m128i a); 242extern __m128i _mm_cvtps_epi32(__m128 a); 243extern __m128i _mm_cvttps_epi32(__m128 a); 244extern __m128i _mm_cvtsi32_si128(int a); 245#ifdef _M_AMD64 246extern __m128i _mm_cvtsi64_si128(long long a); 247#endif 248extern int _mm_cvtsi128_si32(__m128i a); 249#ifdef _M_AMD64 250extern long long _mm_cvtsi128_si64(__m128i a); 
251#endif 252extern __m128i _mm_load_si128(__m128i const *p); 253extern __m128i _mm_loadu_si128(__m128i_u const *p); 254extern __m128i _mm_loadl_epi64(__m128i_u const *p); 255//extern __m128i _mm_undefined_si128(void); 256//extern __m128i _mm_set_epi64x(long long q1, long long q0); // FIXME 257extern __m128i _mm_set_epi64(__m64 q1, __m64 q0); 258//extern __m128i _mm_set_epi32(int i3, int i1, int i0); 259extern __m128i _mm_set_epi32(int i3, int i2, int i1, int i0); 260//extern __m128i _mm_set_epi16(short w7, short w2, short w1, short w0); 261extern __m128i _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0); 262//extern __m128i _mm_set_epi8(char b15, char b10, char b4, char b3, char b2, char b1, char b0); 263extern __m128i _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0); 264//extern __m128i _mm_set1_epi64x(long long q); // FIXME 265extern __m128i _mm_set1_epi64(__m64 q); 266extern __m128i _mm_set1_epi32(int i); 267extern __m128i _mm_set1_epi16(short w); 268extern __m128i _mm_set1_epi8(char b); 269extern __m128i _mm_setl_epi64(__m128i q); // FIXME: clang? 
270extern __m128i _mm_setr_epi64(__m64 q0, __m64 q1); 271//extern __m128i _mm_setr_epi32(int i0, int i2, int i3); 272extern __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3); 273//extern __m128i _mm_setr_epi16(short w0, short w5, short w6, short w7); 274extern __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7); 275//extern __m128i _mm_setr_epi8(char b0, char b6, char b11, char b12, char b13, char b14, char b15); 276extern __m128i _mm_setr_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0); 277extern __m128i _mm_setzero_si128(void); 278extern void _mm_store_si128(__m128i *p, __m128i b); 279extern void _mm_storeu_si128(__m128i_u *p, __m128i b); 280//extern void _mm_storeu_si64(void *p, __m128i b); 281//extern void _mm_storeu_si32(void *p, __m128i b); 282//extern void _mm_storeu_si16(void *p, __m128i b); 283extern void _mm_maskmoveu_si128(__m128i d, __m128i n, _Out_writes_bytes_(16) char *p); 284extern void _mm_storel_epi64(__m128i_u *p, __m128i a); 285extern void _mm_stream_pd(double *p, __m128d a); 286extern void _mm_stream_si128(__m128i *p, __m128i a); 287extern void _mm_stream_si32(int *p, int a); 288extern void _mm_clflush(void const *p); 289extern void _mm_lfence(void); 290extern void _mm_mfence(void); 291extern __m128i _mm_packs_epi16(__m128i a, __m128i b); 292extern __m128i _mm_packs_epi32(__m128i a, __m128i b); 293extern __m128i _mm_packus_epi16(__m128i a, __m128i b); 294extern int _mm_extract_epi16(__m128i a, int imm); 295extern __m128i _mm_insert_epi16(__m128i a, int b, int imm); 296extern int _mm_movemask_epi8(__m128i a); 297extern __m128i _mm_shuffle_epi32(__m128i a, int imm); 298extern __m128i _mm_shufflelo_epi16(__m128i a, int imm); 299extern __m128i _mm_shufflehi_epi16(__m128i a, int imm); 300extern __m128i _mm_unpackhi_epi8(__m128i a, __m128i b); 301extern __m128i _mm_unpackhi_epi16(__m128i a, 
__m128i b); 302extern __m128i _mm_unpackhi_epi32(__m128i a, __m128i b); 303extern __m128i _mm_unpackhi_epi64(__m128i a, __m128i b); 304extern __m128i _mm_unpacklo_epi8(__m128i a, __m128i b); 305extern __m128i _mm_unpacklo_epi16(__m128i a, __m128i b); 306extern __m128i _mm_unpacklo_epi32(__m128i a, __m128i b); 307extern __m128i _mm_unpacklo_epi64(__m128i a, __m128i b); 308extern __m64 _mm_movepi64_pi64(__m128i a); 309extern __m128i _mm_movpi64_epi64(__m64 a); 310extern __m128i _mm_move_epi64(__m128i a); 311extern __m128d _mm_unpackhi_pd(__m128d a, __m128d b); 312extern __m128d _mm_unpacklo_pd(__m128d a, __m128d b); 313extern int _mm_movemask_pd(__m128d a); 314extern __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm); 315extern __m128 _mm_castpd_ps(__m128d a); 316extern __m128i _mm_castpd_si128(__m128d a); 317extern __m128d _mm_castps_pd(__m128 a); 318extern __m128i _mm_castps_si128(__m128 a); 319extern __m128 _mm_castsi128_ps(__m128i a); 320extern __m128d _mm_castsi128_pd(__m128i a); 321void _mm_pause(void); 322 323/* Alternate names */ 324#define _mm_set_pd1(a) _mm_set1_pd(a) 325#define _mm_load_pd1(p) _mm_load1_pd(p) 326#define _mm_store_pd1(p, a) _mm_store1_pd((p), (a)) 327#define _mm_bslli_si128 _mm_slli_si128 328#define _mm_bsrli_si128 _mm_srli_si128 329#define _mm_stream_si64 _mm_stream_si64x 330 331#if defined(_MSC_VER) && !defined(__clang__) 332 333#pragma intrinsic(_mm_add_sd) 334#pragma intrinsic(_mm_add_pd) 335#pragma intrinsic(_mm_sub_sd) 336#pragma intrinsic(_mm_sub_pd) 337#pragma intrinsic(_mm_mul_sd) 338#pragma intrinsic(_mm_mul_pd) 339#pragma intrinsic(_mm_div_sd) 340#pragma intrinsic(_mm_div_pd) 341#pragma intrinsic(_mm_sqrt_sd) 342#pragma intrinsic(_mm_sqrt_pd) 343#pragma intrinsic(_mm_min_sd) 344#pragma intrinsic(_mm_min_pd) 345#pragma intrinsic(_mm_max_sd) 346#pragma intrinsic(_mm_max_pd) 347#pragma intrinsic(_mm_and_pd) 348#pragma intrinsic(_mm_andnot_pd) 349#pragma intrinsic(_mm_or_pd) 350#pragma intrinsic(_mm_xor_pd) 351#pragma 
intrinsic(_mm_cmpeq_pd) 352#pragma intrinsic(_mm_cmplt_pd) 353#pragma intrinsic(_mm_cmple_pd) 354#pragma intrinsic(_mm_cmpgt_pd) 355#pragma intrinsic(_mm_cmpge_pd) 356#pragma intrinsic(_mm_cmpord_pd) 357#pragma intrinsic(_mm_cmpunord_pd) 358#pragma intrinsic(_mm_cmpneq_pd) 359#pragma intrinsic(_mm_cmpnlt_pd) 360#pragma intrinsic(_mm_cmpnle_pd) 361#pragma intrinsic(_mm_cmpngt_pd) 362#pragma intrinsic(_mm_cmpnge_pd) 363#pragma intrinsic(_mm_cmpeq_sd) 364#pragma intrinsic(_mm_cmplt_sd) 365#pragma intrinsic(_mm_cmple_sd) 366#pragma intrinsic(_mm_cmpgt_sd) 367#pragma intrinsic(_mm_cmpge_sd) 368#pragma intrinsic(_mm_cmpord_sd) 369#pragma intrinsic(_mm_cmpunord_sd) 370#pragma intrinsic(_mm_cmpneq_sd) 371#pragma intrinsic(_mm_cmpnlt_sd) 372#pragma intrinsic(_mm_cmpnle_sd) 373#pragma intrinsic(_mm_cmpngt_sd) 374#pragma intrinsic(_mm_cmpnge_sd) 375#pragma intrinsic(_mm_comieq_sd) 376#pragma intrinsic(_mm_comilt_sd) 377#pragma intrinsic(_mm_comile_sd) 378#pragma intrinsic(_mm_comigt_sd) 379#pragma intrinsic(_mm_comige_sd) 380#pragma intrinsic(_mm_comineq_sd) 381#pragma intrinsic(_mm_ucomieq_sd) 382#pragma intrinsic(_mm_ucomilt_sd) 383#pragma intrinsic(_mm_ucomile_sd) 384#pragma intrinsic(_mm_ucomigt_sd) 385#pragma intrinsic(_mm_ucomige_sd) 386#pragma intrinsic(_mm_ucomineq_sd) 387#pragma intrinsic(_mm_cvtpd_ps) 388#pragma intrinsic(_mm_cvtps_pd) 389#pragma intrinsic(_mm_cvtepi32_pd) 390#pragma intrinsic(_mm_cvtpd_epi32) 391#pragma intrinsic(_mm_cvtsd_si32) 392#pragma intrinsic(_mm_cvtsd_ss) 393#pragma intrinsic(_mm_cvtsi32_sd) 394#pragma intrinsic(_mm_cvtss_sd) 395#pragma intrinsic(_mm_cvttpd_epi32) 396#pragma intrinsic(_mm_cvttsd_si32) 397//#pragma intrinsic(_mm_cvtpd_pi32) 398//#pragma intrinsic(_mm_cvttpd_pi32) 399//#pragma intrinsic(_mm_cvtpi32_pd) 400#pragma intrinsic(_mm_cvtsd_f64) 401#pragma intrinsic(_mm_load_pd) 402#pragma intrinsic(_mm_load1_pd) 403#pragma intrinsic(_mm_loadr_pd) 404#pragma intrinsic(_mm_loadu_pd) 405//#pragma intrinsic(_mm_loadu_si64) 406//#pragma 
intrinsic(_mm_loadu_si32) 407//#pragma intrinsic(_mm_loadu_si16) 408#pragma intrinsic(_mm_load_sd) 409#pragma intrinsic(_mm_loadh_pd) 410#pragma intrinsic(_mm_loadl_pd) 411//#pragma intrinsic(_mm_undefined_pd) 412#pragma intrinsic(_mm_set_sd) 413#pragma intrinsic(_mm_set1_pd) 414#pragma intrinsic(_mm_set_pd) 415#pragma intrinsic(_mm_setr_pd) 416#pragma intrinsic(_mm_setzero_pd) 417#pragma intrinsic(_mm_move_sd) 418#pragma intrinsic(_mm_store_sd) 419#pragma intrinsic(_mm_store_pd) 420#pragma intrinsic(_mm_store1_pd) 421#pragma intrinsic(_mm_storeu_pd) 422#pragma intrinsic(_mm_storer_pd) 423#pragma intrinsic(_mm_storeh_pd) 424#pragma intrinsic(_mm_storel_pd) 425#pragma intrinsic(_mm_add_epi8) 426#pragma intrinsic(_mm_add_epi16) 427#pragma intrinsic(_mm_add_epi32) 428//#pragma intrinsic(_mm_add_si64) 429#pragma intrinsic(_mm_add_epi64) 430#pragma intrinsic(_mm_adds_epi8) 431#pragma intrinsic(_mm_adds_epi16) 432#pragma intrinsic(_mm_adds_epu8) 433#pragma intrinsic(_mm_adds_epu16) 434#pragma intrinsic(_mm_avg_epu8) 435#pragma intrinsic(_mm_avg_epu16) 436#pragma intrinsic(_mm_madd_epi16) 437#pragma intrinsic(_mm_max_epi16) 438#pragma intrinsic(_mm_max_epu8) 439#pragma intrinsic(_mm_min_epi16) 440#pragma intrinsic(_mm_min_epu8) 441#pragma intrinsic(_mm_mulhi_epi16) 442#pragma intrinsic(_mm_mulhi_epu16) 443#pragma intrinsic(_mm_mullo_epi16) 444//#pragma intrinsic(_mm_mul_su32) 445#pragma intrinsic(_mm_mul_epu32) 446#pragma intrinsic(_mm_sad_epu8) 447#pragma intrinsic(_mm_sub_epi8) 448#pragma intrinsic(_mm_sub_epi16) 449#pragma intrinsic(_mm_sub_epi32) 450//#pragma intrinsic(_mm_sub_si64) 451#pragma intrinsic(_mm_sub_epi64) 452#pragma intrinsic(_mm_subs_epi8) 453#pragma intrinsic(_mm_subs_epi16) 454#pragma intrinsic(_mm_subs_epu8) 455#pragma intrinsic(_mm_subs_epu16) 456#pragma intrinsic(_mm_and_si128) 457#pragma intrinsic(_mm_andnot_si128) 458#pragma intrinsic(_mm_or_si128) 459#pragma intrinsic(_mm_xor_si128) 460#pragma intrinsic(_mm_slli_si128) 461#pragma 
intrinsic(_mm_slli_epi16) 462#pragma intrinsic(_mm_sll_epi16) 463#pragma intrinsic(_mm_slli_epi32) 464#pragma intrinsic(_mm_sll_epi32) 465#pragma intrinsic(_mm_slli_epi64) 466#pragma intrinsic(_mm_sll_epi64) 467#pragma intrinsic(_mm_srai_epi16) 468#pragma intrinsic(_mm_sra_epi16) 469#pragma intrinsic(_mm_srai_epi32) 470#pragma intrinsic(_mm_sra_epi32) 471#pragma intrinsic(_mm_srli_si128) 472#pragma intrinsic(_mm_srli_epi16) 473#pragma intrinsic(_mm_srl_epi16) 474#pragma intrinsic(_mm_srli_epi32) 475#pragma intrinsic(_mm_srl_epi32) 476#pragma intrinsic(_mm_srli_epi64) 477#pragma intrinsic(_mm_srl_epi64) 478#pragma intrinsic(_mm_cmpeq_epi8) 479#pragma intrinsic(_mm_cmpeq_epi16) 480#pragma intrinsic(_mm_cmpeq_epi32) 481#pragma intrinsic(_mm_cmpgt_epi8) 482#pragma intrinsic(_mm_cmpgt_epi16) 483#pragma intrinsic(_mm_cmpgt_epi32) 484#pragma intrinsic(_mm_cmplt_epi8) 485#pragma intrinsic(_mm_cmplt_epi16) 486#pragma intrinsic(_mm_cmplt_epi32) 487#ifdef _M_AMD64 488#pragma intrinsic(_mm_cvtsi64_sd) 489#pragma intrinsic(_mm_cvtsd_si64) 490#pragma intrinsic(_mm_cvttsd_si64) 491#endif 492#pragma intrinsic(_mm_cvtepi32_ps) 493#pragma intrinsic(_mm_cvtps_epi32) 494#pragma intrinsic(_mm_cvttps_epi32) 495#pragma intrinsic(_mm_cvtsi32_si128) 496#ifdef _M_AMD64 497#pragma intrinsic(_mm_cvtsi64_si128) 498#endif 499#pragma intrinsic(_mm_cvtsi128_si32) 500#ifdef _M_AMD64 501#pragma intrinsic(_mm_cvtsi128_si64) 502#endif 503#pragma intrinsic(_mm_load_si128) 504#pragma intrinsic(_mm_loadu_si128) 505#pragma intrinsic(_mm_loadl_epi64) 506//#pragma intrinsic(_mm_undefined_si128) 507//#pragma intrinsic(_mm_set_epi64x) 508//#pragma intrinsic(_mm_set_epi64) 509#pragma intrinsic(_mm_set_epi32) 510#pragma intrinsic(_mm_set_epi16) 511#pragma intrinsic(_mm_set_epi8) 512//#pragma intrinsic(_mm_set1_epi64x) 513//#pragma intrinsic(_mm_set1_epi64) 514#pragma intrinsic(_mm_set1_epi32) 515#pragma intrinsic(_mm_set1_epi16) 516#pragma intrinsic(_mm_set1_epi8) 517#pragma intrinsic(_mm_setl_epi64) 
518//#pragma intrinsic(_mm_setr_epi64) 519#pragma intrinsic(_mm_setr_epi32) 520#pragma intrinsic(_mm_setr_epi16) 521#pragma intrinsic(_mm_setr_epi8) 522#pragma intrinsic(_mm_setzero_si128) 523#pragma intrinsic(_mm_store_si128) 524#pragma intrinsic(_mm_storeu_si128) 525//#pragma intrinsic(_mm_storeu_si64) 526//#pragma intrinsic(_mm_storeu_si32) 527//#pragma intrinsic(_mm_storeu_si16) 528#pragma intrinsic(_mm_maskmoveu_si128) 529#pragma intrinsic(_mm_storel_epi64) 530#pragma intrinsic(_mm_stream_pd) 531#pragma intrinsic(_mm_stream_si128) 532#pragma intrinsic(_mm_stream_si32) 533#pragma intrinsic(_mm_clflush) 534#pragma intrinsic(_mm_lfence) 535#pragma intrinsic(_mm_mfence) 536#pragma intrinsic(_mm_packs_epi16) 537#pragma intrinsic(_mm_packs_epi32) 538#pragma intrinsic(_mm_packus_epi16) 539#pragma intrinsic(_mm_extract_epi16) 540#pragma intrinsic(_mm_insert_epi16) 541#pragma intrinsic(_mm_movemask_epi8) 542#pragma intrinsic(_mm_shuffle_epi32) 543#pragma intrinsic(_mm_shufflelo_epi16) 544#pragma intrinsic(_mm_shufflehi_epi16) 545#pragma intrinsic(_mm_unpackhi_epi8) 546#pragma intrinsic(_mm_unpackhi_epi16) 547#pragma intrinsic(_mm_unpackhi_epi32) 548#pragma intrinsic(_mm_unpackhi_epi64) 549#pragma intrinsic(_mm_unpacklo_epi8) 550#pragma intrinsic(_mm_unpacklo_epi16) 551#pragma intrinsic(_mm_unpacklo_epi32) 552#pragma intrinsic(_mm_unpacklo_epi64) 553//#pragma intrinsic(_mm_movepi64_pi64) 554//#pragma intrinsic(_mm_movpi64_epi64) 555#pragma intrinsic(_mm_move_epi64) 556#pragma intrinsic(_mm_unpackhi_pd) 557#pragma intrinsic(_mm_unpacklo_pd) 558#pragma intrinsic(_mm_movemask_pd) 559#pragma intrinsic(_mm_shuffle_pd) 560#pragma intrinsic(_mm_castpd_ps) 561#pragma intrinsic(_mm_castpd_si128) 562#pragma intrinsic(_mm_castps_pd) 563#pragma intrinsic(_mm_castps_si128) 564#pragma intrinsic(_mm_castsi128_ps) 565#pragma intrinsic(_mm_castsi128_pd) 566#pragma intrinsic(_mm_pause) 567 568#else /* _MSC_VER */ 569 570/* 571 Clang: 
https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/emmintrin.h 572 Clang older version: https://github.com/llvm/llvm-project/blob/3ef88b31843e040c95f23ff2c3c206f1fa399c05/clang/lib/Headers/emmintrin.h 573 unikraft: https://github.com/unikraft/lib-intel-intrinsics/blob/staging/include/emmintrin.h 574*/ 575 576__INTRIN_INLINE_SSE2 __m128d _mm_add_sd(__m128d a, __m128d b) 577{ 578 a[0] += b[0]; 579 return a; 580} 581 582__INTRIN_INLINE_SSE2 __m128d _mm_add_pd(__m128d a, __m128d b) 583{ 584 return (__m128d)((__v2df)a + (__v2df)b); 585} 586 587__INTRIN_INLINE_SSE2 __m128d _mm_sub_sd(__m128d a, __m128d b) 588{ 589 a[0] -= b[0]; 590 return a; 591} 592 593__INTRIN_INLINE_SSE2 __m128d _mm_sub_pd(__m128d a, __m128d b) 594{ 595 return (__m128d)((__v2df)a - (__v2df)b); 596} 597 598__INTRIN_INLINE_SSE2 __m128d _mm_mul_sd(__m128d a, __m128d b) 599{ 600 a[0] *= b[0]; 601 return a; 602} 603 604__INTRIN_INLINE_SSE2 __m128d _mm_mul_pd(__m128d a, __m128d b) 605{ 606 return (__m128d)((__v2df)a * (__v2df)b); 607} 608 609__INTRIN_INLINE_SSE2 __m128d _mm_div_sd(__m128d a, __m128d b) 610{ 611 a[0] /= b[0]; 612 return a; 613} 614 615__INTRIN_INLINE_SSE2 __m128d _mm_div_pd(__m128d a, __m128d b) 616{ 617 return (__m128d)((__v2df)a / (__v2df)b); 618} 619 620__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_sd(__m128d a, __m128d b) 621{ 622 __m128d __c = __builtin_ia32_sqrtsd((__v2df)b); 623 return __extension__(__m128d){__c[0], a[1]}; 624} 625 626__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_pd(__m128d a) 627{ 628 return __builtin_ia32_sqrtpd((__v2df)a); 629} 630 631__INTRIN_INLINE_SSE2 __m128d _mm_min_sd(__m128d a, __m128d b) 632{ 633 return __builtin_ia32_minsd((__v2df)a, (__v2df)b); 634} 635 636__INTRIN_INLINE_SSE2 __m128d _mm_min_pd(__m128d a, __m128d b) 637{ 638 return __builtin_ia32_minpd((__v2df)a, (__v2df)b); 639} 640 641__INTRIN_INLINE_SSE2 __m128d _mm_max_sd(__m128d a, __m128d b) 642{ 643 return __builtin_ia32_maxsd((__v2df)a, (__v2df)b); 644} 645 646__INTRIN_INLINE_SSE2 __m128d 
_mm_max_pd(__m128d a, __m128d b) 647{ 648 return __builtin_ia32_maxpd((__v2df)a, (__v2df)b); 649} 650 651__INTRIN_INLINE_SSE2 __m128d _mm_and_pd(__m128d a, __m128d b) 652{ 653 return (__m128d)((__v2du)a & (__v2du)b); 654} 655 656__INTRIN_INLINE_SSE2 __m128d _mm_andnot_pd(__m128d a, __m128d b) 657{ 658 return (__m128d)(~(__v2du)a & (__v2du)b); 659} 660 661__INTRIN_INLINE_SSE2 __m128d _mm_or_pd(__m128d a, __m128d b) 662{ 663 return (__m128d)((__v2du)a | (__v2du)b); 664} 665 666__INTRIN_INLINE_SSE2 __m128d _mm_xor_pd(__m128d a, __m128d b) 667{ 668 return (__m128d)((__v2du)a ^ (__v2du)b); 669} 670 671__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_pd(__m128d a, __m128d b) 672{ 673 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)a, (__v2df)b); 674} 675 676__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_pd(__m128d a, __m128d b) 677{ 678 return (__m128d)__builtin_ia32_cmpltpd((__v2df)a, (__v2df)b); 679} 680 681__INTRIN_INLINE_SSE2 __m128d _mm_cmple_pd(__m128d a, __m128d b) 682{ 683 return (__m128d)__builtin_ia32_cmplepd((__v2df)a, (__v2df)b); 684} 685 686__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_pd(__m128d a, __m128d b) 687{ 688 return (__m128d)__builtin_ia32_cmpltpd((__v2df)b, (__v2df)a); 689} 690 691__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_pd(__m128d a, __m128d b) 692{ 693 return (__m128d)__builtin_ia32_cmplepd((__v2df)b, (__v2df)a); 694} 695 696__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_pd(__m128d a, __m128d b) 697{ 698 return (__m128d)__builtin_ia32_cmpordpd((__v2df)a, (__v2df)b); 699} 700 701__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_pd(__m128d a, __m128d b) 702{ 703 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)a, (__v2df)b); 704} 705 706__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_pd(__m128d a, __m128d b) 707{ 708 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)a, (__v2df)b); 709} 710 711__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) 712{ 713 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)a, (__v2df)b); 714} 715 716__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_pd(__m128d a, 
__m128d b) 717{ 718 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)a, (__v2df)b); 719} 720 721__INTRIN_INLINE_SSE2 __m128d _mm_cmpngt_pd(__m128d a, __m128d b) 722{ 723 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)b, (__v2df)a); 724} 725 726__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_pd(__m128d a, __m128d b) 727{ 728 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)b, (__v2df)a); 729} 730 731__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_sd(__m128d a, __m128d b) 732{ 733 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)a, (__v2df)b); 734} 735 736__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_sd(__m128d a, __m128d b) 737{ 738 return (__m128d)__builtin_ia32_cmpltsd((__v2df)a, (__v2df)b); 739} 740 741__INTRIN_INLINE_SSE2 __m128d _mm_cmple_sd(__m128d a, __m128d b) 742{ 743 return (__m128d)__builtin_ia32_cmplesd((__v2df)a, (__v2df)b); 744} 745 746__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_sd(__m128d a, __m128d b) 747{ 748 __m128d __c = __builtin_ia32_cmpltsd((__v2df)b, (__v2df)a); 749 return __extension__(__m128d){__c[0], a[1]}; 750} 751 752__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_sd(__m128d a, __m128d b) 753{ 754 __m128d __c = __builtin_ia32_cmplesd((__v2df)b, (__v2df)a); 755 return __extension__(__m128d){__c[0], a[1]}; 756} 757 758__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_sd(__m128d a, __m128d b) 759{ 760 return (__m128d)__builtin_ia32_cmpordsd((__v2df)a, (__v2df)b); 761} 762 763__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_sd(__m128d a, __m128d b) 764{ 765 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)a, (__v2df)b); 766} 767 768__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_sd(__m128d a, __m128d b) 769{ 770 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)a, (__v2df)b); 771} 772 773__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) 774{ 775 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)a, (__v2df)b); 776} 777 778__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_sd(__m128d a, __m128d b) 779{ 780 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)a, (__v2df)b); 781} 782 783__INTRIN_INLINE_SSE2 __m128d 
_mm_cmpngt_sd(__m128d a, __m128d b)
{
    /* !(a > b) is computed as !(b < a) via swapped-operand CMPNLTSD;
       the upper lane passes through from a. */
    __m128d __c = __builtin_ia32_cmpnltsd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

/* Compare lower doubles for not-greater-than-or-equal (swapped-operand
   CMPNLESD); the upper lane passes through from a. */
__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpnlesd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

/*
 * Ordered scalar double comparisons (COMISD family): compare the low
 * lanes of a and b and return 0 or 1.
 */

__INTRIN_INLINE_SSE2 int _mm_comieq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdeq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comilt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdlt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comile_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdle((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comigt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdgt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comige_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdge((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comineq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdneq((__v2df)a, (__v2df)b);
}

/*
 * Unordered scalar double comparisons (UCOMISD family): as above, but do
 * not raise an invalid-operation exception on quiet NaN inputs.
 */

__INTRIN_INLINE_SSE2 int _mm_ucomieq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdeq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomilt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdlt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomile_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdle((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomigt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdgt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomige_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdge((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomineq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdneq((__v2df)a, (__v2df)b);
}

/* Convert two packed doubles to two floats (CVTPD2PS). */
__INTRIN_INLINE_SSE2 __m128 _mm_cvtpd_ps(__m128d a)
{
    return __builtin_ia32_cvtpd2ps((__v2df)a);
}

/* Convert the two low floats of a to two doubles (CVTPS2PD). */
__INTRIN_INLINE_SSE2 __m128d _mm_cvtps_pd(__m128 a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4sf)a, (__v4sf)a, 0, 1), __v2df);
#else
    return __builtin_ia32_cvtps2pd(a);
#endif
}

/* Convert the two low 32-bit ints of a to two doubles (CVTDQ2PD). */
__INTRIN_INLINE_SSE2 __m128d _mm_cvtepi32_pd(__m128i a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4si)a, (__v4si)a, 0, 1), __v2df);
#else
    return __builtin_ia32_cvtdq2pd((__v4si)a);
#endif
}

/* Convert two doubles to 32-bit ints, rounding per MXCSR (CVTPD2DQ). */
__INTRIN_INLINE_SSE2 __m128i _mm_cvtpd_epi32(__m128d a)
{
    return (__m128i)__builtin_ia32_cvtpd2dq((__v2df)a);
}

/* Convert the low double to a 32-bit int, rounding per MXCSR (CVTSD2SI). */
__INTRIN_INLINE_SSE2 int _mm_cvtsd_si32(__m128d a)
{
    return __builtin_ia32_cvtsd2si((__v2df)a);
}

/* Convert the low double of b to a float in the low lane of a (CVTSD2SS);
   the upper three float lanes of a are preserved. */
__INTRIN_INLINE_SSE2 __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
{
    return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)a, (__v2df)b);
}

/* Place (double)b into the low lane of a; upper lane preserved. */
__INTRIN_INLINE_SSE2 __m128d _mm_cvtsi32_sd(__m128d a, int b)
{
    a[0] = b;
    return a;
}

/* Widen the low float of b into the low lane of a; upper lane preserved. */
__INTRIN_INLINE_SSE2 __m128d _mm_cvtss_sd(__m128d a, __m128 b)
{
    a[0] = b[0];
    return a;
}

/* Truncating (round-toward-zero) conversions: CVTTPD2DQ / CVTTSD2SI. */
__INTRIN_INLINE_SSE2 __m128i _mm_cvttpd_epi32(__m128d a)
{
    return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)a);
}

__INTRIN_INLINE_SSE2 int _mm_cvttsd_si32(__m128d a)
{
    return __builtin_ia32_cvttsd2si((__v2df)a);
}

/* MMX-register variants of the packed double <-> int32 conversions. */
__INTRIN_INLINE_MMXSSE2 __m64 _mm_cvtpd_pi32(__m128d a)
{
    return (__m64)__builtin_ia32_cvtpd2pi((__v2df)a);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_cvttpd_pi32(__m128d a)
{
    return (__m64)__builtin_ia32_cvttpd2pi((__v2df)a);
}

__INTRIN_INLINE_MMXSSE2 __m128d _mm_cvtpi32_pd(__m64 a)
{
    return __builtin_ia32_cvtpi2pd((__v2si)a);
}

/* Extract the low double as a scalar. */
__INTRIN_INLINE_SSE2 double _mm_cvtsd_f64(__m128d a)
{
    return a[0];
}

/* Load two doubles from 16-byte-aligned memory (MOVAPD). */
__INTRIN_INLINE_SSE2 __m128d _mm_load_pd(double const *dp)
{
    return *(const __m128d *)dp;
}

/* Load one double and broadcast it to both lanes.  The packed/may_alias
   wrapper struct makes the possibly-unaligned, type-punned access well
   defined for the compiler. */
__INTRIN_INLINE_SSE2 __m128d _mm_load1_pd(double const *dp)
{
    struct __mm_load1_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_load1_pd_struct *)dp)->__u;
    return __extension__(__m128d){__u, __u};
}

// GCC:
/* Create a selector for use with the SHUFPD instruction. */
#define _MM_SHUFFLE2(fp1,fp0) \
    (((fp1) << 1) | (fp0))

/* Load two doubles from aligned memory in reversed lane order. */
__INTRIN_INLINE_SSE2 __m128d _mm_loadr_pd(double const *dp)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    __m128d u = *(const __m128d *)dp;
    return __builtin_shufflevector((__v2df)u, (__v2df)u, 1, 0);
#else
    return (__m128d){ dp[1], dp[0] };
#endif
}

/* Unaligned load of two doubles (MOVUPD). */
__INTRIN_INLINE_SSE2 __m128d _mm_loadu_pd(double const *dp)
{
    struct __loadu_pd {
        __m128d_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_pd *)dp)->__v;
}

/* Unaligned scalar integer loads; all remaining lanes are zeroed. */
__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si64(void const *a)
{
    struct __loadu_si64 {
        long long __v;
    } __attribute__((__packed__, __may_alias__));
    long long __u = ((const struct __loadu_si64 *)a)->__v;
    return __extension__(__m128i)(__v2di){__u, 0LL};
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si32(void const *a)
{
    struct __loadu_si32 {
        int __v;
    } __attribute__((__packed__, __may_alias__));
    int __u = ((const struct __loadu_si32 *)a)->__v;
    return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si16(void const *a)
{
    struct __loadu_si16 {
        short __v;
    } __attribute__((__packed__, __may_alias__));
    short __u = ((const struct __loadu_si16 *)a)->__v;
    return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
}

/* Load one double into the low lane; upper lane zeroed (MOVSD). */
__INTRIN_INLINE_SSE2 __m128d _mm_load_sd(double const *dp)
{
    struct __mm_load_sd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_load_sd_struct *)dp)->__u;
    return __extension__(__m128d){__u, 0};
}

/* Load one double into the high lane; low lane kept from a (MOVHPD). */
__INTRIN_INLINE_SSE2 __m128d _mm_loadh_pd(__m128d a, double const *dp)
{
    struct __mm_loadh_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_loadh_pd_struct *)dp)->__u;
    return __extension__(__m128d){a[0], __u};
}

/* Load one double into the low lane; high lane kept from a (MOVLPD). */
__INTRIN_INLINE_SSE2 __m128d _mm_loadl_pd(__m128d a, double const *dp)
{
    struct __mm_loadl_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_loadl_pd_struct *)dp)->__u;
    return __extension__(__m128d){__u, a[1]};
}

/* Return a vector with indeterminate contents. */
__INTRIN_INLINE_SSE2 __m128d _mm_undefined_pd(void)
{
#if HAS_BUILTIN(__builtin_ia32_undef128)
    return (__m128d)__builtin_ia32_undef128();
#else
    /* Deliberate self-initialization: yields an "undefined" value without
       emitting code.  Some compilers warn here (-Winit-self). */
    __m128d undef = undef;
    return undef;
#endif
}

/* Initializers.  Note the argument order: _mm_set_pd(w, x) puts w in the
   HIGH lane, while _mm_setr_pd(w, x) puts w in the LOW lane. */

__INTRIN_INLINE_SSE2 __m128d _mm_set_sd(double w)
{
    return __extension__(__m128d){w, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_set1_pd(double w)
{
    return __extension__(__m128d){w, w};
}

__INTRIN_INLINE_SSE2 __m128d _mm_set_pd(double w, double x)
{
    return __extension__(__m128d){x, w};
}

__INTRIN_INLINE_SSE2 __m128d _mm_setr_pd(double w, double x)
{
    return __extension__(__m128d){w, x};
}

__INTRIN_INLINE_SSE2 __m128d _mm_setzero_pd(void)
{
    return __extension__(__m128d){0, 0};
}

/* Move the low double of b into the low lane of a (MOVSD). */
__INTRIN_INLINE_SSE2 __m128d _mm_move_sd(__m128d a, __m128d b)
{
    a[0] = b[0];
    return a;
}

/* Store the low double to memory (scalar MOVSD store). */
__INTRIN_INLINE_SSE2 void _mm_store_sd(double *dp, __m128d a)
{
    struct __mm_store_sd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_store_sd_struct *)dp)->__u = a[0];
}

/* Store two doubles to 16-byte-aligned memory (MOVAPD). */
__INTRIN_INLINE_SSE2 void _mm_store_pd(double *dp, __m128d a)
{
    *(__m128d *)dp = a;
}

/* Store the low double to both memory slots (aligned). */
__INTRIN_INLINE_SSE2 void _mm_store1_pd(double *dp, __m128d a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    a = __builtin_shufflevector((__v2df)a, (__v2df)a, 0, 0);
    _mm_store_pd(dp, a);
#else
    dp[0] = a[0];
    dp[1] = a[0];
#endif
}

/* Unaligned store of two doubles (MOVUPD). */
__INTRIN_INLINE_SSE2 void _mm_storeu_pd(double *dp, __m128d a)
{
    struct __storeu_pd {
        __m128d_u __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_pd *)dp)->__v = a;
}

/* Store the two doubles in reversed lane order (aligned). */
__INTRIN_INLINE_SSE2 void _mm_storer_pd(double *dp, __m128d a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    a = __builtin_shufflevector((__v2df)a, (__v2df)a, 1, 0);
    *(__m128d *)dp = a;
#else
    dp[0] = a[1];
    dp[1] = a[0];
#endif
}

/* Store only the high double (MOVHPD). */
__INTRIN_INLINE_SSE2 void _mm_storeh_pd(double *dp, __m128d a)
{
    struct __mm_storeh_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pd_struct *)dp)->__u = a[1];
}

/* Store only the low double (MOVLPD).  The local struct tag reuses the
   "storeh" name; harmless, since the tag's scope is this function only. */
__INTRIN_INLINE_SSE2 void _mm_storel_pd(double *dp, __m128d a)
{
    struct __mm_storeh_pd_struct {
        double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pd_struct *)dp)->__u = a[0];
}

/*
 * Packed integer addition (PADDB/W/D/Q).  Arithmetic is done on the
 * unsigned lane types (__v16qu etc.) so wrap-around never invokes
 * signed-overflow undefined behavior.
 */

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qu)a + (__v16qu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a + (__v8hu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4su)a + (__v4su)b);
}

/* 64-bit add in an MMX register (PADDQ). */
__INTRIN_INLINE_MMXSSE2 __m64 _mm_add_si64(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_paddq((__v1di)a, (__v1di)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a + (__v2du)b);
}

/* Saturating adds: PADDSB/PADDSW (signed), PADDUSB/PADDUSW (unsigned). */

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v16qs)a, (__v16qs)b);
#else
    return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v8hu)a, (__v8hu)b);
#else
    return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
#endif
}

/* Rounded unsigned averages (PAVGB / PAVGW). */

__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu8(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
}

/* Multiply-add adjacent signed 16-bit pairs into 32-bit sums (PMADDWD). */
__INTRIN_INLINE_SSE2 __m128i _mm_madd_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
}

/* Element-wise min/max: PMAXSW/PMINSW (signed 16-bit),
   PMAXUB/PMINUB (unsigned 8-bit). */

__INTRIN_INLINE_SSE2 __m128i _mm_max_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_max)
    return (__m128i)__builtin_elementwise_max((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_max_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_max)
    return (__m128i)__builtin_elementwise_max((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_min_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_min)
    return (__m128i)__builtin_elementwise_min((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_min_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_min)
    return (__m128i)__builtin_elementwise_min((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
#endif
}

/* High halves of 16-bit products: PMULHW (signed), PMULHUW (unsigned). */

__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
}

/* Low halves of 16-bit products (PMULLW); unsigned lanes avoid overflow UB. */
__INTRIN_INLINE_SSE2 __m128i _mm_mullo_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a * (__v8hu)b);
}

/* Unsigned 32x32 -> 64-bit multiply of the low elements (PMULUDQ). */
__INTRIN_INLINE_MMXSSE2 __m64 _mm_mul_su32(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
}

/* Unsigned multiply of the even 32-bit lanes into 64-bit results (PMULUDQ). */
__INTRIN_INLINE_SSE2 __m128i _mm_mul_epu32(__m128i a, __m128i b)
{
    return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
}

/* Sums of absolute byte differences (PSADBW). */
__INTRIN_INLINE_SSE2 __m128i _mm_sad_epu8(__m128i a, __m128i b)
{
    return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
}

/* Packed integer subtraction (PSUBB/W/D/Q); unsigned lanes, as for add. */

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qu)a - (__v16qu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a - (__v8hu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4su)a - (__v4su)b);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_sub_si64(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_psubq((__v1di)a, (__v1di)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi64(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a - (__v2du)b);
}

/* Saturating subtracts: PSUBSB/PSUBSW (signed), PSUBUSB/PSUBUSW (unsigned). */

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v16qs)a, (__v16qs)b);
#else
    return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v8hu)a, (__v8hu)b);
#else
    return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
#endif
}

/* Bitwise logic over the full 128 bits (PAND/PANDN/POR/PXOR). */

__INTRIN_INLINE_SSE2 __m128i _mm_and_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a & (__v2du)b);
}

/* Note the PANDN convention: result is (~a) & b. */
__INTRIN_INLINE_SSE2 __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
    return (__m128i)(~(__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_or_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a | (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_xor_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a ^ (__v2du)b);
}

/* Whole-register left shift by imm BYTES (PSLLDQ). */
#ifdef __clang__
#define _mm_slli_si128(a, imm) \
    ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
#else
__INTRIN_INLINE_SSE2 __m128i _mm_slli_si128(__m128i a, const int imm)
{
    /* The GCC builtin counts bits, the intrinsic counts bytes: scale by 8. */
    return (__m128i)__builtin_ia32_pslldqi128(a, imm * 8);
}
#endif

/* Per-lane left shifts: immediate-count (PSLLW/D/Q "i" forms) and
   vector-count variants. */

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi64(__m128i a, int count)
{
    return __builtin_ia32_psllqi128((__v2di)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi64(__m128i a, __m128i count)
{
    return __builtin_ia32_psllq128((__v2di)a, (__v2di)count);
}

/* Arithmetic (sign-propagating) right shifts (PSRAW / PSRAD). */

__INTRIN_INLINE_SSE2 __m128i _mm_srai_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sra_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srai_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sra_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
}

/* Whole-register right shift by imm BYTES (PSRLDQ). */
#ifdef __clang__
#define _mm_srli_si128(a, imm) \
    ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
#else
__INTRIN_INLINE_SSE2 __m128i _mm_srli_si128(__m128i a, const int imm)
{
    /* The GCC builtin counts bits, the intrinsic counts bytes: scale by 8. */
    return (__m128i)__builtin_ia32_psrldqi128(a, imm * 8);
}
#endif

/* Logical (zero-filling) right shifts (PSRLW / PSRLD / PSRLQ). */

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi64(__m128i a, int count)
{
    return __builtin_ia32_psrlqi128((__v2di)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi64(__m128i a, __m128i count)
{
    return __builtin_ia32_psrlq128((__v2di)a, (__v2di)count);
}

/* Lane-wise compares; each lane becomes all-ones on true, zero on false. */

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qi)a == (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hi)a == (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4si)a == (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
{
    /* This function always performs a signed comparison, but __v16qi is a char
       which may be signed or unsigned, so use __v16qs. */
    return (__m128i)((__v16qs)a > (__v16qs)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hi)a > (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4si)a > (__v4si)b);
}

/* "Less-than" is greater-than with the operands swapped. */

__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi8(b, a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi16(b, a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi32(b, a);
}

#ifdef _M_AMD64

/* 64-bit-only scalar conversions (CVTSD2SI/CVTTSD2SI with 64-bit result). */

__INTRIN_INLINE_SSE2 __m128d _mm_cvtsi64_sd(__m128d a, long long b)
{
    a[0] = b;
    return a;
}

__INTRIN_INLINE_SSE2 long long _mm_cvtsd_si64(__m128d a)
{
    return __builtin_ia32_cvtsd2si64((__v2df)a);
}

__INTRIN_INLINE_SSE2 long long _mm_cvttsd_si64(__m128d a)
{
    return __builtin_ia32_cvttsd2si64((__v2df)a);
}
#endif

/* Packed int32 <-> float conversions (CVTDQ2PS / CVTPS2DQ / CVTTPS2DQ). */

__INTRIN_INLINE_SSE2 __m128 _mm_cvtepi32_ps(__m128i a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128)__builtin_convertvector((__v4si)a, __v4sf);
#else
    return __builtin_ia32_cvtdq2ps((__v4si)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtps_epi32(__m128 a)
{
    return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvttps_epi32(__m128 a)
{
    return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)a);
}

/* Scalar <-> vector moves; unused lanes are zeroed on the way in. */

__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi32_si128(int a)
{
    return __extension__(__m128i)(__v4si){a, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi64_si128(long long a)
{
    return __extension__(__m128i)(__v2di){a, 0};
}

__INTRIN_INLINE_SSE2 int _mm_cvtsi128_si32(__m128i a)
{
    __v4si b = (__v4si)a;
    return b[0];
}

__INTRIN_INLINE_SSE2 long long _mm_cvtsi128_si64(__m128i a)
{
    return a[0];
}

/* Aligned / unaligned 128-bit integer loads (MOVDQA / MOVDQU). */

__INTRIN_INLINE_SSE2 __m128i _mm_load_si128(__m128i const *p)
{
    return *p;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si128(__m128i_u const *p)
{
    struct __loadu_si128 {
        __m128i_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_si128 *)p)->__v;
}

/* Load 64 bits into the low half; upper half zeroed (MOVQ). */
__INTRIN_INLINE_SSE2 __m128i _mm_loadl_epi64(__m128i_u const *p)
{
    struct __mm_loadl_epi64_struct {
        long long __u;
    } __attribute__((__packed__, __may_alias__));
    return __extension__(__m128i){
        ((const struct __mm_loadl_epi64_struct *)p)->__u, 0};
}

/* Return a vector with indeterminate contents. */
__INTRIN_INLINE_SSE2 __m128i _mm_undefined_si128(void)
{
#if HAS_BUILTIN(__builtin_ia32_undef128)
    return (__m128i)__builtin_ia32_undef128();
#else
    /* Deliberate self-initialization: yields an "undefined" value without
       emitting code.  Some compilers warn here (-Winit-self). */
    __m128i undef = undef;
    return undef;
#endif
}

/* Initializers.  _mm_set_* take arguments from HIGHEST lane to LOWEST;
   the _mm_setr_* variants take them in memory (low-to-high) order. */

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64x(long long q1, long long q0)
{
    return __extension__(__m128i)(__v2di){q0, q1};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64(__m64 q1, __m64 q0)
{
    return _mm_set_epi64x((long long)q1, (long long)q0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
{
    return __extension__(__m128i)(__v4si){i0, i1, i2, i3};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi16(
    short w7, short w6, short w5, short w4,
    short w3, short w2, short w1, short w0)
{
    return __extension__(__m128i)(__v8hi){w0, w1, w2, w3, w4, w5, w6, w7};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi8(
    char b15, char b14, char b13, char b12,
    char b11, char b10, char b9, char b8,
    char b7, char b6, char b5, char b4,
    char b3, char b2, char b1, char b0)
{
    return __extension__(__m128i)(__v16qi){
        b0, b1, b2, b3, b4, b5, b6, b7,
        b8, b9, b10, b11, b12, b13, b14, b15};
}

/* Broadcast one scalar to every lane. */

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64x(long long q)
{
    return _mm_set_epi64x(q, q);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64(__m64 q)
{
    return _mm_set_epi64(q, q);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi32(int i)
{
    return _mm_set_epi32(i, i, i, i);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi16(short w)
{
    return _mm_set_epi16(w, w, w, w, w, w, w, w);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi8(char b)
{
    return _mm_set_epi8(b, b, b, b, b, b, b, b, b, b, b,
                        b, b, b, b, b);
}

/* Reversed-argument (low-to-high) initializers. */

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi64(__m64 q0, __m64 q1)
{
    return _mm_set_epi64(q1, q0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3)
{
    return _mm_set_epi32(i3, i2, i1, i0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi16(
    short w0, short w1, short w2, short w3,
    short w4, short w5, short w6, short w7)
{
    return _mm_set_epi16(w7, w6, w5, w4, w3, w2, w1, w0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi8(
    char b0, char b1, char b2, char b3,
    char b4, char b5, char b6, char b7,
    char b8, char b9, char b10, char b11,
    char b12, char b13, char b14, char b15)
{
    return _mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8,
                        b7, b6, b5, b4, b3, b2, b1, b0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setzero_si128(void)
{
    return __extension__(__m128i)(__v2di){0LL, 0LL};
}

/* Aligned / unaligned 128-bit integer stores (MOVDQA / MOVDQU). */

__INTRIN_INLINE_SSE2 void _mm_store_si128(__m128i *p, __m128i b)
{
    *p = b;
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si128(__m128i_u *p, __m128i b)
{
    struct __storeu_si128 {
        __m128i_u __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si128 *)p)->__v = b;
}

/* Unaligned scalar stores of the lowest 64/32/16 bits. */

__INTRIN_INLINE_SSE2 void _mm_storeu_si64(void *p, __m128i b)
{
    struct __storeu_si64 {
        long long __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si64 *)p)->__v = ((__v2di)b)[0];
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si32(void *p, __m128i b)
{
    struct __storeu_si32 {
        int __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si32 *)p)->__v = ((__v4si)b)[0];
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si16(void *p, __m128i b)
{
    struct __storeu_si16 {
        short __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si16 *)p)->__v = ((__v8hi)b)[0];
}

/* Byte-masked store: bytes of d whose mask byte in n has its high bit set
   are written to p (MASKMOVDQU). */
__INTRIN_INLINE_SSE2 void _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
{
    __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
}

/* Store the low 64 bits (MOVQ to memory). */
__INTRIN_INLINE_SSE2 void _mm_storel_epi64(__m128i_u *p, __m128i a)
{
    struct __mm_storel_epi64_struct {
        long long __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storel_epi64_struct *)p)->__u = a[0];
}

/* Non-temporal (cache-bypassing) stores: MOVNTPD / MOVNTDQ / MOVNTI. */

__INTRIN_INLINE_SSE2 void _mm_stream_pd(double *p, __m128d a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2df)a, (__v2df *)p);
#else
    __builtin_ia32_movntpd(p, a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si128(__m128i *p, __m128i a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2di)a, (__v2di*)p);
#else
    __builtin_ia32_movntdq(p, a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si32(int *p, int a)
{
    __builtin_ia32_movnti(p, a);
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE2 void _mm_stream_si64(long long *p, long long a)
{
    __builtin_ia32_movnti64(p, a);
}
#endif

/* Cache-line flush and fences: declarations only in this header. */

void _mm_clflush(void const *p);

void _mm_lfence(void);

void _mm_mfence(void);

/* Narrowing packs with saturation: PACKSSWB / PACKSSDW (signed),
   PACKUSWB (unsigned). */

__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi32(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packus_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
}

/* Extract / insert a 16-bit lane selected by imm (PEXTRW / PINSRW). */

#define _mm_extract_epi16(a, imm) \
    ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
                                                      (int)(imm)))

#define _mm_insert_epi16(a, b, imm) \
    ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
                                          (int)(imm)))

/* Collect the sign bit of each byte into a 16-bit mask (PMOVMSKB). */
__INTRIN_INLINE_SSE2 int _mm_movemask_epi8(__m128i a)
{
    return __builtin_ia32_pmovmskb128((__v16qi)a);
}

/* Immediate-selector shuffles (PSHUFD / PSHUFLW / PSHUFHW). */

#define _mm_shuffle_epi32(a, imm) \
    ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))

#define _mm_shufflelo_epi16(a, imm) \
    ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))

#define _mm_shufflehi_epi16(a, imm) \
    ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))

/* Interleave the HIGH halves of a and b (PUNPCKHBW/WD/DQ/QDQ). */

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector(
        (__v16qi)a, (__v16qi)b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
        16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
#else
    return (__m128i)__builtin_ia32_punpckhbw128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8 + 4, 5,
                                            8 + 5, 6, 8 + 6, 7, 8 + 7);
#else
    return (__m128i)__builtin_ia32_punpckhwd128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4 + 2, 3,
                                            4 + 3);
#else
    return (__m128i)__builtin_ia32_punpckhdq128((__v4si)a, (__v4si)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 1, 2 + 1);
#else
    return (__m128i)__builtin_ia32_punpckhqdq128((__v2di)a, (__v2di)b);
#endif
}

/* Interleave the LOW halves of a and b (PUNPCKLBW/WD/DQ/QDQ). */

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector(
        (__v16qi)a, (__v16qi)b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
        16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
#else
    return (__m128i)__builtin_ia32_punpcklbw128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8 + 0, 1,
                                            8 + 1, 2, 8 + 2, 3, 8 + 3);
#else
    return (__m128i)__builtin_ia32_punpcklwd128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4 + 0, 1,
                                            4 + 1);
#else
    return (__m128i)__builtin_ia32_punpckldq128((__v4si)a, (__v4si)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 0, 2 + 0);
#else
    return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)a, (__v2di)b);
#endif
}

/* Move the low 64 bits to an MMX register. */
__INTRIN_INLINE_SSE2 __m64 _mm_movepi64_pi64(__m128i a)
{
    return (__m64)a[0];
}

/* Move an MMX register into the low 64 bits; upper half zeroed. */
__INTRIN_INLINE_SSE2 __m128i _mm_movpi64_epi64(__m64 a)
{
    return __extension__(__m128i)(__v2di){(long long)a, 0};
}

/* Keep the low 64 bits, zero the upper 64 (MOVQ). */
__INTRIN_INLINE_SSE2 __m128i _mm_move_epi64(__m128i a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2di)a, _mm_setzero_si128(), 0, 2);
#else
    return (__m128i)__builtin_ia32_movq128((__v2di)a);
#endif
}

/* Interleave high / low doubles of a and b (UNPCKHPD / UNPCKLPD). */

__INTRIN_INLINE_SSE2 __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2df)a, (__v2df)b, 1, 2 + 1);
#else
    return (__m128d)__builtin_ia32_unpckhpd((__v2df)a, (__v2df)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2df)a, (__v2df)b, 0, 2 + 0);
#else
    return (__m128d)__builtin_ia32_unpcklpd((__v2df)a, (__v2df)b);
#endif
}

/* Collect the sign bit of each double into a 2-bit mask (MOVMSKPD). */
__INTRIN_INLINE_SSE2 int _mm_movemask_pd(__m128d a)
{
    return __builtin_ia32_movmskpd((__v2df)a);
}

/* Shuffle doubles from a and b by a _MM_SHUFFLE2 selector (SHUFPD). */
#define _mm_shuffle_pd(a, b, i) \
    ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
                                    (int)(i)))

/* Bit-pattern reinterpretations between vector types: no code generated,
   only the static type changes. */

__INTRIN_INLINE_SSE2 __m128 _mm_castpd_ps(__m128d a)
{
    return (__m128)a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_castpd_si128(__m128d a)
{
    return (__m128i)a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_castps_pd(__m128 a)
{
    return (__m128d)a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_castps_si128(__m128 a)
{
    return (__m128i)a;
}

__INTRIN_INLINE_SSE2 __m128 _mm_castsi128_ps(__m128i a)
{
    return (__m128)a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_castsi128_pd(__m128i a)
{
    return (__m128d)a;
}

/* Spin-wait hint (PAUSE): declaration only in this header. */
void _mm_pause(void);

#endif /* _MSC_VER */

#ifdef __cplusplus
} // extern "C"
#endif
1965 1966#endif /* _INCLUDED_EMM */