Reactos
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#pragma once
11#ifndef _INCLUDED_EMM
12#define _INCLUDED_EMM
13
14#include <vcruntime.h>
15#include <xmmintrin.h>
16
17#if defined(_MSC_VER) && !defined(__clang__)
18
19typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128i
20{
21 __int8 m128i_i8[16];
22 __int16 m128i_i16[8];
23 __int32 m128i_i32[4];
24 __int64 m128i_i64[2];
25 unsigned __int8 m128i_u8[16];
26 unsigned __int16 m128i_u16[8];
27 unsigned __int32 m128i_u32[4];
28 unsigned __int64 m128i_u64[2];
29} __m128i;
30#ifdef _STATIC_ASSERT
31_STATIC_ASSERT(sizeof(__m128i) == 16);
32#endif
33
34typedef struct _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128d
35{
36 double m128d_f64[2];
37} __m128d;
38
39typedef __declspec(align(1)) __m128i __m128i_u;
40
41#define __ATTRIBUTE_SSE2__
42
43#else /* _MSC_VER */
44
45typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
46typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
47
48typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
49typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));
50
51/* Type defines. */
52typedef double __v2df __attribute__((__vector_size__(16)));
53typedef long long __v2di __attribute__((__vector_size__(16)));
54typedef short __v8hi __attribute__((__vector_size__(16)));
55typedef char __v16qi __attribute__((__vector_size__(16)));
56
57/* Unsigned types */
58typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
59typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
60typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
61
62/* We need an explicitly signed variant for char. Note that this shouldn't
63 * appear in the interface though. */
64typedef signed char __v16qs __attribute__((__vector_size__(16)));
65
66#ifdef __clang__
67#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2"),__min_vector_width__(128)))
68#define __ATTRIBUTE_MMXSSE2__ __attribute__((__target__("mmx,sse2"),__min_vector_width__(128)))
69#else
70#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2")))
71#define __ATTRIBUTE_MMXSSE2__ __attribute__((__target__("mmx,sse2")))
72#endif
73#define __INTRIN_INLINE_SSE2 __INTRIN_INLINE __ATTRIBUTE_SSE2__
74#define __INTRIN_INLINE_MMXSSE2 __INTRIN_INLINE __ATTRIBUTE_MMXSSE2__
75
76#endif /* _MSC_VER */
77
78#ifdef __cplusplus
79extern "C" {
80#endif
81
82extern __m128d _mm_add_sd(__m128d a, __m128d b);
83extern __m128d _mm_add_pd(__m128d a, __m128d b);
84extern __m128d _mm_sub_sd(__m128d a, __m128d b);
85extern __m128d _mm_sub_pd(__m128d a, __m128d b);
86extern __m128d _mm_mul_sd(__m128d a, __m128d b);
87extern __m128d _mm_mul_pd(__m128d a, __m128d b);
88extern __m128d _mm_div_sd(__m128d a, __m128d b);
89extern __m128d _mm_div_pd(__m128d a, __m128d b);
90extern __m128d _mm_sqrt_sd(__m128d a, __m128d b);
91extern __m128d _mm_sqrt_pd(__m128d a);
92extern __m128d _mm_min_sd(__m128d a, __m128d b);
93extern __m128d _mm_min_pd(__m128d a, __m128d b);
94extern __m128d _mm_max_sd(__m128d a, __m128d b);
95extern __m128d _mm_max_pd(__m128d a, __m128d b);
96extern __m128d _mm_and_pd(__m128d a, __m128d b);
97extern __m128d _mm_andnot_pd(__m128d a, __m128d b);
98extern __m128d _mm_or_pd(__m128d a, __m128d b);
99extern __m128d _mm_xor_pd(__m128d a, __m128d b);
100extern __m128d _mm_cmpeq_pd(__m128d a, __m128d b);
101extern __m128d _mm_cmplt_pd(__m128d a, __m128d b);
102extern __m128d _mm_cmple_pd(__m128d a, __m128d b);
103extern __m128d _mm_cmpgt_pd(__m128d a, __m128d b);
104extern __m128d _mm_cmpge_pd(__m128d a, __m128d b);
105extern __m128d _mm_cmpord_pd(__m128d a, __m128d b);
106extern __m128d _mm_cmpunord_pd(__m128d a, __m128d b);
107extern __m128d _mm_cmpneq_pd(__m128d a, __m128d b);
108extern __m128d _mm_cmpnlt_pd(__m128d a, __m128d b);
109extern __m128d _mm_cmpnle_pd(__m128d a, __m128d b);
110extern __m128d _mm_cmpngt_pd(__m128d a, __m128d b);
111extern __m128d _mm_cmpnge_pd(__m128d a, __m128d b);
112extern __m128d _mm_cmpeq_sd(__m128d a, __m128d b);
113extern __m128d _mm_cmplt_sd(__m128d a, __m128d b);
114extern __m128d _mm_cmple_sd(__m128d a, __m128d b);
115extern __m128d _mm_cmpgt_sd(__m128d a, __m128d b);
116extern __m128d _mm_cmpge_sd(__m128d a, __m128d b);
117extern __m128d _mm_cmpord_sd(__m128d a, __m128d b);
118extern __m128d _mm_cmpunord_sd(__m128d a, __m128d b);
119extern __m128d _mm_cmpneq_sd(__m128d a, __m128d b);
120extern __m128d _mm_cmpnlt_sd(__m128d a, __m128d b);
121extern __m128d _mm_cmpnle_sd(__m128d a, __m128d b);
122extern __m128d _mm_cmpngt_sd(__m128d a, __m128d b);
123extern __m128d _mm_cmpnge_sd(__m128d a, __m128d b);
124extern int _mm_comieq_sd(__m128d a, __m128d b);
125extern int _mm_comilt_sd(__m128d a, __m128d b);
126extern int _mm_comile_sd(__m128d a, __m128d b);
127extern int _mm_comigt_sd(__m128d a, __m128d b);
128extern int _mm_comige_sd(__m128d a, __m128d b);
129extern int _mm_comineq_sd(__m128d a, __m128d b);
130extern int _mm_ucomieq_sd(__m128d a, __m128d b);
131extern int _mm_ucomilt_sd(__m128d a, __m128d b);
132extern int _mm_ucomile_sd(__m128d a, __m128d b);
133extern int _mm_ucomigt_sd(__m128d a, __m128d b);
134extern int _mm_ucomige_sd(__m128d a, __m128d b);
135extern int _mm_ucomineq_sd(__m128d a, __m128d b);
136extern __m128 _mm_cvtpd_ps(__m128d a);
137extern __m128d _mm_cvtps_pd(__m128 a);
138extern __m128d _mm_cvtepi32_pd(__m128i a);
139extern __m128i _mm_cvtpd_epi32(__m128d a);
140extern int _mm_cvtsd_si32(__m128d a);
141extern __m128 _mm_cvtsd_ss(__m128 a, __m128d b);
142extern __m128d _mm_cvtsi32_sd(__m128d a, int b);
143extern __m128d _mm_cvtss_sd(__m128d a, __m128 b);
144extern __m128i _mm_cvttpd_epi32(__m128d a);
145extern int _mm_cvttsd_si32(__m128d a);
146extern __m64 _mm_cvtpd_pi32(__m128d a);
147extern __m64 _mm_cvttpd_pi32(__m128d a);
148extern __m128d _mm_cvtpi32_pd(__m64 a);
149extern double _mm_cvtsd_f64(__m128d a);
150extern __m128d _mm_load_pd(double const *dp);
151extern __m128d _mm_load1_pd(double const *dp);
152extern __m128d _mm_loadr_pd(double const *dp);
153extern __m128d _mm_loadu_pd(double const *dp);
154//extern __m128i _mm_loadu_si64(void const *a);
155//extern __m128i _mm_loadu_si32(void const *a);
156//extern __m128i _mm_loadu_si16(void const *a);
157extern __m128d _mm_load_sd(double const *dp);
158extern __m128d _mm_loadh_pd(__m128d a, double const *dp);
159extern __m128d _mm_loadl_pd(__m128d a, double const *dp);
160//extern __m128d _mm_undefined_pd(void);
161extern __m128d _mm_set_sd(double w);
162extern __m128d _mm_set1_pd(double w);
163extern __m128d _mm_set_pd(double w, double x);
164extern __m128d _mm_setr_pd(double w, double x);
165extern __m128d _mm_setzero_pd(void);
166extern __m128d _mm_move_sd(__m128d a, __m128d b);
167extern void _mm_store_sd(double *dp, __m128d a);
168extern void _mm_store_pd(double *dp, __m128d a);
169extern void _mm_store1_pd(double *dp, __m128d a);
170extern void _mm_storeu_pd(double *dp, __m128d a);
171extern void _mm_storer_pd(double *dp, __m128d a);
172extern void _mm_storeh_pd(double *dp, __m128d a);
173extern void _mm_storel_pd(double *dp, __m128d a);
174extern __m128i _mm_add_epi8(__m128i a, __m128i b);
175extern __m128i _mm_add_epi16(__m128i a, __m128i b);
176extern __m128i _mm_add_epi32(__m128i a, __m128i b);
177extern __m64 _mm_add_si64(__m64 a, __m64 b);
178extern __m128i _mm_add_epi64(__m128i a, __m128i b);
179extern __m128i _mm_adds_epi8(__m128i a, __m128i b);
180extern __m128i _mm_adds_epi16(__m128i a, __m128i b);
181extern __m128i _mm_adds_epu8(__m128i a, __m128i b);
182extern __m128i _mm_adds_epu16(__m128i a, __m128i b);
183extern __m128i _mm_avg_epu8(__m128i a, __m128i b);
184extern __m128i _mm_avg_epu16(__m128i a, __m128i b);
185extern __m128i _mm_madd_epi16(__m128i a, __m128i b);
186extern __m128i _mm_max_epi16(__m128i a, __m128i b);
187extern __m128i _mm_max_epu8(__m128i a, __m128i b);
188extern __m128i _mm_min_epi16(__m128i a, __m128i b);
189extern __m128i _mm_min_epu8(__m128i a, __m128i b);
190extern __m128i _mm_mulhi_epi16(__m128i a, __m128i b);
191extern __m128i _mm_mulhi_epu16(__m128i a, __m128i b);
192extern __m128i _mm_mullo_epi16(__m128i a, __m128i b);
193extern __m64 _mm_mul_su32(__m64 a, __m64 b);
194extern __m128i _mm_mul_epu32(__m128i a, __m128i b);
195extern __m128i _mm_sad_epu8(__m128i a, __m128i b);
196extern __m128i _mm_sub_epi8(__m128i a, __m128i b);
197extern __m128i _mm_sub_epi16(__m128i a, __m128i b);
198extern __m128i _mm_sub_epi32(__m128i a, __m128i b);
199extern __m64 _mm_sub_si64(__m64 a, __m64 b);
200extern __m128i _mm_sub_epi64(__m128i a, __m128i b);
201extern __m128i _mm_subs_epi8(__m128i a, __m128i b);
202extern __m128i _mm_subs_epi16(__m128i a, __m128i b);
203extern __m128i _mm_subs_epu8(__m128i a, __m128i b);
204extern __m128i _mm_subs_epu16(__m128i a, __m128i b);
205extern __m128i _mm_and_si128(__m128i a, __m128i b);
206extern __m128i _mm_andnot_si128(__m128i a, __m128i b);
207extern __m128i _mm_or_si128(__m128i a, __m128i b);
208extern __m128i _mm_xor_si128(__m128i a, __m128i b);
209extern __m128i _mm_slli_si128(__m128i a, int i);
210extern __m128i _mm_slli_epi16(__m128i a, int count);
211extern __m128i _mm_sll_epi16(__m128i a, __m128i count);
212extern __m128i _mm_slli_epi32(__m128i a, int count);
213extern __m128i _mm_sll_epi32(__m128i a, __m128i count);
214extern __m128i _mm_slli_epi64(__m128i a, int count);
215extern __m128i _mm_sll_epi64(__m128i a, __m128i count);
216extern __m128i _mm_srai_epi16(__m128i a, int count);
217extern __m128i _mm_sra_epi16(__m128i a, __m128i count);
218extern __m128i _mm_srai_epi32(__m128i a, int count);
219extern __m128i _mm_sra_epi32(__m128i a, __m128i count);
220extern __m128i _mm_srli_si128(__m128i a, int imm);
221extern __m128i _mm_srli_epi16(__m128i a, int count);
222extern __m128i _mm_srl_epi16(__m128i a, __m128i count);
223extern __m128i _mm_srli_epi32(__m128i a, int count);
224extern __m128i _mm_srl_epi32(__m128i a, __m128i count);
225extern __m128i _mm_srli_epi64(__m128i a, int count);
226extern __m128i _mm_srl_epi64(__m128i a, __m128i count);
227extern __m128i _mm_cmpeq_epi8(__m128i a, __m128i b);
228extern __m128i _mm_cmpeq_epi16(__m128i a, __m128i b);
229extern __m128i _mm_cmpeq_epi32(__m128i a, __m128i b);
230extern __m128i _mm_cmpgt_epi8(__m128i a, __m128i b);
231extern __m128i _mm_cmpgt_epi16(__m128i a, __m128i b);
232extern __m128i _mm_cmpgt_epi32(__m128i a, __m128i b);
233extern __m128i _mm_cmplt_epi8(__m128i a, __m128i b);
234extern __m128i _mm_cmplt_epi16(__m128i a, __m128i b);
235extern __m128i _mm_cmplt_epi32(__m128i a, __m128i b);
236#ifdef _M_AMD64
237extern __m128d _mm_cvtsi64_sd(__m128d a, long long b);
238extern long long _mm_cvtsd_si64(__m128d a);
239extern long long _mm_cvttsd_si64(__m128d a);
240#endif
241extern __m128 _mm_cvtepi32_ps(__m128i a);
242extern __m128i _mm_cvtps_epi32(__m128 a);
243extern __m128i _mm_cvttps_epi32(__m128 a);
244extern __m128i _mm_cvtsi32_si128(int a);
245#ifdef _M_AMD64
246extern __m128i _mm_cvtsi64_si128(long long a);
247#endif
248extern int _mm_cvtsi128_si32(__m128i a);
249#ifdef _M_AMD64
250extern long long _mm_cvtsi128_si64(__m128i a);
251#endif
252extern __m128i _mm_load_si128(__m128i const *p);
253extern __m128i _mm_loadu_si128(__m128i_u const *p);
254extern __m128i _mm_loadl_epi64(__m128i_u const *p);
255//extern __m128i _mm_undefined_si128(void);
256//extern __m128i _mm_set_epi64x(long long q1, long long q0); // FIXME
257extern __m128i _mm_set_epi64(__m64 q1, __m64 q0);
258//extern __m128i _mm_set_epi32(int i3, int i1, int i0);
259extern __m128i _mm_set_epi32(int i3, int i2, int i1, int i0);
260//extern __m128i _mm_set_epi16(short w7, short w2, short w1, short w0);
261extern __m128i _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0);
262//extern __m128i _mm_set_epi8(char b15, char b10, char b4, char b3, char b2, char b1, char b0);
263extern __m128i _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0);
264//extern __m128i _mm_set1_epi64x(long long q); // FIXME
265extern __m128i _mm_set1_epi64(__m64 q);
266extern __m128i _mm_set1_epi32(int i);
267extern __m128i _mm_set1_epi16(short w);
268extern __m128i _mm_set1_epi8(char b);
269extern __m128i _mm_setl_epi64(__m128i q); // FIXME: clang?
270extern __m128i _mm_setr_epi64(__m64 q0, __m64 q1);
271//extern __m128i _mm_setr_epi32(int i0, int i2, int i3);
272extern __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3);
273//extern __m128i _mm_setr_epi16(short w0, short w5, short w6, short w7);
274extern __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7);
275//extern __m128i _mm_setr_epi8(char b0, char b6, char b11, char b12, char b13, char b14, char b15);
276extern __m128i _mm_setr_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0);
277extern __m128i _mm_setzero_si128(void);
278extern void _mm_store_si128(__m128i *p, __m128i b);
279extern void _mm_storeu_si128(__m128i_u *p, __m128i b);
280//extern void _mm_storeu_si64(void *p, __m128i b);
281//extern void _mm_storeu_si32(void *p, __m128i b);
282//extern void _mm_storeu_si16(void *p, __m128i b);
283extern void _mm_maskmoveu_si128(__m128i d, __m128i n, _Out_writes_bytes_(16) char *p);
284extern void _mm_storel_epi64(__m128i_u *p, __m128i a);
285extern void _mm_stream_pd(double *p, __m128d a);
286extern void _mm_stream_si128(__m128i *p, __m128i a);
287extern void _mm_stream_si32(int *p, int a);
288extern void _mm_clflush(void const *p);
289extern void _mm_lfence(void);
290extern void _mm_mfence(void);
291extern __m128i _mm_packs_epi16(__m128i a, __m128i b);
292extern __m128i _mm_packs_epi32(__m128i a, __m128i b);
293extern __m128i _mm_packus_epi16(__m128i a, __m128i b);
294extern int _mm_extract_epi16(__m128i a, int imm);
295extern __m128i _mm_insert_epi16(__m128i a, int b, int imm);
296extern int _mm_movemask_epi8(__m128i a);
297extern __m128i _mm_shuffle_epi32(__m128i a, int imm);
298extern __m128i _mm_shufflelo_epi16(__m128i a, int imm);
299extern __m128i _mm_shufflehi_epi16(__m128i a, int imm);
300extern __m128i _mm_unpackhi_epi8(__m128i a, __m128i b);
301extern __m128i _mm_unpackhi_epi16(__m128i a, __m128i b);
302extern __m128i _mm_unpackhi_epi32(__m128i a, __m128i b);
303extern __m128i _mm_unpackhi_epi64(__m128i a, __m128i b);
304extern __m128i _mm_unpacklo_epi8(__m128i a, __m128i b);
305extern __m128i _mm_unpacklo_epi16(__m128i a, __m128i b);
306extern __m128i _mm_unpacklo_epi32(__m128i a, __m128i b);
307extern __m128i _mm_unpacklo_epi64(__m128i a, __m128i b);
308extern __m64 _mm_movepi64_pi64(__m128i a);
309extern __m128i _mm_movpi64_epi64(__m64 a);
310extern __m128i _mm_move_epi64(__m128i a);
311extern __m128d _mm_unpackhi_pd(__m128d a, __m128d b);
312extern __m128d _mm_unpacklo_pd(__m128d a, __m128d b);
313extern int _mm_movemask_pd(__m128d a);
314extern __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm);
315extern __m128 _mm_castpd_ps(__m128d a);
316extern __m128i _mm_castpd_si128(__m128d a);
317extern __m128d _mm_castps_pd(__m128 a);
318extern __m128i _mm_castps_si128(__m128 a);
319extern __m128 _mm_castsi128_ps(__m128i a);
320extern __m128d _mm_castsi128_pd(__m128i a);
321void _mm_pause(void);
322
323/* Alternate names */
324#define _mm_set_pd1(a) _mm_set1_pd(a)
325#define _mm_load_pd1(p) _mm_load1_pd(p)
326#define _mm_store_pd1(p, a) _mm_store1_pd((p), (a))
327#define _mm_bslli_si128 _mm_slli_si128
328#define _mm_bsrli_si128 _mm_srli_si128
329#define _mm_stream_si64 _mm_stream_si64x
330
331#if defined(_MSC_VER) && !defined(__clang__)
332
333#pragma intrinsic(_mm_add_sd)
334#pragma intrinsic(_mm_add_pd)
335#pragma intrinsic(_mm_sub_sd)
336#pragma intrinsic(_mm_sub_pd)
337#pragma intrinsic(_mm_mul_sd)
338#pragma intrinsic(_mm_mul_pd)
339#pragma intrinsic(_mm_div_sd)
340#pragma intrinsic(_mm_div_pd)
341#pragma intrinsic(_mm_sqrt_sd)
342#pragma intrinsic(_mm_sqrt_pd)
343#pragma intrinsic(_mm_min_sd)
344#pragma intrinsic(_mm_min_pd)
345#pragma intrinsic(_mm_max_sd)
346#pragma intrinsic(_mm_max_pd)
347#pragma intrinsic(_mm_and_pd)
348#pragma intrinsic(_mm_andnot_pd)
349#pragma intrinsic(_mm_or_pd)
350#pragma intrinsic(_mm_xor_pd)
351#pragma intrinsic(_mm_cmpeq_pd)
352#pragma intrinsic(_mm_cmplt_pd)
353#pragma intrinsic(_mm_cmple_pd)
354#pragma intrinsic(_mm_cmpgt_pd)
355#pragma intrinsic(_mm_cmpge_pd)
356#pragma intrinsic(_mm_cmpord_pd)
357#pragma intrinsic(_mm_cmpunord_pd)
358#pragma intrinsic(_mm_cmpneq_pd)
359#pragma intrinsic(_mm_cmpnlt_pd)
360#pragma intrinsic(_mm_cmpnle_pd)
361#pragma intrinsic(_mm_cmpngt_pd)
362#pragma intrinsic(_mm_cmpnge_pd)
363#pragma intrinsic(_mm_cmpeq_sd)
364#pragma intrinsic(_mm_cmplt_sd)
365#pragma intrinsic(_mm_cmple_sd)
366#pragma intrinsic(_mm_cmpgt_sd)
367#pragma intrinsic(_mm_cmpge_sd)
368#pragma intrinsic(_mm_cmpord_sd)
369#pragma intrinsic(_mm_cmpunord_sd)
370#pragma intrinsic(_mm_cmpneq_sd)
371#pragma intrinsic(_mm_cmpnlt_sd)
372#pragma intrinsic(_mm_cmpnle_sd)
373#pragma intrinsic(_mm_cmpngt_sd)
374#pragma intrinsic(_mm_cmpnge_sd)
375#pragma intrinsic(_mm_comieq_sd)
376#pragma intrinsic(_mm_comilt_sd)
377#pragma intrinsic(_mm_comile_sd)
378#pragma intrinsic(_mm_comigt_sd)
379#pragma intrinsic(_mm_comige_sd)
380#pragma intrinsic(_mm_comineq_sd)
381#pragma intrinsic(_mm_ucomieq_sd)
382#pragma intrinsic(_mm_ucomilt_sd)
383#pragma intrinsic(_mm_ucomile_sd)
384#pragma intrinsic(_mm_ucomigt_sd)
385#pragma intrinsic(_mm_ucomige_sd)
386#pragma intrinsic(_mm_ucomineq_sd)
387#pragma intrinsic(_mm_cvtpd_ps)
388#pragma intrinsic(_mm_cvtps_pd)
389#pragma intrinsic(_mm_cvtepi32_pd)
390#pragma intrinsic(_mm_cvtpd_epi32)
391#pragma intrinsic(_mm_cvtsd_si32)
392#pragma intrinsic(_mm_cvtsd_ss)
393#pragma intrinsic(_mm_cvtsi32_sd)
394#pragma intrinsic(_mm_cvtss_sd)
395#pragma intrinsic(_mm_cvttpd_epi32)
396#pragma intrinsic(_mm_cvttsd_si32)
397//#pragma intrinsic(_mm_cvtpd_pi32)
398//#pragma intrinsic(_mm_cvttpd_pi32)
399//#pragma intrinsic(_mm_cvtpi32_pd)
400#pragma intrinsic(_mm_cvtsd_f64)
401#pragma intrinsic(_mm_load_pd)
402#pragma intrinsic(_mm_load1_pd)
403#pragma intrinsic(_mm_loadr_pd)
404#pragma intrinsic(_mm_loadu_pd)
405//#pragma intrinsic(_mm_loadu_si64)
406//#pragma intrinsic(_mm_loadu_si32)
407//#pragma intrinsic(_mm_loadu_si16)
408#pragma intrinsic(_mm_load_sd)
409#pragma intrinsic(_mm_loadh_pd)
410#pragma intrinsic(_mm_loadl_pd)
411//#pragma intrinsic(_mm_undefined_pd)
412#pragma intrinsic(_mm_set_sd)
413#pragma intrinsic(_mm_set1_pd)
414#pragma intrinsic(_mm_set_pd)
415#pragma intrinsic(_mm_setr_pd)
416#pragma intrinsic(_mm_setzero_pd)
417#pragma intrinsic(_mm_move_sd)
418#pragma intrinsic(_mm_store_sd)
419#pragma intrinsic(_mm_store_pd)
420#pragma intrinsic(_mm_store1_pd)
421#pragma intrinsic(_mm_storeu_pd)
422#pragma intrinsic(_mm_storer_pd)
423#pragma intrinsic(_mm_storeh_pd)
424#pragma intrinsic(_mm_storel_pd)
425#pragma intrinsic(_mm_add_epi8)
426#pragma intrinsic(_mm_add_epi16)
427#pragma intrinsic(_mm_add_epi32)
428//#pragma intrinsic(_mm_add_si64)
429#pragma intrinsic(_mm_add_epi64)
430#pragma intrinsic(_mm_adds_epi8)
431#pragma intrinsic(_mm_adds_epi16)
432#pragma intrinsic(_mm_adds_epu8)
433#pragma intrinsic(_mm_adds_epu16)
434#pragma intrinsic(_mm_avg_epu8)
435#pragma intrinsic(_mm_avg_epu16)
436#pragma intrinsic(_mm_madd_epi16)
437#pragma intrinsic(_mm_max_epi16)
438#pragma intrinsic(_mm_max_epu8)
439#pragma intrinsic(_mm_min_epi16)
440#pragma intrinsic(_mm_min_epu8)
441#pragma intrinsic(_mm_mulhi_epi16)
442#pragma intrinsic(_mm_mulhi_epu16)
443#pragma intrinsic(_mm_mullo_epi16)
444//#pragma intrinsic(_mm_mul_su32)
445#pragma intrinsic(_mm_mul_epu32)
446#pragma intrinsic(_mm_sad_epu8)
447#pragma intrinsic(_mm_sub_epi8)
448#pragma intrinsic(_mm_sub_epi16)
449#pragma intrinsic(_mm_sub_epi32)
450//#pragma intrinsic(_mm_sub_si64)
451#pragma intrinsic(_mm_sub_epi64)
452#pragma intrinsic(_mm_subs_epi8)
453#pragma intrinsic(_mm_subs_epi16)
454#pragma intrinsic(_mm_subs_epu8)
455#pragma intrinsic(_mm_subs_epu16)
456#pragma intrinsic(_mm_and_si128)
457#pragma intrinsic(_mm_andnot_si128)
458#pragma intrinsic(_mm_or_si128)
459#pragma intrinsic(_mm_xor_si128)
460#pragma intrinsic(_mm_slli_si128)
461#pragma intrinsic(_mm_slli_epi16)
462#pragma intrinsic(_mm_sll_epi16)
463#pragma intrinsic(_mm_slli_epi32)
464#pragma intrinsic(_mm_sll_epi32)
465#pragma intrinsic(_mm_slli_epi64)
466#pragma intrinsic(_mm_sll_epi64)
467#pragma intrinsic(_mm_srai_epi16)
468#pragma intrinsic(_mm_sra_epi16)
469#pragma intrinsic(_mm_srai_epi32)
470#pragma intrinsic(_mm_sra_epi32)
471#pragma intrinsic(_mm_srli_si128)
472#pragma intrinsic(_mm_srli_epi16)
473#pragma intrinsic(_mm_srl_epi16)
474#pragma intrinsic(_mm_srli_epi32)
475#pragma intrinsic(_mm_srl_epi32)
476#pragma intrinsic(_mm_srli_epi64)
477#pragma intrinsic(_mm_srl_epi64)
478#pragma intrinsic(_mm_cmpeq_epi8)
479#pragma intrinsic(_mm_cmpeq_epi16)
480#pragma intrinsic(_mm_cmpeq_epi32)
481#pragma intrinsic(_mm_cmpgt_epi8)
482#pragma intrinsic(_mm_cmpgt_epi16)
483#pragma intrinsic(_mm_cmpgt_epi32)
484#pragma intrinsic(_mm_cmplt_epi8)
485#pragma intrinsic(_mm_cmplt_epi16)
486#pragma intrinsic(_mm_cmplt_epi32)
487#ifdef _M_AMD64
488#pragma intrinsic(_mm_cvtsi64_sd)
489#pragma intrinsic(_mm_cvtsd_si64)
490#pragma intrinsic(_mm_cvttsd_si64)
491#endif
492#pragma intrinsic(_mm_cvtepi32_ps)
493#pragma intrinsic(_mm_cvtps_epi32)
494#pragma intrinsic(_mm_cvttps_epi32)
495#pragma intrinsic(_mm_cvtsi32_si128)
496#ifdef _M_AMD64
497#pragma intrinsic(_mm_cvtsi64_si128)
498#endif
499#pragma intrinsic(_mm_cvtsi128_si32)
500#ifdef _M_AMD64
501#pragma intrinsic(_mm_cvtsi128_si64)
502#endif
503#pragma intrinsic(_mm_load_si128)
504#pragma intrinsic(_mm_loadu_si128)
505#pragma intrinsic(_mm_loadl_epi64)
506//#pragma intrinsic(_mm_undefined_si128)
507//#pragma intrinsic(_mm_set_epi64x)
508//#pragma intrinsic(_mm_set_epi64)
509#pragma intrinsic(_mm_set_epi32)
510#pragma intrinsic(_mm_set_epi16)
511#pragma intrinsic(_mm_set_epi8)
512//#pragma intrinsic(_mm_set1_epi64x)
513//#pragma intrinsic(_mm_set1_epi64)
514#pragma intrinsic(_mm_set1_epi32)
515#pragma intrinsic(_mm_set1_epi16)
516#pragma intrinsic(_mm_set1_epi8)
517#pragma intrinsic(_mm_setl_epi64)
518//#pragma intrinsic(_mm_setr_epi64)
519#pragma intrinsic(_mm_setr_epi32)
520#pragma intrinsic(_mm_setr_epi16)
521#pragma intrinsic(_mm_setr_epi8)
522#pragma intrinsic(_mm_setzero_si128)
523#pragma intrinsic(_mm_store_si128)
524#pragma intrinsic(_mm_storeu_si128)
525//#pragma intrinsic(_mm_storeu_si64)
526//#pragma intrinsic(_mm_storeu_si32)
527//#pragma intrinsic(_mm_storeu_si16)
528#pragma intrinsic(_mm_maskmoveu_si128)
529#pragma intrinsic(_mm_storel_epi64)
530#pragma intrinsic(_mm_stream_pd)
531#pragma intrinsic(_mm_stream_si128)
532#pragma intrinsic(_mm_stream_si32)
533#pragma intrinsic(_mm_clflush)
534#pragma intrinsic(_mm_lfence)
535#pragma intrinsic(_mm_mfence)
536#pragma intrinsic(_mm_packs_epi16)
537#pragma intrinsic(_mm_packs_epi32)
538#pragma intrinsic(_mm_packus_epi16)
539#pragma intrinsic(_mm_extract_epi16)
540#pragma intrinsic(_mm_insert_epi16)
541#pragma intrinsic(_mm_movemask_epi8)
542#pragma intrinsic(_mm_shuffle_epi32)
543#pragma intrinsic(_mm_shufflelo_epi16)
544#pragma intrinsic(_mm_shufflehi_epi16)
545#pragma intrinsic(_mm_unpackhi_epi8)
546#pragma intrinsic(_mm_unpackhi_epi16)
547#pragma intrinsic(_mm_unpackhi_epi32)
548#pragma intrinsic(_mm_unpackhi_epi64)
549#pragma intrinsic(_mm_unpacklo_epi8)
550#pragma intrinsic(_mm_unpacklo_epi16)
551#pragma intrinsic(_mm_unpacklo_epi32)
552#pragma intrinsic(_mm_unpacklo_epi64)
553//#pragma intrinsic(_mm_movepi64_pi64)
554//#pragma intrinsic(_mm_movpi64_epi64)
555#pragma intrinsic(_mm_move_epi64)
556#pragma intrinsic(_mm_unpackhi_pd)
557#pragma intrinsic(_mm_unpacklo_pd)
558#pragma intrinsic(_mm_movemask_pd)
559#pragma intrinsic(_mm_shuffle_pd)
560#pragma intrinsic(_mm_castpd_ps)
561#pragma intrinsic(_mm_castpd_si128)
562#pragma intrinsic(_mm_castps_pd)
563#pragma intrinsic(_mm_castps_si128)
564#pragma intrinsic(_mm_castsi128_ps)
565#pragma intrinsic(_mm_castsi128_pd)
566#pragma intrinsic(_mm_pause)
567
568#else /* _MSC_VER */
569
570/*
571 Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/emmintrin.h
572 Clang older version: https://github.com/llvm/llvm-project/blob/3ef88b31843e040c95f23ff2c3c206f1fa399c05/clang/lib/Headers/emmintrin.h
573 unikraft: https://github.com/unikraft/lib-intel-intrinsics/blob/staging/include/emmintrin.h
574*/
575
576__INTRIN_INLINE_SSE2 __m128d _mm_add_sd(__m128d a, __m128d b)
577{
578 a[0] += b[0];
579 return a;
580}
581
582__INTRIN_INLINE_SSE2 __m128d _mm_add_pd(__m128d a, __m128d b)
583{
584 return (__m128d)((__v2df)a + (__v2df)b);
585}
586
587__INTRIN_INLINE_SSE2 __m128d _mm_sub_sd(__m128d a, __m128d b)
588{
589 a[0] -= b[0];
590 return a;
591}
592
593__INTRIN_INLINE_SSE2 __m128d _mm_sub_pd(__m128d a, __m128d b)
594{
595 return (__m128d)((__v2df)a - (__v2df)b);
596}
597
598__INTRIN_INLINE_SSE2 __m128d _mm_mul_sd(__m128d a, __m128d b)
599{
600 a[0] *= b[0];
601 return a;
602}
603
604__INTRIN_INLINE_SSE2 __m128d _mm_mul_pd(__m128d a, __m128d b)
605{
606 return (__m128d)((__v2df)a * (__v2df)b);
607}
608
609__INTRIN_INLINE_SSE2 __m128d _mm_div_sd(__m128d a, __m128d b)
610{
611 a[0] /= b[0];
612 return a;
613}
614
615__INTRIN_INLINE_SSE2 __m128d _mm_div_pd(__m128d a, __m128d b)
616{
617 return (__m128d)((__v2df)a / (__v2df)b);
618}
619
620__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_sd(__m128d a, __m128d b)
621{
622 __m128d __c = __builtin_ia32_sqrtsd((__v2df)b);
623 return __extension__(__m128d){__c[0], a[1]};
624}
625
626__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_pd(__m128d a)
627{
628 return __builtin_ia32_sqrtpd((__v2df)a);
629}
630
631__INTRIN_INLINE_SSE2 __m128d _mm_min_sd(__m128d a, __m128d b)
632{
633 return __builtin_ia32_minsd((__v2df)a, (__v2df)b);
634}
635
636__INTRIN_INLINE_SSE2 __m128d _mm_min_pd(__m128d a, __m128d b)
637{
638 return __builtin_ia32_minpd((__v2df)a, (__v2df)b);
639}
640
641__INTRIN_INLINE_SSE2 __m128d _mm_max_sd(__m128d a, __m128d b)
642{
643 return __builtin_ia32_maxsd((__v2df)a, (__v2df)b);
644}
645
646__INTRIN_INLINE_SSE2 __m128d _mm_max_pd(__m128d a, __m128d b)
647{
648 return __builtin_ia32_maxpd((__v2df)a, (__v2df)b);
649}
650
651__INTRIN_INLINE_SSE2 __m128d _mm_and_pd(__m128d a, __m128d b)
652{
653 return (__m128d)((__v2du)a & (__v2du)b);
654}
655
656__INTRIN_INLINE_SSE2 __m128d _mm_andnot_pd(__m128d a, __m128d b)
657{
658 return (__m128d)(~(__v2du)a & (__v2du)b);
659}
660
661__INTRIN_INLINE_SSE2 __m128d _mm_or_pd(__m128d a, __m128d b)
662{
663 return (__m128d)((__v2du)a | (__v2du)b);
664}
665
666__INTRIN_INLINE_SSE2 __m128d _mm_xor_pd(__m128d a, __m128d b)
667{
668 return (__m128d)((__v2du)a ^ (__v2du)b);
669}
670
671__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
672{
673 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)a, (__v2df)b);
674}
675
676__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_pd(__m128d a, __m128d b)
677{
678 return (__m128d)__builtin_ia32_cmpltpd((__v2df)a, (__v2df)b);
679}
680
681__INTRIN_INLINE_SSE2 __m128d _mm_cmple_pd(__m128d a, __m128d b)
682{
683 return (__m128d)__builtin_ia32_cmplepd((__v2df)a, (__v2df)b);
684}
685
686__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
687{
688 return (__m128d)__builtin_ia32_cmpltpd((__v2df)b, (__v2df)a);
689}
690
691__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_pd(__m128d a, __m128d b)
692{
693 return (__m128d)__builtin_ia32_cmplepd((__v2df)b, (__v2df)a);
694}
695
696__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_pd(__m128d a, __m128d b)
697{
698 return (__m128d)__builtin_ia32_cmpordpd((__v2df)a, (__v2df)b);
699}
700
701__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
702{
703 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)a, (__v2df)b);
704}
705
706__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
707{
708 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)a, (__v2df)b);
709}
710
711__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
712{
713 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)a, (__v2df)b);
714}
715
716__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
717{
718 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)a, (__v2df)b);
719}
720
721__INTRIN_INLINE_SSE2 __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
722{
723 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)b, (__v2df)a);
724}
725
726__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
727{
728 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)b, (__v2df)a);
729}
730
731__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
732{
733 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)a, (__v2df)b);
734}
735
736__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_sd(__m128d a, __m128d b)
737{
738 return (__m128d)__builtin_ia32_cmpltsd((__v2df)a, (__v2df)b);
739}
740
741__INTRIN_INLINE_SSE2 __m128d _mm_cmple_sd(__m128d a, __m128d b)
742{
743 return (__m128d)__builtin_ia32_cmplesd((__v2df)a, (__v2df)b);
744}
745
746__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
747{
748 __m128d __c = __builtin_ia32_cmpltsd((__v2df)b, (__v2df)a);
749 return __extension__(__m128d){__c[0], a[1]};
750}
751
752__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_sd(__m128d a, __m128d b)
753{
754 __m128d __c = __builtin_ia32_cmplesd((__v2df)b, (__v2df)a);
755 return __extension__(__m128d){__c[0], a[1]};
756}
757
758__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_sd(__m128d a, __m128d b)
759{
760 return (__m128d)__builtin_ia32_cmpordsd((__v2df)a, (__v2df)b);
761}
762
763__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
764{
765 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)a, (__v2df)b);
766}
767
768__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
769{
770 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)a, (__v2df)b);
771}
772
773__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
774{
775 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)a, (__v2df)b);
776}
777
778__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
779{
780 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)a, (__v2df)b);
781}
782
783__INTRIN_INLINE_SSE2 __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
784{
785 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)b, (__v2df)a);
786 return __extension__(__m128d){__c[0], a[1]};
787}
788
789__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
790{
791 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)b, (__v2df)a);
792 return __extension__(__m128d){__c[0], a[1]};
793}
794
795__INTRIN_INLINE_SSE2 int _mm_comieq_sd(__m128d a, __m128d b)
796{
797 return __builtin_ia32_comisdeq((__v2df)a, (__v2df)b);
798}
799
800__INTRIN_INLINE_SSE2 int _mm_comilt_sd(__m128d a, __m128d b)
801{
802 return __builtin_ia32_comisdlt((__v2df)a, (__v2df)b);
803}
804
805__INTRIN_INLINE_SSE2 int _mm_comile_sd(__m128d a, __m128d b)
806{
807 return __builtin_ia32_comisdle((__v2df)a, (__v2df)b);
808}
809
810__INTRIN_INLINE_SSE2 int _mm_comigt_sd(__m128d a, __m128d b)
811{
812 return __builtin_ia32_comisdgt((__v2df)a, (__v2df)b);
813}
814
815__INTRIN_INLINE_SSE2 int _mm_comige_sd(__m128d a, __m128d b)
816{
817 return __builtin_ia32_comisdge((__v2df)a, (__v2df)b);
818}
819
820__INTRIN_INLINE_SSE2 int _mm_comineq_sd(__m128d a, __m128d b)
821{
822 return __builtin_ia32_comisdneq((__v2df)a, (__v2df)b);
823}
824
825__INTRIN_INLINE_SSE2 int _mm_ucomieq_sd(__m128d a, __m128d b)
826{
827 return __builtin_ia32_ucomisdeq((__v2df)a, (__v2df)b);
828}
829
830__INTRIN_INLINE_SSE2 int _mm_ucomilt_sd(__m128d a, __m128d b)
831{
832 return __builtin_ia32_ucomisdlt((__v2df)a, (__v2df)b);
833}
834
835__INTRIN_INLINE_SSE2 int _mm_ucomile_sd(__m128d a, __m128d b)
836{
837 return __builtin_ia32_ucomisdle((__v2df)a, (__v2df)b);
838}
839
840__INTRIN_INLINE_SSE2 int _mm_ucomigt_sd(__m128d a, __m128d b)
841{
842 return __builtin_ia32_ucomisdgt((__v2df)a, (__v2df)b);
843}
844
845__INTRIN_INLINE_SSE2 int _mm_ucomige_sd(__m128d a, __m128d b)
846{
847 return __builtin_ia32_ucomisdge((__v2df)a, (__v2df)b);
848}
849
850__INTRIN_INLINE_SSE2 int _mm_ucomineq_sd(__m128d a, __m128d b)
851{
852 return __builtin_ia32_ucomisdneq((__v2df)a, (__v2df)b);
853}
854
855__INTRIN_INLINE_SSE2 __m128 _mm_cvtpd_ps(__m128d a)
856{
857 return __builtin_ia32_cvtpd2ps((__v2df)a);
858}
859
860__INTRIN_INLINE_SSE2 __m128d _mm_cvtps_pd(__m128 a)
861{
862#if HAS_BUILTIN(__builtin_convertvector)
863 return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4sf)a, (__v4sf)a, 0, 1), __v2df);
864#else
865 return __builtin_ia32_cvtps2pd(a);
866#endif
867}
868
869__INTRIN_INLINE_SSE2 __m128d _mm_cvtepi32_pd(__m128i a)
870{
871#if HAS_BUILTIN(__builtin_convertvector)
872 return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4si)a, (__v4si)a, 0, 1), __v2df);
873#else
874 return __builtin_ia32_cvtdq2pd((__v4si)a);
875#endif
876}
877
878__INTRIN_INLINE_SSE2 __m128i _mm_cvtpd_epi32(__m128d a)
879{
880 return (__m128i)__builtin_ia32_cvtpd2dq((__v2df)a);
881}
882
883__INTRIN_INLINE_SSE2 int _mm_cvtsd_si32(__m128d a)
884{
885 return __builtin_ia32_cvtsd2si((__v2df)a);
886}
887
888__INTRIN_INLINE_SSE2 __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
889{
890 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)a, (__v2df)b);
891}
892
893__INTRIN_INLINE_SSE2 __m128d _mm_cvtsi32_sd(__m128d a,
894 int b)
895{
896 a[0] = b;
897 return a;
898}
899
900__INTRIN_INLINE_SSE2 __m128d _mm_cvtss_sd(__m128d a, __m128 b)
901{
902 a[0] = b[0];
903 return a;
904}
905
906__INTRIN_INLINE_SSE2 __m128i _mm_cvttpd_epi32(__m128d a)
907{
908 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)a);
909}
910
911__INTRIN_INLINE_SSE2 int _mm_cvttsd_si32(__m128d a)
912{
913 return __builtin_ia32_cvttsd2si((__v2df)a);
914}
915
916__INTRIN_INLINE_MMXSSE2 __m64 _mm_cvtpd_pi32(__m128d a)
917{
918 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)a);
919}
920
921__INTRIN_INLINE_MMXSSE2 __m64 _mm_cvttpd_pi32(__m128d a)
922{
923 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)a);
924}
925
926__INTRIN_INLINE_MMXSSE2 __m128d _mm_cvtpi32_pd(__m64 a)
927{
928 return __builtin_ia32_cvtpi2pd((__v2si)a);
929}
930
931__INTRIN_INLINE_SSE2 double _mm_cvtsd_f64(__m128d a)
932{
933 return a[0];
934}
935
936__INTRIN_INLINE_SSE2 __m128d _mm_load_pd(double const *dp)
937{
938 return *(const __m128d *)dp;
939}
940
941__INTRIN_INLINE_SSE2 __m128d _mm_load1_pd(double const *dp)
942{
943 struct __mm_load1_pd_struct {
944 double __u;
945 } __attribute__((__packed__, __may_alias__));
946 double __u = ((const struct __mm_load1_pd_struct *)dp)->__u;
947 return __extension__(__m128d){__u, __u};
948}
949
950// GCC:
951/* Create a selector for use with the SHUFPD instruction. */
952#define _MM_SHUFFLE2(fp1,fp0) \
953 (((fp1) << 1) | (fp0))
954
955__INTRIN_INLINE_SSE2 __m128d _mm_loadr_pd(double const *dp)
956{
957#if HAS_BUILTIN(__builtin_shufflevector)
958 __m128d u = *(const __m128d *)dp;
959 return __builtin_shufflevector((__v2df)u, (__v2df)u, 1, 0);
960#else
961 return (__m128d){ dp[1], dp[0] };
962#endif
963}
964
965__INTRIN_INLINE_SSE2 __m128d _mm_loadu_pd(double const *dp)
966{
967 struct __loadu_pd {
968 __m128d_u __v;
969 } __attribute__((__packed__, __may_alias__));
970 return ((const struct __loadu_pd *)dp)->__v;
971}
972
973__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si64(void const *a)
974{
975 struct __loadu_si64 {
976 long long __v;
977 } __attribute__((__packed__, __may_alias__));
978 long long __u = ((const struct __loadu_si64 *)a)->__v;
979 return __extension__(__m128i)(__v2di){__u, 0LL};
980}
981
982__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si32(void const *a)
983{
984 struct __loadu_si32 {
985 int __v;
986 } __attribute__((__packed__, __may_alias__));
987 int __u = ((const struct __loadu_si32 *)a)->__v;
988 return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
989}
990
991__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si16(void const *a)
992{
993 struct __loadu_si16 {
994 short __v;
995 } __attribute__((__packed__, __may_alias__));
996 short __u = ((const struct __loadu_si16 *)a)->__v;
997 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
998}
999
1000__INTRIN_INLINE_SSE2 __m128d _mm_load_sd(double const *dp)
1001{
1002 struct __mm_load_sd_struct {
1003 double __u;
1004 } __attribute__((__packed__, __may_alias__));
1005 double __u = ((const struct __mm_load_sd_struct *)dp)->__u;
1006 return __extension__(__m128d){__u, 0};
1007}
1008
1009__INTRIN_INLINE_SSE2 __m128d _mm_loadh_pd(__m128d a, double const *dp)
1010{
1011 struct __mm_loadh_pd_struct {
1012 double __u;
1013 } __attribute__((__packed__, __may_alias__));
1014 double __u = ((const struct __mm_loadh_pd_struct *)dp)->__u;
1015 return __extension__(__m128d){a[0], __u};
1016}
1017
1018__INTRIN_INLINE_SSE2 __m128d _mm_loadl_pd(__m128d a, double const *dp)
1019{
1020 struct __mm_loadl_pd_struct {
1021 double __u;
1022 } __attribute__((__packed__, __may_alias__));
1023 double __u = ((const struct __mm_loadl_pd_struct *)dp)->__u;
1024 return __extension__(__m128d){__u, a[1]};
1025}
1026
1027__INTRIN_INLINE_SSE2 __m128d _mm_undefined_pd(void)
1028{
1029#if HAS_BUILTIN(__builtin_ia32_undef128)
1030 return (__m128d)__builtin_ia32_undef128();
1031#else
1032 __m128d undef = undef;
1033 return undef;
1034#endif
1035}
1036
1037__INTRIN_INLINE_SSE2 __m128d _mm_set_sd(double w)
1038{
1039 return __extension__(__m128d){w, 0};
1040}
1041
1042__INTRIN_INLINE_SSE2 __m128d _mm_set1_pd(double w)
1043{
1044 return __extension__(__m128d){w, w};
1045}
1046
1047__INTRIN_INLINE_SSE2 __m128d _mm_set_pd(double w, double x)
1048{
1049 return __extension__(__m128d){x, w};
1050}
1051
1052__INTRIN_INLINE_SSE2 __m128d _mm_setr_pd(double w, double x)
1053{
1054 return __extension__(__m128d){w, x};
1055}
1056
1057__INTRIN_INLINE_SSE2 __m128d _mm_setzero_pd(void)
1058{
1059 return __extension__(__m128d){0, 0};
1060}
1061
1062__INTRIN_INLINE_SSE2 __m128d _mm_move_sd(__m128d a, __m128d b)
1063{
1064 a[0] = b[0];
1065 return a;
1066}
1067
1068__INTRIN_INLINE_SSE2 void _mm_store_sd(double *dp, __m128d a)
1069{
1070 struct __mm_store_sd_struct {
1071 double __u;
1072 } __attribute__((__packed__, __may_alias__));
1073 ((struct __mm_store_sd_struct *)dp)->__u = a[0];
1074}
1075
1076__INTRIN_INLINE_SSE2 void _mm_store_pd(double *dp, __m128d a)
1077{
1078 *(__m128d *)dp = a;
1079}
1080
1081__INTRIN_INLINE_SSE2 void _mm_store1_pd(double *dp, __m128d a)
1082{
1083#if HAS_BUILTIN(__builtin_shufflevector)
1084 a = __builtin_shufflevector((__v2df)a, (__v2df)a, 0, 0);
1085 _mm_store_pd(dp, a);
1086#else
1087 dp[0] = a[0];
1088 dp[1] = a[0];
1089#endif
1090}
1091
1092__INTRIN_INLINE_SSE2 void _mm_storeu_pd(double *dp, __m128d a)
1093{
1094 struct __storeu_pd {
1095 __m128d_u __v;
1096 } __attribute__((__packed__, __may_alias__));
1097 ((struct __storeu_pd *)dp)->__v = a;
1098}
1099
1100__INTRIN_INLINE_SSE2 void _mm_storer_pd(double *dp, __m128d a)
1101{
1102#if HAS_BUILTIN(__builtin_shufflevector)
1103 a = __builtin_shufflevector((__v2df)a, (__v2df)a, 1, 0);
1104 *(__m128d *)dp = a;
1105#else
1106 dp[0] = a[1];
1107 dp[1] = a[0];
1108#endif
1109}
1110
1111__INTRIN_INLINE_SSE2 void _mm_storeh_pd(double *dp, __m128d a)
1112{
1113 struct __mm_storeh_pd_struct {
1114 double __u;
1115 } __attribute__((__packed__, __may_alias__));
1116 ((struct __mm_storeh_pd_struct *)dp)->__u = a[1];
1117}
1118
1119__INTRIN_INLINE_SSE2 void _mm_storel_pd(double *dp, __m128d a)
1120{
1121 struct __mm_storeh_pd_struct {
1122 double __u;
1123 } __attribute__((__packed__, __may_alias__));
1124 ((struct __mm_storeh_pd_struct *)dp)->__u = a[0];
1125}
1126
1127__INTRIN_INLINE_SSE2 __m128i _mm_add_epi8(__m128i a, __m128i b)
1128{
1129 return (__m128i)((__v16qu)a + (__v16qu)b);
1130}
1131
1132__INTRIN_INLINE_SSE2 __m128i _mm_add_epi16(__m128i a, __m128i b)
1133{
1134 return (__m128i)((__v8hu)a + (__v8hu)b);
1135}
1136
1137__INTRIN_INLINE_SSE2 __m128i _mm_add_epi32(__m128i a, __m128i b)
1138{
1139 return (__m128i)((__v4su)a + (__v4su)b);
1140}
1141
1142__INTRIN_INLINE_MMXSSE2 __m64 _mm_add_si64(__m64 a, __m64 b)
1143{
1144 return (__m64)__builtin_ia32_paddq((__v1di)a, (__v1di)b);
1145}
1146
1147__INTRIN_INLINE_SSE2 __m128i _mm_add_epi64(__m128i a, __m128i b)
1148{
1149 return (__m128i)((__v2du)a + (__v2du)b);
1150}
1151
1152__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi8(__m128i a, __m128i b)
1153{
1154#if HAS_BUILTIN(__builtin_elementwise_add_sat)
1155 return (__m128i)__builtin_elementwise_add_sat((__v16qs)a, (__v16qs)b);
1156#else
1157 return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
1158#endif
1159}
1160
1161__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi16(__m128i a, __m128i b)
1162{
1163#if HAS_BUILTIN(__builtin_elementwise_add_sat)
1164 return (__m128i)__builtin_elementwise_add_sat((__v8hi)a, (__v8hi)b);
1165#else
1166 return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
1167#endif
1168}
1169
1170__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu8(__m128i a, __m128i b)
1171{
1172#if HAS_BUILTIN(__builtin_elementwise_add_sat)
1173 return (__m128i)__builtin_elementwise_add_sat((__v16qu)a, (__v16qu)b);
1174#else
1175 return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
1176#endif
1177}
1178
1179__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu16(__m128i a, __m128i b)
1180{
1181#if HAS_BUILTIN(__builtin_elementwise_add_sat)
1182 return (__m128i)__builtin_elementwise_add_sat((__v8hu)a, (__v8hu)b);
1183#else
1184 return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
1185#endif
1186}
1187
1188__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu8(__m128i a, __m128i b)
1189{
1190 return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
1191}
1192
1193__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu16(__m128i a, __m128i b)
1194{
1195 return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
1196}
1197
1198__INTRIN_INLINE_SSE2 __m128i _mm_madd_epi16(__m128i a, __m128i b)
1199{
1200 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
1201}
1202
1203__INTRIN_INLINE_SSE2 __m128i _mm_max_epi16(__m128i a, __m128i b)
1204{
1205#if HAS_BUILTIN(__builtin_elementwise_max)
1206 return (__m128i)__builtin_elementwise_max((__v8hi)a, (__v8hi)b);
1207#else
1208 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
1209#endif
1210}
1211
1212__INTRIN_INLINE_SSE2 __m128i _mm_max_epu8(__m128i a, __m128i b)
1213{
1214#if HAS_BUILTIN(__builtin_elementwise_max)
1215 return (__m128i)__builtin_elementwise_max((__v16qu)a, (__v16qu)b);
1216#else
1217 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
1218#endif
1219}
1220
1221__INTRIN_INLINE_SSE2 __m128i _mm_min_epi16(__m128i a, __m128i b)
1222{
1223#if HAS_BUILTIN(__builtin_elementwise_min)
1224 return (__m128i)__builtin_elementwise_min((__v8hi)a, (__v8hi)b);
1225#else
1226 return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
1227#endif
1228}
1229
1230__INTRIN_INLINE_SSE2 __m128i _mm_min_epu8(__m128i a, __m128i b)
1231{
1232#if HAS_BUILTIN(__builtin_elementwise_min)
1233 return (__m128i)__builtin_elementwise_min((__v16qu)a, (__v16qu)b);
1234#else
1235 return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
1236#endif
1237}
1238
1239__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
1240{
1241 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
1242}
1243
1244__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
1245{
1246 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
1247}
1248
1249__INTRIN_INLINE_SSE2 __m128i _mm_mullo_epi16(__m128i a, __m128i b)
1250{
1251 return (__m128i)((__v8hu)a * (__v8hu)b);
1252}
1253
1254__INTRIN_INLINE_MMXSSE2 __m64 _mm_mul_su32(__m64 a, __m64 b)
1255{
1256 return (__m64)__builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
1257}
1258
1259__INTRIN_INLINE_SSE2 __m128i _mm_mul_epu32(__m128i a, __m128i b)
1260{
1261 return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
1262}
1263
1264__INTRIN_INLINE_SSE2 __m128i _mm_sad_epu8(__m128i a, __m128i b)
1265{
1266 return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
1267}
1268
1269__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi8(__m128i a, __m128i b)
1270{
1271 return (__m128i)((__v16qu)a - (__v16qu)b);
1272}
1273
1274__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi16(__m128i a, __m128i b)
1275{
1276 return (__m128i)((__v8hu)a - (__v8hu)b);
1277}
1278
1279__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi32(__m128i a, __m128i b)
1280{
1281 return (__m128i)((__v4su)a - (__v4su)b);
1282}
1283
1284__INTRIN_INLINE_MMXSSE2 __m64 _mm_sub_si64(__m64 a, __m64 b)
1285{
1286 return (__m64)__builtin_ia32_psubq((__v1di)a, (__v1di)b);
1287}
1288
1289__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi64(__m128i a, __m128i b)
1290{
1291 return (__m128i)((__v2du)a - (__v2du)b);
1292}
1293
1294__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi8(__m128i a, __m128i b)
1295{
1296#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
1297 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)a, (__v16qs)b);
1298#else
1299 return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
1300#endif
1301}
1302
1303__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi16(__m128i a, __m128i b)
1304{
1305#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
1306 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)a, (__v8hi)b);
1307#else
1308 return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
1309#endif
1310}
1311
1312__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu8(__m128i a, __m128i b)
1313{
1314#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
1315 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)a, (__v16qu)b);
1316#else
1317 return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
1318#endif
1319}
1320
1321__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu16(__m128i a, __m128i b)
1322{
1323#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
1324 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)a, (__v8hu)b);
1325#else
1326 return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
1327#endif
1328}
1329
1330__INTRIN_INLINE_SSE2 __m128i _mm_and_si128(__m128i a, __m128i b)
1331{
1332 return (__m128i)((__v2du)a & (__v2du)b);
1333}
1334
1335__INTRIN_INLINE_SSE2 __m128i _mm_andnot_si128(__m128i a, __m128i b)
1336{
1337 return (__m128i)(~(__v2du)a & (__v2du)b);
1338}
1339
1340__INTRIN_INLINE_SSE2 __m128i _mm_or_si128(__m128i a, __m128i b)
1341{
1342 return (__m128i)((__v2du)a | (__v2du)b);
1343}
1344
1345__INTRIN_INLINE_SSE2 __m128i _mm_xor_si128(__m128i a, __m128i b)
1346{
1347 return (__m128i)((__v2du)a ^ (__v2du)b);
1348}
1349
1350#ifdef __clang__
1351#define _mm_slli_si128(a, imm) \
1352 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
1353#else
1354__INTRIN_INLINE_SSE2 __m128i _mm_slli_si128(__m128i a, const int imm)
1355{
1356 return (__m128i)__builtin_ia32_pslldqi128(a, imm * 8);
1357}
1358#endif
1359
1360__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi16(__m128i a, int count)
1361{
1362 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
1363}
1364
1365__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi16(__m128i a, __m128i count)
1366{
1367 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
1368}
1369
1370__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi32(__m128i a, int count)
1371{
1372 return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
1373}
1374
1375__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi32(__m128i a, __m128i count)
1376{
1377 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
1378}
1379
1380__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi64(__m128i a, int count)
1381{
1382 return __builtin_ia32_psllqi128((__v2di)a, count);
1383}
1384
1385__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi64(__m128i a, __m128i count)
1386{
1387 return __builtin_ia32_psllq128((__v2di)a, (__v2di)count);
1388}
1389
1390__INTRIN_INLINE_SSE2 __m128i _mm_srai_epi16(__m128i a, int count)
1391{
1392 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
1393}
1394
1395__INTRIN_INLINE_SSE2 __m128i _mm_sra_epi16(__m128i a, __m128i count)
1396{
1397 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
1398}
1399
1400__INTRIN_INLINE_SSE2 __m128i _mm_srai_epi32(__m128i a, int count)
1401{
1402 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
1403}
1404
1405__INTRIN_INLINE_SSE2 __m128i _mm_sra_epi32(__m128i a, __m128i count)
1406{
1407 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
1408}
1409
1410#ifdef __clang__
1411#define _mm_srli_si128(a, imm) \
1412 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
1413#else
1414__INTRIN_INLINE_SSE2 __m128i _mm_srli_si128(__m128i a, const int imm)
1415{
1416 return (__m128i)__builtin_ia32_psrldqi128(a, imm * 8);
1417}
1418#endif
1419
1420__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi16(__m128i a, int count)
1421{
1422 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
1423}
1424
1425__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi16(__m128i a, __m128i count)
1426{
1427 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
1428}
1429
1430__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi32(__m128i a, int count)
1431{
1432 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
1433}
1434
1435__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi32(__m128i a, __m128i count)
1436{
1437 return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
1438}
1439
1440__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi64(__m128i a, int count)
1441{
1442 return __builtin_ia32_psrlqi128((__v2di)a, count);
1443}
1444
1445__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi64(__m128i a, __m128i count)
1446{
1447 return __builtin_ia32_psrlq128((__v2di)a, (__v2di)count);
1448}
1449
1450__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
1451{
1452 return (__m128i)((__v16qi)a == (__v16qi)b);
1453}
1454
1455__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
1456{
1457 return (__m128i)((__v8hi)a == (__v8hi)b);
1458}
1459
1460__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
1461{
1462 return (__m128i)((__v4si)a == (__v4si)b);
1463}
1464
1465__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
1466{
1467 /* This function always performs a signed comparison, but __v16qi is a char
1468 which may be signed or unsigned, so use __v16qs. */
1469 return (__m128i)((__v16qs)a > (__v16qs)b);
1470}
1471
1472__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
1473{
1474 return (__m128i)((__v8hi)a > (__v8hi)b);
1475}
1476
1477__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
1478{
1479 return (__m128i)((__v4si)a > (__v4si)b);
1480}
1481
1482__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
1483{
1484 return _mm_cmpgt_epi8(b, a);
1485}
1486
1487__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
1488{
1489 return _mm_cmpgt_epi16(b, a);
1490}
1491
1492__INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
1493{
1494 return _mm_cmpgt_epi32(b, a);
1495}
1496
1497#ifdef _M_AMD64
1498
1499__INTRIN_INLINE_SSE2 __m128d _mm_cvtsi64_sd(__m128d a, long long b)
1500{
1501 a[0] = b;
1502 return a;
1503}
1504
1505__INTRIN_INLINE_SSE2 long long _mm_cvtsd_si64(__m128d a)
1506{
1507 return __builtin_ia32_cvtsd2si64((__v2df)a);
1508}
1509
1510__INTRIN_INLINE_SSE2 long long _mm_cvttsd_si64(__m128d a)
1511{
1512 return __builtin_ia32_cvttsd2si64((__v2df)a);
1513}
1514#endif
1515
1516__INTRIN_INLINE_SSE2 __m128 _mm_cvtepi32_ps(__m128i a)
1517{
1518#if HAS_BUILTIN(__builtin_convertvector)
1519 return (__m128)__builtin_convertvector((__v4si)a, __v4sf);
1520#else
1521 return __builtin_ia32_cvtdq2ps((__v4si)a);
1522#endif
1523}
1524
1525__INTRIN_INLINE_SSE2 __m128i _mm_cvtps_epi32(__m128 a)
1526{
1527 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)a);
1528}
1529
1530__INTRIN_INLINE_SSE2 __m128i _mm_cvttps_epi32(__m128 a)
1531{
1532 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)a);
1533}
1534
1535__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi32_si128(int a)
1536{
1537 return __extension__(__m128i)(__v4si){a, 0, 0, 0};
1538}
1539
1540__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi64_si128(long long a)
1541{
1542 return __extension__(__m128i)(__v2di){a, 0};
1543}
1544
1545__INTRIN_INLINE_SSE2 int _mm_cvtsi128_si32(__m128i a)
1546{
1547 __v4si b = (__v4si)a;
1548 return b[0];
1549}
1550
1551__INTRIN_INLINE_SSE2 long long _mm_cvtsi128_si64(__m128i a)
1552{
1553 return a[0];
1554}
1555
1556__INTRIN_INLINE_SSE2 __m128i _mm_load_si128(__m128i const *p)
1557{
1558 return *p;
1559}
1560
1561__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si128(__m128i_u const *p)
1562{
1563 struct __loadu_si128 {
1564 __m128i_u __v;
1565 } __attribute__((__packed__, __may_alias__));
1566 return ((const struct __loadu_si128 *)p)->__v;
1567}
1568
1569__INTRIN_INLINE_SSE2 __m128i _mm_loadl_epi64(__m128i_u const *p)
1570{
1571 struct __mm_loadl_epi64_struct {
1572 long long __u;
1573 } __attribute__((__packed__, __may_alias__));
1574 return __extension__(__m128i){
1575 ((const struct __mm_loadl_epi64_struct *)p)->__u, 0};
1576}
1577
1578__INTRIN_INLINE_SSE2 __m128i _mm_undefined_si128(void)
1579{
1580#if HAS_BUILTIN(__builtin_ia32_undef128)
1581 return (__m128i)__builtin_ia32_undef128();
1582#else
1583 __m128i undef = undef;
1584 return undef;
1585#endif
1586}
1587
1588__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64x(long long q1, long long q0)
1589{
1590 return __extension__(__m128i)(__v2di){q0, q1};
1591}
1592
1593__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64(__m64 q1, __m64 q0)
1594{
1595 return _mm_set_epi64x((long long)q1, (long long)q0);
1596}
1597
1598__INTRIN_INLINE_SSE2 __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
1599{
1600 return __extension__(__m128i)(__v4si){i0, i1, i2, i3};
1601}
1602
1603__INTRIN_INLINE_SSE2 __m128i _mm_set_epi16(
1604 short w7, short w6, short w5, short w4,
1605 short w3, short w2, short w1, short w0)
1606{
1607 return __extension__(__m128i)(__v8hi){w0, w1, w2, w3, w4, w5, w6, w7};
1608}
1609
1610__INTRIN_INLINE_SSE2 __m128i _mm_set_epi8(
1611 char b15, char b14, char b13, char b12,
1612 char b11, char b10, char b9, char b8,
1613 char b7, char b6, char b5, char b4,
1614 char b3, char b2, char b1, char b0)
1615{
1616 return __extension__(__m128i)(__v16qi){
1617 b0, b1, b2, b3, b4, b5, b6, b7,
1618 b8, b9, b10, b11, b12, b13, b14, b15};
1619}
1620
1621__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64x(long long q)
1622{
1623 return _mm_set_epi64x(q, q);
1624}
1625
1626__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64(__m64 q)
1627{
1628 return _mm_set_epi64(q, q);
1629}
1630
1631__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi32(int i)
1632{
1633 return _mm_set_epi32(i, i, i, i);
1634}
1635
1636__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi16(short w)
1637{
1638 return _mm_set_epi16(w, w, w, w, w, w, w, w);
1639}
1640
1641__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi8(char b)
1642{
1643 return _mm_set_epi8(b, b, b, b, b, b, b, b, b, b, b,
1644 b, b, b, b, b);
1645}
1646
1647__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi64(__m64 q0, __m64 q1)
1648{
1649 return _mm_set_epi64(q1, q0);
1650}
1651
1652__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3)
1653{
1654 return _mm_set_epi32(i3, i2, i1, i0);
1655}
1656
1657__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi16(
1658 short w0, short w1, short w2, short w3,
1659 short w4, short w5, short w6, short w7)
1660{
1661 return _mm_set_epi16(w7, w6, w5, w4, w3, w2, w1, w0);
1662}
1663
1664__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi8(
1665 char b0, char b1, char b2, char b3,
1666 char b4, char b5, char b6, char b7,
1667 char b8, char b9, char b10, char b11,
1668 char b12, char b13, char b14, char b15)
1669{
1670 return _mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8,
1671 b7, b6, b5, b4, b3, b2, b1, b0);
1672}
1673
1674__INTRIN_INLINE_SSE2 __m128i _mm_setzero_si128(void)
1675{
1676 return __extension__(__m128i)(__v2di){0LL, 0LL};
1677}
1678
1679__INTRIN_INLINE_SSE2 void _mm_store_si128(__m128i *p, __m128i b)
1680{
1681 *p = b;
1682}
1683
1684__INTRIN_INLINE_SSE2 void _mm_storeu_si128(__m128i_u *p, __m128i b)
1685{
1686 struct __storeu_si128 {
1687 __m128i_u __v;
1688 } __attribute__((__packed__, __may_alias__));
1689 ((struct __storeu_si128 *)p)->__v = b;
1690}
1691
1692__INTRIN_INLINE_SSE2 void _mm_storeu_si64(void *p, __m128i b)
1693{
1694 struct __storeu_si64 {
1695 long long __v;
1696 } __attribute__((__packed__, __may_alias__));
1697 ((struct __storeu_si64 *)p)->__v = ((__v2di)b)[0];
1698}
1699
1700__INTRIN_INLINE_SSE2 void _mm_storeu_si32(void *p, __m128i b)
1701{
1702 struct __storeu_si32 {
1703 int __v;
1704 } __attribute__((__packed__, __may_alias__));
1705 ((struct __storeu_si32 *)p)->__v = ((__v4si)b)[0];
1706}
1707
1708__INTRIN_INLINE_SSE2 void _mm_storeu_si16(void *p, __m128i b)
1709{
1710 struct __storeu_si16 {
1711 short __v;
1712 } __attribute__((__packed__, __may_alias__));
1713 ((struct __storeu_si16 *)p)->__v = ((__v8hi)b)[0];
1714}
1715
1716__INTRIN_INLINE_SSE2 void _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1717{
1718 __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1719}
1720
1721__INTRIN_INLINE_SSE2 void _mm_storel_epi64(__m128i_u *p, __m128i a)
1722{
1723 struct __mm_storel_epi64_struct {
1724 long long __u;
1725 } __attribute__((__packed__, __may_alias__));
1726 ((struct __mm_storel_epi64_struct *)p)->__u = a[0];
1727}
1728
1729__INTRIN_INLINE_SSE2 void _mm_stream_pd(double *p, __m128d a)
1730{
1731#if HAS_BUILTIN(__builtin_nontemporal_store)
1732 __builtin_nontemporal_store((__v2df)a, (__v2df *)p);
1733#else
1734 __builtin_ia32_movntpd(p, a);
1735#endif
1736}
1737
1738__INTRIN_INLINE_SSE2 void _mm_stream_si128(__m128i *p, __m128i a)
1739{
1740#if HAS_BUILTIN(__builtin_nontemporal_store)
1741 __builtin_nontemporal_store((__v2di)a, (__v2di*)p);
1742#else
1743 __builtin_ia32_movntdq(p, a);
1744#endif
1745}
1746
1747__INTRIN_INLINE_SSE2 void _mm_stream_si32(int *p, int a)
1748{
1749 __builtin_ia32_movnti(p, a);
1750}
1751
1752#ifdef _M_AMD64
1753__INTRIN_INLINE_SSE2 void _mm_stream_si64(long long *p, long long a)
1754{
1755 __builtin_ia32_movnti64(p, a);
1756}
1757#endif
1758
1759void _mm_clflush(void const *p);
1760
1761void _mm_lfence(void);
1762
1763void _mm_mfence(void);
1764
1765__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi16(__m128i a, __m128i b)
1766{
1767 return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1768}
1769
1770__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi32(__m128i a, __m128i b)
1771{
1772 return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1773}
1774
1775__INTRIN_INLINE_SSE2 __m128i _mm_packus_epi16(__m128i a, __m128i b)
1776{
1777 return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1778}
1779
1780#define _mm_extract_epi16(a, imm) \
1781 ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
1782 (int)(imm)))
1783
1784#define _mm_insert_epi16(a, b, imm) \
1785 ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
1786 (int)(imm)))
1787
1788__INTRIN_INLINE_SSE2 int _mm_movemask_epi8(__m128i a)
1789{
1790 return __builtin_ia32_pmovmskb128((__v16qi)a);
1791}
1792
1793#define _mm_shuffle_epi32(a, imm) \
1794 ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
1795
1796#define _mm_shufflelo_epi16(a, imm) \
1797 ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
1798
1799#define _mm_shufflehi_epi16(a, imm) \
1800 ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
1801
1802__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
1803{
1804#if HAS_BUILTIN(__builtin_shufflevector)
1805 return (__m128i)__builtin_shufflevector(
1806 (__v16qi)a, (__v16qi)b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
1807 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
1808#else
1809 return (__m128i)__builtin_ia32_punpckhbw128((__v16qi)a, (__v16qi)b);
1810#endif
1811}
1812
1813__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
1814{
1815#if HAS_BUILTIN(__builtin_shufflevector)
1816 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8 + 4, 5,
1817 8 + 5, 6, 8 + 6, 7, 8 + 7);
1818#else
1819 return (__m128i)__builtin_ia32_punpckhwd128((__v8hi)a, (__v8hi)b);
1820#endif
1821}
1822
1823__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
1824{
1825#if HAS_BUILTIN(__builtin_shufflevector)
1826 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4 + 2, 3,
1827 4 + 3);
1828#else
1829 return (__m128i)__builtin_ia32_punpckhdq128((__v4si)a, (__v4si)b);
1830#endif
1831}
1832
1833__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
1834{
1835#if HAS_BUILTIN(__builtin_shufflevector)
1836 return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 1, 2 + 1);
1837#else
1838 return (__m128i)__builtin_ia32_punpckhqdq128((__v2di)a, (__v2di)b);
1839#endif
1840}
1841
1842__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
1843{
1844#if HAS_BUILTIN(__builtin_shufflevector)
1845 return (__m128i)__builtin_shufflevector(
1846 (__v16qi)a, (__v16qi)b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
1847 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
1848#else
1849 return (__m128i)__builtin_ia32_punpcklbw128((__v16qi)a, (__v16qi)b);
1850#endif
1851}
1852
1853__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
1854{
1855#if HAS_BUILTIN(__builtin_shufflevector)
1856 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8 + 0, 1,
1857 8 + 1, 2, 8 + 2, 3, 8 + 3);
1858#else
1859 return (__m128i)__builtin_ia32_punpcklwd128((__v8hi)a, (__v8hi)b);
1860#endif
1861}
1862
1863__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
1864{
1865#if HAS_BUILTIN(__builtin_shufflevector)
1866 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4 + 0, 1,
1867 4 + 1);
1868#else
1869 return (__m128i)__builtin_ia32_punpckldq128((__v4si)a, (__v4si)b);
1870#endif
1871}
1872
1873__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
1874{
1875#if HAS_BUILTIN(__builtin_shufflevector)
1876 return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 0, 2 + 0);
1877#else
1878 return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)a, (__v2di)b);
1879#endif
1880}
1881
1882__INTRIN_INLINE_SSE2 __m64 _mm_movepi64_pi64(__m128i a)
1883{
1884 return (__m64)a[0];
1885}
1886
1887__INTRIN_INLINE_SSE2 __m128i _mm_movpi64_epi64(__m64 a)
1888{
1889 return __extension__(__m128i)(__v2di){(long long)a, 0};
1890}
1891
1892__INTRIN_INLINE_SSE2 __m128i _mm_move_epi64(__m128i a)
1893{
1894#if HAS_BUILTIN(__builtin_shufflevector)
1895 return __builtin_shufflevector((__v2di)a, _mm_setzero_si128(), 0, 2);
1896#else
1897 return (__m128i)__builtin_ia32_movq128((__v2di)a);
1898#endif
1899}
1900
1901__INTRIN_INLINE_SSE2 __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
1902{
1903#if HAS_BUILTIN(__builtin_shufflevector)
1904 return __builtin_shufflevector((__v2df)a, (__v2df)b, 1, 2 + 1);
1905#else
1906 return (__m128d)__builtin_ia32_unpckhpd((__v2df)a, (__v2df)b);
1907#endif
1908}
1909
1910__INTRIN_INLINE_SSE2 __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
1911{
1912#if HAS_BUILTIN(__builtin_shufflevector)
1913 return __builtin_shufflevector((__v2df)a, (__v2df)b, 0, 2 + 0);
1914#else
1915 return (__m128d)__builtin_ia32_unpcklpd((__v2df)a, (__v2df)b);
1916#endif
1917}
1918
1919__INTRIN_INLINE_SSE2 int _mm_movemask_pd(__m128d a)
1920{
1921 return __builtin_ia32_movmskpd((__v2df)a);
1922}
1923
1924#define _mm_shuffle_pd(a, b, i) \
1925 ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
1926 (int)(i)))
1927
1928__INTRIN_INLINE_SSE2 __m128 _mm_castpd_ps(__m128d a)
1929{
1930 return (__m128)a;
1931}
1932
1933__INTRIN_INLINE_SSE2 __m128i _mm_castpd_si128(__m128d a)
1934{
1935 return (__m128i)a;
1936}
1937
1938__INTRIN_INLINE_SSE2 __m128d _mm_castps_pd(__m128 a)
1939{
1940 return (__m128d)a;
1941}
1942
1943__INTRIN_INLINE_SSE2 __m128i _mm_castps_si128(__m128 a)
1944{
1945 return (__m128i)a;
1946}
1947
1948__INTRIN_INLINE_SSE2 __m128 _mm_castsi128_ps(__m128i a)
1949{
1950 return (__m128)a;
1951}
1952
1953__INTRIN_INLINE_SSE2 __m128d _mm_castsi128_pd(__m128i a)
1954{
1955 return (__m128d)a;
1956}
1957
1958void _mm_pause(void);
1959
1960#endif /* _MSC_VER */
1961
1962#ifdef __cplusplus
1963} // extern "C"
1964#endif
1965
1966#endif /* _INCLUDED_EMM */