1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 *===-----------------------------------------------------------------------===
27 #include <xmmintrin.h>
/* Public 16-byte vector types: a vector of double and a vector of long long. */
29 typedef double __m128d __attribute__((__vector_size__(16)));
30 typedef long long __m128i __attribute__((__vector_size__(16)));
/* Element-typed 16-byte vector types; the intrinsics cast their operands to
 * these (for example __v2df for the arithmetic intrinsics). */
33 typedef double __v2df __attribute__ ((__vector_size__ (16)));
34 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
35 typedef short __v8hi __attribute__((__vector_size__(16)));
36 typedef char __v16qi __attribute__((__vector_size__(16)));
/* Unsigned element types; __v2du is the type used by the bitwise intrinsics. */
39 typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
40 typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
41 typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
43 /* We need an explicitly signed variant for char. Note that this shouldn't
44 * appear in the interface though. */
45 typedef signed char __v16qs __attribute__((__vector_size__(16)));
47 #include <f16cintrin.h>
49 /* Define the default attributes for the functions in this file: forced
 * inlining (__always_inline__), __nodebug__, and the "sse2" target. */
50 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
52 /// \brief Adds the lower double-precision values of both operands and returns the
53 /// sum in the lower 64 bits of the result. The upper 64 bits of the result
54 /// are copied from the upper double-precision value of the first operand.
56 /// \headerfile <x86intrin.h>
58 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
61 /// \param __a A 128-bit vector of [2 x double] containing one of the source operands.
63 /// \param __b A 128-bit vector of [2 x double] containing one of the source operands.
64 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
65 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied
66 /// from the upper 64 bits of the first source operand.
67 static __inline__ __m128d __DEFAULT_FN_ATTRS
68 _mm_add_sd(__m128d __a, __m128d __b)
74 /// \brief Adds two 128-bit vectors of [2 x double].
76 /// \headerfile <x86intrin.h>
78 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
81 /// \param __a A 128-bit vector of [2 x double] containing one of the source operands.
83 /// \param __b A 128-bit vector of [2 x double] containing one of the source operands.
84 /// \returns A 128-bit vector of [2 x double] containing the sums of both operands.
86 static __inline__ __m128d __DEFAULT_FN_ATTRS
87 _mm_add_pd(__m128d __a, __m128d __b)
89 return (__m128d)((__v2df)__a + (__v2df)__b);
92 /// \brief Subtracts the lower double-precision value of the second operand
93 /// from the lower double-precision value of the first operand and returns
94 /// the difference in the lower 64 bits of the result. The upper 64 bits of
95 /// the result are copied from the upper double-precision value of the first operand.
98 /// \headerfile <x86intrin.h>
100 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
103 /// \param __a A 128-bit vector of [2 x double] containing the minuend.
105 /// \param __b A 128-bit vector of [2 x double] containing the subtrahend.
106 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
107 /// difference of the lower 64 bits of both operands. The upper 64 bits are
108 /// copied from the upper 64 bits of the first source operand.
109 static __inline__ __m128d __DEFAULT_FN_ATTRS
110 _mm_sub_sd(__m128d __a, __m128d __b)
116 /// \brief Subtracts two 128-bit vectors of [2 x double].
118 /// \headerfile <x86intrin.h>
120 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
123 /// \param __a A 128-bit vector of [2 x double] containing the minuend.
125 /// \param __b A 128-bit vector of [2 x double] containing the subtrahend.
126 /// \returns A 128-bit vector of [2 x double] containing the differences between both operands.
128 static __inline__ __m128d __DEFAULT_FN_ATTRS
129 _mm_sub_pd(__m128d __a, __m128d __b)
131 return (__m128d)((__v2df)__a - (__v2df)__b);
134 /// \brief Multiplies the lower double-precision values of both operands and returns
135 /// the product in the lower 64 bits of the result. The upper 64 bits of the
136 /// result are copied from the upper double-precision value of the first operand.
139 /// \headerfile <x86intrin.h>
141 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
144 /// \param __a A 128-bit vector of [2 x double] containing one of the source operands.
146 /// \param __b A 128-bit vector of [2 x double] containing one of the source operands.
147 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
148 /// product of the lower 64 bits of both operands. The upper 64 bits are
149 /// copied from the upper 64 bits of the first source operand.
150 static __inline__ __m128d __DEFAULT_FN_ATTRS
151 _mm_mul_sd(__m128d __a, __m128d __b)
157 /// \brief Multiplies two 128-bit vectors of [2 x double].
159 /// \headerfile <x86intrin.h>
161 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
164 /// \param __a A 128-bit vector of [2 x double] containing one of the operands.
166 /// \param __b A 128-bit vector of [2 x double] containing one of the operands.
167 /// \returns A 128-bit vector of [2 x double] containing the products of both operands.
169 static __inline__ __m128d __DEFAULT_FN_ATTRS
170 _mm_mul_pd(__m128d __a, __m128d __b)
172 return (__m128d)((__v2df)__a * (__v2df)__b);
175 /// \brief Divides the lower double-precision value of the first operand by the
176 /// lower double-precision value of the second operand and returns the
177 /// quotient in the lower 64 bits of the result. The upper 64 bits of the
178 /// result are copied from the upper double-precision value of the first operand.
181 /// \headerfile <x86intrin.h>
183 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
186 /// \param __a A 128-bit vector of [2 x double] containing the dividend.
188 /// \param __b A 128-bit vector of [2 x double] containing the divisor.
189 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
190 /// quotient of the lower 64 bits of both operands. The upper 64 bits are
191 /// copied from the upper 64 bits of the first source operand.
192 static __inline__ __m128d __DEFAULT_FN_ATTRS
193 _mm_div_sd(__m128d __a, __m128d __b)
199 /// \brief Performs an element-by-element division of two 128-bit vectors of [2 x double].
202 /// \headerfile <x86intrin.h>
204 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
207 /// \param __a A 128-bit vector of [2 x double] containing the dividend.
209 /// \param __b A 128-bit vector of [2 x double] containing the divisor.
210 /// \returns A 128-bit vector of [2 x double] containing the quotients of both operands.
212 static __inline__ __m128d __DEFAULT_FN_ATTRS
213 _mm_div_pd(__m128d __a, __m128d __b)
215 return (__m128d)((__v2df)__a / (__v2df)__b);
218 /// \brief Calculates the square root of the lower double-precision value of
219 /// the second operand and returns it in the lower 64 bits of the result.
220 /// The upper 64 bits of the result are copied from the upper double-
221 /// precision value of the first operand.
223 /// \headerfile <x86intrin.h>
225 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
228 /// \param __a A 128-bit vector of [2 x double] containing one of the operands. The
229 /// upper 64 bits of this operand are copied to the upper 64 bits of the result.
232 /// \param __b A 128-bit vector of [2 x double] containing one of the operands. The
233 /// square root is calculated using the lower 64 bits of this operand.
234 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
235 /// square root of the lower 64 bits of operand \a __b, and whose upper 64
236 /// bits are copied from the upper 64 bits of operand \a __a.
237 static __inline__ __m128d __DEFAULT_FN_ATTRS
238 _mm_sqrt_sd(__m128d __a, __m128d __b)
/* Only __b is passed to the builtin; the result is then assembled from
 * element 0 of that value and element 1 of __a. */
240 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
241 return (__m128d) { __c[0], __a[1] };
244 /// \brief Calculates the square root of each of the two values stored in a
245 /// 128-bit vector of [2 x double].
247 /// \headerfile <x86intrin.h>
249 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
252 /// \param __a A 128-bit vector of [2 x double].
253 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
254 /// values in the operand.
255 static __inline__ __m128d __DEFAULT_FN_ATTRS
256 _mm_sqrt_pd(__m128d __a)
258 return __builtin_ia32_sqrtpd((__v2df)__a);
261 /// \brief Compares the lower 64-bit double-precision values of both operands, and
262 /// returns the lesser of the pair of values in the lower 64 bits of the
263 /// result. The upper 64 bits of the result are copied from the upper double-
264 /// precision value of the first operand.
266 /// \headerfile <x86intrin.h>
268 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
271 /// \param __a A 128-bit vector of [2 x double] containing one of the operands. The
272 /// lower 64 bits of this operand are used in the comparison.
274 /// \param __b A 128-bit vector of [2 x double] containing one of the operands. The
275 /// lower 64 bits of this operand are used in the comparison.
276 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
277 /// minimum value between both operands. The upper 64 bits are copied from
278 /// the upper 64 bits of the first source operand.
279 static __inline__ __m128d __DEFAULT_FN_ATTRS
280 _mm_min_sd(__m128d __a, __m128d __b)
282 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
285 /// \brief Performs element-by-element comparison of the two 128-bit vectors of
286 /// [2 x double] and returns the vector containing the lesser of each pair of values.
289 /// \headerfile <x86intrin.h>
291 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
294 /// \param __a A 128-bit vector of [2 x double] containing one of the operands.
296 /// \param __b A 128-bit vector of [2 x double] containing one of the operands.
297 /// \returns A 128-bit vector of [2 x double] containing the minimum values
298 /// between both operands.
299 static __inline__ __m128d __DEFAULT_FN_ATTRS
300 _mm_min_pd(__m128d __a, __m128d __b)
302 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
305 /// \brief Compares the lower 64-bit double-precision values of both operands, and
306 /// returns the greater of the pair of values in the lower 64 bits of the
307 /// result. The upper 64 bits of the result are copied from the upper double-
308 /// precision value of the first operand.
310 /// \headerfile <x86intrin.h>
312 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
315 /// \param __a A 128-bit vector of [2 x double] containing one of the operands. The
316 /// lower 64 bits of this operand are used in the comparison.
318 /// \param __b A 128-bit vector of [2 x double] containing one of the operands. The
319 /// lower 64 bits of this operand are used in the comparison.
320 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
321 /// maximum value between both operands. The upper 64 bits are copied from
322 /// the upper 64 bits of the first source operand.
323 static __inline__ __m128d __DEFAULT_FN_ATTRS
324 _mm_max_sd(__m128d __a, __m128d __b)
326 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
329 /// \brief Performs element-by-element comparison of the two 128-bit vectors of
330 /// [2 x double] and returns the vector containing the greater of each pair of values.
333 /// \headerfile <x86intrin.h>
335 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
338 /// \param __a A 128-bit vector of [2 x double] containing one of the operands.
340 /// \param __b A 128-bit vector of [2 x double] containing one of the operands.
341 /// \returns A 128-bit vector of [2 x double] containing the maximum values
342 /// between both operands.
343 static __inline__ __m128d __DEFAULT_FN_ATTRS
344 _mm_max_pd(__m128d __a, __m128d __b)
346 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
349 /// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double].
351 /// \headerfile <x86intrin.h>
353 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
356 /// \param __a A 128-bit vector of [2 x double] containing one of the source operands.
358 /// \param __b A 128-bit vector of [2 x double] containing one of the source operands.
359 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
360 /// values between both operands.
361 static __inline__ __m128d __DEFAULT_FN_ATTRS
362 _mm_and_pd(__m128d __a, __m128d __b)
/* The operands are cast to __v2du so that the integer & operator applies. */
364 return (__m128d)((__v2du)__a & (__v2du)__b);
367 /// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using
368 /// the one's complement of the values contained in the first source operand.
370 /// \headerfile <x86intrin.h>
372 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
375 /// \param __a A 128-bit vector of [2 x double] containing the left source operand. The
376 /// one's complement of this value is used in the bitwise AND.
378 /// \param __b A 128-bit vector of [2 x double] containing the right source operand.
379 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
380 /// values in the second operand and the one's complement of the first operand.
382 static __inline__ __m128d __DEFAULT_FN_ATTRS
383 _mm_andnot_pd(__m128d __a, __m128d __b)
/* Only __a is complemented; both operands are cast to __v2du first. */
385 return (__m128d)(~(__v2du)__a & (__v2du)__b);
388 /// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double].
390 /// \headerfile <x86intrin.h>
392 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
395 /// \param __a A 128-bit vector of [2 x double] containing one of the source operands.
397 /// \param __b A 128-bit vector of [2 x double] containing one of the source operands.
398 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
399 /// values between both operands.
400 static __inline__ __m128d __DEFAULT_FN_ATTRS
401 _mm_or_pd(__m128d __a, __m128d __b)
/* The operands are cast to __v2du so that the integer | operator applies. */
403 return (__m128d)((__v2du)__a | (__v2du)__b);
406 /// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double].
408 /// \headerfile <x86intrin.h>
410 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
413 /// \param __a A 128-bit vector of [2 x double] containing one of the source operands.
415 /// \param __b A 128-bit vector of [2 x double] containing one of the source operands.
416 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
417 /// values between both operands.
418 static __inline__ __m128d __DEFAULT_FN_ATTRS
419 _mm_xor_pd(__m128d __a, __m128d __b)
/* The operands are cast to __v2du so that the integer ^ operator applies. */
421 return (__m128d)((__v2du)__a ^ (__v2du)__b);
424 /// \brief Compares each of the corresponding double-precision values of the
425 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0h
426 /// for false, FFFFFFFFFFFFFFFFh for true.
428 /// \headerfile <x86intrin.h>
430 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
433 /// \param __a A 128-bit vector of [2 x double].
435 /// \param __b A 128-bit vector of [2 x double].
436 /// \returns A 128-bit vector containing the comparison results.
437 static __inline__ __m128d __DEFAULT_FN_ATTRS
438 _mm_cmpeq_pd(__m128d __a, __m128d __b)
440 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
443 /// \brief Compares each of the corresponding double-precision values of the
444 /// 128-bit vectors of [2 x double] to determine if the values in the first
445 /// operand are less than those in the second operand. Each comparison
446 /// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
448 /// \headerfile <x86intrin.h>
450 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
453 /// \param __a A 128-bit vector of [2 x double].
455 /// \param __b A 128-bit vector of [2 x double].
456 /// \returns A 128-bit vector containing the comparison results.
457 static __inline__ __m128d __DEFAULT_FN_ATTRS
458 _mm_cmplt_pd(__m128d __a, __m128d __b)
460 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
463 /// \brief Compares each of the corresponding double-precision values of the
464 /// 128-bit vectors of [2 x double] to determine if the values in the first
465 /// operand are less than or equal to those in the second operand. Each
466 /// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
468 /// \headerfile <x86intrin.h>
470 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
473 /// \param __a A 128-bit vector of [2 x double].
475 /// \param __b A 128-bit vector of [2 x double].
476 /// \returns A 128-bit vector containing the comparison results.
477 static __inline__ __m128d __DEFAULT_FN_ATTRS
478 _mm_cmple_pd(__m128d __a, __m128d __b)
480 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
483 /// \brief Compares each of the corresponding double-precision values of the
484 /// 128-bit vectors of [2 x double] to determine if the values in the first
485 /// operand are greater than those in the second operand. Each comparison
486 /// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
488 /// \headerfile <x86intrin.h>
490 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
493 /// \param __a A 128-bit vector of [2 x double].
495 /// \param __b A 128-bit vector of [2 x double].
496 /// \returns A 128-bit vector containing the comparison results.
497 static __inline__ __m128d __DEFAULT_FN_ATTRS
498 _mm_cmpgt_pd(__m128d __a, __m128d __b)
/* a > b is evaluated as b < a: the less-than builtin is called with the
 * operands swapped. */
500 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
503 /// \brief Compares each of the corresponding double-precision values of the
504 /// 128-bit vectors of [2 x double] to determine if the values in the first
505 /// operand are greater than or equal to those in the second operand. Each
506 /// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
508 /// \headerfile <x86intrin.h>
510 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
513 /// \param __a A 128-bit vector of [2 x double].
515 /// \param __b A 128-bit vector of [2 x double].
516 /// \returns A 128-bit vector containing the comparison results.
517 static __inline__ __m128d __DEFAULT_FN_ATTRS
518 _mm_cmpge_pd(__m128d __a, __m128d __b)
/* a >= b is evaluated as b <= a: the less-than-or-equal builtin is called
 * with the operands swapped. */
520 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
523 /// \brief Compares each of the corresponding double-precision values of the
524 /// 128-bit vectors of [2 x double] to determine if the values in the first
525 /// operand are ordered with respect to those in the second operand. A pair
526 /// of double-precision values are "ordered" with respect to each other if
527 /// neither value is a NaN. Each comparison yields 0h for false,
528 /// FFFFFFFFFFFFFFFFh for true.
530 /// \headerfile <x86intrin.h>
532 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
535 /// \param __a A 128-bit vector of [2 x double].
537 /// \param __b A 128-bit vector of [2 x double].
538 /// \returns A 128-bit vector containing the comparison results.
539 static __inline__ __m128d __DEFAULT_FN_ATTRS
540 _mm_cmpord_pd(__m128d __a, __m128d __b)
542 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
545 /// \brief Compares each of the corresponding double-precision values of the
546 /// 128-bit vectors of [2 x double] to determine if the values in the first
547 /// operand are unordered with respect to those in the second operand. A pair
548 /// of double-precision values are "unordered" with respect to each other if
549 /// one or both values are NaN. Each comparison yields 0h for false,
550 /// FFFFFFFFFFFFFFFFh for true.
552 /// \headerfile <x86intrin.h>
554 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c> instruction.
558 /// \param __a A 128-bit vector of [2 x double].
560 /// \param __b A 128-bit vector of [2 x double].
561 /// \returns A 128-bit vector containing the comparison results.
562 static __inline__ __m128d __DEFAULT_FN_ATTRS
563 _mm_cmpunord_pd(__m128d __a, __m128d __b)
565 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
568 /// \brief Compares each of the corresponding double-precision values of the
569 /// 128-bit vectors of [2 x double] to determine if the values in the first
570 /// operand are unequal to those in the second operand. Each comparison
571 /// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
573 /// \headerfile <x86intrin.h>
575 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
578 /// \param __a A 128-bit vector of [2 x double].
580 /// \param __b A 128-bit vector of [2 x double].
581 /// \returns A 128-bit vector containing the comparison results.
582 static __inline__ __m128d __DEFAULT_FN_ATTRS
583 _mm_cmpneq_pd(__m128d __a, __m128d __b)
585 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
588 /// \brief Compares each of the corresponding double-precision values of the
589 /// 128-bit vectors of [2 x double] to determine if the values in the first
590 /// operand are not less than those in the second operand. Each comparison
591 /// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
593 /// \headerfile <x86intrin.h>
595 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
598 /// \param __a A 128-bit vector of [2 x double].
600 /// \param __b A 128-bit vector of [2 x double].
601 /// \returns A 128-bit vector containing the comparison results.
602 static __inline__ __m128d __DEFAULT_FN_ATTRS
603 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
605 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
608 /// \brief Compares each of the corresponding double-precision values of the
609 /// 128-bit vectors of [2 x double] to determine if the values in the first
610 /// operand are not less than or equal to those in the second operand. Each
611 /// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
613 /// \headerfile <x86intrin.h>
615 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
618 /// \param __a A 128-bit vector of [2 x double].
620 /// \param __b A 128-bit vector of [2 x double].
621 /// \returns A 128-bit vector containing the comparison results.
622 static __inline__ __m128d __DEFAULT_FN_ATTRS
623 _mm_cmpnle_pd(__m128d __a, __m128d __b)
625 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
628 /// \brief Compares each of the corresponding double-precision values of the
629 /// 128-bit vectors of [2 x double] to determine if the values in the first
630 /// operand are not greater than those in the second operand. Each
631 /// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
633 /// \headerfile <x86intrin.h>
635 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
638 /// \param __a A 128-bit vector of [2 x double].
640 /// \param __b A 128-bit vector of [2 x double].
641 /// \returns A 128-bit vector containing the comparison results.
642 static __inline__ __m128d __DEFAULT_FN_ATTRS
643 _mm_cmpngt_pd(__m128d __a, __m128d __b)
/* "a not greater than b" is evaluated as "b not less than a": the
 * not-less-than builtin is called with the operands swapped. */
645 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
648 /// \brief Compares each of the corresponding double-precision values of the
649 /// 128-bit vectors of [2 x double] to determine if the values in the first
650 /// operand are not greater than or equal to those in the second operand.
651 /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
653 /// \headerfile <x86intrin.h>
655 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
658 /// \param __a A 128-bit vector of [2 x double].
660 /// \param __b A 128-bit vector of [2 x double].
661 /// \returns A 128-bit vector containing the comparison results.
662 static __inline__ __m128d __DEFAULT_FN_ATTRS
663 _mm_cmpnge_pd(__m128d __a, __m128d __b)
/* "a not greater than or equal to b" is evaluated as "b not less than or
 * equal to a": the builtin is called with the operands swapped. */
665 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
668 /// \brief Compares the lower double-precision floating-point values in each of
669 /// the two 128-bit floating-point vectors of [2 x double] for equality. The
670 /// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
672 /// \headerfile <x86intrin.h>
674 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
677 /// \param __a A 128-bit vector of [2 x double]. The lower double-precision value is
678 /// compared to the lower double-precision value of \a __b.
680 /// \param __b A 128-bit vector of [2 x double]. The lower double-precision value is
681 /// compared to the lower double-precision value of \a __a.
682 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
683 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
684 static __inline__ __m128d __DEFAULT_FN_ATTRS
685 _mm_cmpeq_sd(__m128d __a, __m128d __b)
687 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
690 /// \brief Compares the lower double-precision floating-point values in each of
691 /// the two 128-bit floating-point vectors of [2 x double] to determine if
692 /// the value in the first parameter is less than the corresponding value in
693 /// the second parameter. The comparison yields 0h for false,
694 /// FFFFFFFFFFFFFFFFh for true.
696 /// \headerfile <x86intrin.h>
698 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
701 /// \param __a A 128-bit vector of [2 x double]. The lower double-precision value is
702 /// compared to the lower double-precision value of \a __b.
704 /// \param __b A 128-bit vector of [2 x double]. The lower double-precision value is
705 /// compared to the lower double-precision value of \a __a.
706 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
707 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
708 static __inline__ __m128d __DEFAULT_FN_ATTRS
709 _mm_cmplt_sd(__m128d __a, __m128d __b)
711 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
714 /// \brief Compares the lower double-precision floating-point values in each of
715 /// the two 128-bit floating-point vectors of [2 x double] to determine if
716 /// the value in the first parameter is less than or equal to the
717 /// corresponding value in the second parameter. The comparison yields 0h for
718 /// false, FFFFFFFFFFFFFFFFh for true.
720 /// \headerfile <x86intrin.h>
722 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
725 /// \param __a A 128-bit vector of [2 x double]. The lower double-precision value is
726 /// compared to the lower double-precision value of \a __b.
728 /// \param __b A 128-bit vector of [2 x double]. The lower double-precision value is
729 /// compared to the lower double-precision value of \a __a.
730 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
731 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
732 static __inline__ __m128d __DEFAULT_FN_ATTRS
733 _mm_cmple_sd(__m128d __a, __m128d __b)
735 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
738 /// \brief Compares the lower double-precision floating-point values in each of
739 /// the two 128-bit floating-point vectors of [2 x double] to determine if
740 /// the value in the first parameter is greater than the corresponding value
741 /// in the second parameter. The comparison yields 0h for false,
742 /// FFFFFFFFFFFFFFFFh for true.
744 /// \headerfile <x86intrin.h>
746 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
749 /// \param __a A 128-bit vector of [2 x double]. The lower double-precision value is
750 /// compared to the lower double-precision value of \a __b.
752 /// \param __b A 128-bit vector of [2 x double]. The lower double-precision value is
753 /// compared to the lower double-precision value of \a __a.
754 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
755 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
756 static __inline__ __m128d __DEFAULT_FN_ATTRS
757 _mm_cmpgt_sd(__m128d __a, __m128d __b)
/* a > b is evaluated as b < a, so the builtin is called with the operands
 * swapped. The result is then assembled from element 0 of the comparison
 * and element 1 of __a, which keeps the documented upper half. */
759 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
760 return (__m128d) { __c[0], __a[1] };
763 /// \brief Compares the lower double-precision floating-point values in each of
764 /// the two 128-bit floating-point vectors of [2 x double] to determine if
765 /// the value in the first parameter is greater than or equal to the
766 /// corresponding value in the second parameter. The comparison yields 0h for
767 /// false, FFFFFFFFFFFFFFFFh for true.
769 /// \headerfile <x86intrin.h>
771 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
774 /// \param __a A 128-bit vector of [2 x double]. The lower double-precision value is
775 /// compared to the lower double-precision value of \a __b.
777 /// \param __b A 128-bit vector of [2 x double]. The lower double-precision value is
778 /// compared to the lower double-precision value of \a __a.
779 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
780 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
781 static __inline__ __m128d __DEFAULT_FN_ATTRS
782 _mm_cmpge_sd(__m128d __a, __m128d __b)
/* a >= b is evaluated as b <= a, so the builtin is called with the operands
 * swapped. The result is then assembled from element 0 of the comparison
 * and element 1 of __a, which keeps the documented upper half. */
784 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
785 return (__m128d) { __c[0], __a[1] };
788 /// \brief Compares the lower double-precision floating-point values in each of
789 /// the two 128-bit floating-point vectors of [2 x double] to determine if
790 /// the value in the first parameter is "ordered" with respect to the
791 /// corresponding value in the second parameter. The comparison yields 0h for
792 /// false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values are
793 /// "ordered" with respect to each other if neither value is a NaN.
795 /// \headerfile <x86intrin.h>
797 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
800 /// \param __a A 128-bit vector of [2 x double]. The lower double-precision value is
801 /// compared to the lower double-precision value of \a __b.
803 /// \param __b A 128-bit vector of [2 x double]. The lower double-precision value is
804 /// compared to the lower double-precision value of \a __a.
805 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
806 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
807 static __inline__ __m128d __DEFAULT_FN_ATTRS
808 _mm_cmpord_sd(__m128d __a, __m128d __b)
810 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
813 /// \brief Compares the lower double-precision floating-point values in each of
814 /// the two 128-bit floating-point vectors of [2 x double] to determine if
815 /// the value in the first parameter is "unordered" with respect to the
816 /// corresponding value in the second parameter. The comparison yields 0h
817 /// for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values
818 /// are "unordered" with respect to each other if one or both values are NaN.
820 /// \headerfile <x86intrin.h>
822 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c> instruction.
826 /// \param __a A 128-bit vector of [2 x double]. The lower double-precision value is
827 /// compared to the lower double-precision value of \a __b.
829 /// \param __b A 128-bit vector of [2 x double]. The lower double-precision value is
830 /// compared to the lower double-precision value of \a __a.
831 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
832 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
833 static __inline__ __m128d __DEFAULT_FN_ATTRS
834 _mm_cmpunord_sd(__m128d __a, __m128d __b)
836 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
839 /// \brief Compares the lower double-precision floating-point values in each of
840 /// the two 128-bit floating-point vectors of [2 x double] to determine if
841 /// the value in the first parameter is unequal to the corresponding value in
842 /// the second parameter. The comparison yields 0h for false,
843 /// FFFFFFFFFFFFFFFFh for true.
845 /// \headerfile <x86intrin.h>
847 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
850 /// A 128-bit vector of [2 x double]. The lower double-precision value is
851 /// compared to the lower double-precision value of \a __b.
853 /// A 128-bit vector of [2 x double]. The lower double-precision value is
854 /// compared to the lower double-precision value of \a __a.
855 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
856 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
857 static __inline__ __m128d __DEFAULT_FN_ATTRS
858 _mm_cmpneq_sd(__m128d __a, __m128d __b)
860 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
863 /// \brief Compares the lower double-precision floating-point values in each of
864 /// the two 128-bit floating-point vectors of [2 x double] to determine if
865 /// the value in the first parameter is not less than the corresponding
866 /// value in the second parameter. The comparison yields 0h for false,
867 /// FFFFFFFFFFFFFFFFh for true.
869 /// \headerfile <x86intrin.h>
871 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
874 /// A 128-bit vector of [2 x double]. The lower double-precision value is
875 /// compared to the lower double-precision value of \a __b.
877 /// A 128-bit vector of [2 x double]. The lower double-precision value is
878 /// compared to the lower double-precision value of \a __a.
879 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
880 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
881 static __inline__ __m128d __DEFAULT_FN_ATTRS
882 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
884 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
887 /// \brief Compares the lower double-precision floating-point values in each of
888 /// the two 128-bit floating-point vectors of [2 x double] to determine if
889 /// the value in the first parameter is not less than or equal to the
890 /// corresponding value in the second parameter. The comparison yields 0h
891 /// for false, FFFFFFFFFFFFFFFFh for true.
893 /// \headerfile <x86intrin.h>
895 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
898 /// A 128-bit vector of [2 x double]. The lower double-precision value is
899 /// compared to the lower double-precision value of \a __b.
901 /// A 128-bit vector of [2 x double]. The lower double-precision value is
902 /// compared to the lower double-precision value of \a __a.
903 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
904 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
905 static __inline__ __m128d __DEFAULT_FN_ATTRS
906 _mm_cmpnle_sd(__m128d __a, __m128d __b)
908 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
911 /// \brief Compares the lower double-precision floating-point values in each of
912 /// the two 128-bit floating-point vectors of [2 x double] to determine if
913 /// the value in the first parameter is not greater than the corresponding
914 /// value in the second parameter. The comparison yields 0h for false,
915 /// FFFFFFFFFFFFFFFFh for true.
917 /// \headerfile <x86intrin.h>
919 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
922 /// A 128-bit vector of [2 x double]. The lower double-precision value is
923 /// compared to the lower double-precision value of \a __b.
925 /// A 128-bit vector of [2 x double]. The lower double-precision value is
926 /// compared to the lower double-precision value of \a __a.
927 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
928 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
929 static __inline__ __m128d __DEFAULT_FN_ATTRS
930 _mm_cmpngt_sd(__m128d __a, __m128d __b)
932 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
933 return (__m128d) { __c[0], __a[1] };
936 /// \brief Compares the lower double-precision floating-point values in each of
937 /// the two 128-bit floating-point vectors of [2 x double] to determine if
938 /// the value in the first parameter is not greater than or equal to the
939 /// corresponding value in the second parameter. The comparison yields 0h
940 /// for false, FFFFFFFFFFFFFFFFh for true.
942 /// \headerfile <x86intrin.h>
944 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
947 /// A 128-bit vector of [2 x double]. The lower double-precision value is
948 /// compared to the lower double-precision value of \a __b.
950 /// A 128-bit vector of [2 x double]. The lower double-precision value is
951 /// compared to the lower double-precision value of \a __a.
952 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
953 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
954 static __inline__ __m128d __DEFAULT_FN_ATTRS
955 _mm_cmpnge_sd(__m128d __a, __m128d __b)
957 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
958 return (__m128d) { __c[0], __a[1] };
961 /// \brief Compares the lower double-precision floating-point values in each of
962 /// the two 128-bit floating-point vectors of [2 x double] for equality. The
963 /// comparison yields 0 for false, 1 for true.
965 /// \headerfile <x86intrin.h>
967 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
970 /// A 128-bit vector of [2 x double]. The lower double-precision value is
971 /// compared to the lower double-precision value of \a __b.
973 /// A 128-bit vector of [2 x double]. The lower double-precision value is
974 /// compared to the lower double-precision value of \a __a.
975 /// \returns An integer containing the comparison results.
976 static __inline__ int __DEFAULT_FN_ATTRS
977 _mm_comieq_sd(__m128d __a, __m128d __b)
979 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
982 /// \brief Compares the lower double-precision floating-point values in each of
983 /// the two 128-bit floating-point vectors of [2 x double] to determine if
984 /// the value in the first parameter is less than the corresponding value in
985 /// the second parameter. The comparison yields 0 for false, 1 for true.
987 /// \headerfile <x86intrin.h>
989 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
992 /// A 128-bit vector of [2 x double]. The lower double-precision value is
993 /// compared to the lower double-precision value of \a __b.
995 /// A 128-bit vector of [2 x double]. The lower double-precision value is
996 /// compared to the lower double-precision value of \a __a.
997 /// \returns An integer containing the comparison results.
998 static __inline__ int __DEFAULT_FN_ATTRS
999 _mm_comilt_sd(__m128d __a, __m128d __b)
1001 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1004 /// \brief Compares the lower double-precision floating-point values in each of
1005 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1006 /// the value in the first parameter is less than or equal to the
1007 /// corresponding value in the second parameter. The comparison yields 0 for
1008 /// false, 1 for true.
1010 /// \headerfile <x86intrin.h>
1012 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1015 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1016 /// compared to the lower double-precision value of \a __b.
1018 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1019 /// compared to the lower double-precision value of \a __a.
1020 /// \returns An integer containing the comparison results.
1021 static __inline__ int __DEFAULT_FN_ATTRS
1022 _mm_comile_sd(__m128d __a, __m128d __b)
1024 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1027 /// \brief Compares the lower double-precision floating-point values in each of
1028 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1029 /// the value in the first parameter is greater than the corresponding value
1030 /// in the second parameter. The comparison yields 0 for false, 1 for true.
1032 /// \headerfile <x86intrin.h>
1034 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1037 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1038 /// compared to the lower double-precision value of \a __b.
1040 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1041 /// compared to the lower double-precision value of \a __a.
1042 /// \returns An integer containing the comparison results.
1043 static __inline__ int __DEFAULT_FN_ATTRS
1044 _mm_comigt_sd(__m128d __a, __m128d __b)
1046 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1049 /// \brief Compares the lower double-precision floating-point values in each of
1050 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1051 /// the value in the first parameter is greater than or equal to the
1052 /// corresponding value in the second parameter. The comparison yields 0 for
1053 /// false, 1 for true.
1055 /// \headerfile <x86intrin.h>
1057 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1060 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1061 /// compared to the lower double-precision value of \a __b.
1063 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1064 /// compared to the lower double-precision value of \a __a.
1065 /// \returns An integer containing the comparison results.
1066 static __inline__ int __DEFAULT_FN_ATTRS
1067 _mm_comige_sd(__m128d __a, __m128d __b)
1069 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1072 /// \brief Compares the lower double-precision floating-point values in each of
1073 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1074 /// the value in the first parameter is unequal to the corresponding value in
1075 /// the second parameter. The comparison yields 0 for false, 1 for true.
1077 /// \headerfile <x86intrin.h>
1079 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1082 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1083 /// compared to the lower double-precision value of \a __b.
1085 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1086 /// compared to the lower double-precision value of \a __a.
1087 /// \returns An integer containing the comparison results.
1088 static __inline__ int __DEFAULT_FN_ATTRS
1089 _mm_comineq_sd(__m128d __a, __m128d __b)
1091 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1094 /// \brief Compares the lower double-precision floating-point values in each of
1095 /// the two 128-bit floating-point vectors of [2 x double] for equality. The
1096 /// comparison yields 0 for false, 1 for true. If either of the two lower
/// double-precision values is NaN, 0 is returned.
1099 /// \headerfile <x86intrin.h>
1101 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1104 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1105 /// compared to the lower double-precision value of \a __b.
1107 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1108 /// compared to the lower double-precision value of \a __a.
1109 /// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 0 is returned.
1111 static __inline__ int __DEFAULT_FN_ATTRS
1112 _mm_ucomieq_sd(__m128d __a, __m128d __b)
1114 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1117 /// \brief Compares the lower double-precision floating-point values in each of
1118 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1119 /// the value in the first parameter is less than the corresponding value in
1120 /// the second parameter. The comparison yields 0 for false, 1 for true. If
/// either of the two lower double-precision values is NaN, 0 is returned.
1123 /// \headerfile <x86intrin.h>
1125 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1128 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1129 /// compared to the lower double-precision value of \a __b.
1131 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1132 /// compared to the lower double-precision value of \a __a.
1133 /// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 0 is returned.
1135 static __inline__ int __DEFAULT_FN_ATTRS
1136 _mm_ucomilt_sd(__m128d __a, __m128d __b)
1138 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1141 /// \brief Compares the lower double-precision floating-point values in each of
1142 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1143 /// the value in the first parameter is less than or equal to the
1144 /// corresponding value in the second parameter. The comparison yields 0 for
1145 /// false, 1 for true. If either of the two lower double-precision values is
/// NaN, 0 is returned.
1148 /// \headerfile <x86intrin.h>
1150 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1153 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1154 /// compared to the lower double-precision value of \a __b.
1156 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1157 /// compared to the lower double-precision value of \a __a.
1158 /// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 0 is returned.
1160 static __inline__ int __DEFAULT_FN_ATTRS
1161 _mm_ucomile_sd(__m128d __a, __m128d __b)
1163 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1166 /// \brief Compares the lower double-precision floating-point values in each of
1167 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1168 /// the value in the first parameter is greater than the corresponding value
1169 /// in the second parameter. The comparison yields 0 for false, 1 for true.
1170 /// If either of the two lower double-precision values is NaN, 0 is returned.
1172 /// \headerfile <x86intrin.h>
1174 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1177 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1178 /// compared to the lower double-precision value of \a __b.
1180 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1181 /// compared to the lower double-precision value of \a __a.
1182 /// \returns An integer containing the comparison results. If either of the two
1183 /// lower double-precision values is NaN, 0 is returned.
1184 static __inline__ int __DEFAULT_FN_ATTRS
1185 _mm_ucomigt_sd(__m128d __a, __m128d __b)
1187 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1190 /// \brief Compares the lower double-precision floating-point values in each of
1191 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1192 /// the value in the first parameter is greater than or equal to the
1193 /// corresponding value in the second parameter. The comparison yields 0 for
1194 /// false, 1 for true. If either of the two lower double-precision values
1195 /// is NaN, 0 is returned.
1197 /// \headerfile <x86intrin.h>
1199 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1202 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1203 /// compared to the lower double-precision value of \a __b.
1205 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1206 /// compared to the lower double-precision value of \a __a.
1207 /// \returns An integer containing the comparison results. If either of the two
1208 /// lower double-precision values is NaN, 0 is returned.
1209 static __inline__ int __DEFAULT_FN_ATTRS
1210 _mm_ucomige_sd(__m128d __a, __m128d __b)
1212 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1215 /// \brief Compares the lower double-precision floating-point values in each of
1216 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1217 /// the value in the first parameter is unequal to the corresponding value in
1218 /// the second parameter. The comparison yields 0 for false, 1 for true. If
/// either of the two lower double-precision values is NaN, 1 is returned.
1221 /// \headerfile <x86intrin.h>
1223 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1226 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1227 /// compared to the lower double-precision value of \a __b.
1229 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1230 /// compared to the lower double-precision value of \a __a.
1231 /// \returns An integer containing the comparison result. If either of the two
/// lower double-precision values is NaN, 1 is returned.
1233 static __inline__ int __DEFAULT_FN_ATTRS
1234 _mm_ucomineq_sd(__m128d __a, __m128d __b)
1236 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1239 /// \brief Converts the two double-precision floating-point elements of a
1240 /// 128-bit vector of [2 x double] into two single-precision floating-point
1241 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1242 /// The upper 64 bits of the result vector are set to zero.
1244 /// \headerfile <x86intrin.h>
1246 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1249 /// A 128-bit vector of [2 x double].
1250 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1251 /// converted values. The upper 64 bits are set to zero.
1252 static __inline__ __m128 __DEFAULT_FN_ATTRS
1253 _mm_cvtpd_ps(__m128d __a)
1255 return __builtin_ia32_cvtpd2ps((__v2df)__a);
1258 /// \brief Converts the lower two single-precision floating-point elements of a
1259 /// 128-bit vector of [4 x float] into two double-precision floating-point
1260 /// values, returned in a 128-bit vector of [2 x double]. The upper two
1261 /// elements of the input vector are unused.
1263 /// \headerfile <x86intrin.h>
1265 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1268 /// A 128-bit vector of [4 x float]. The lower two single-precision
1269 /// floating-point elements are converted to double-precision values. The
1270 /// upper two elements are unused.
1271 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1272 static __inline__ __m128d __DEFAULT_FN_ATTRS
1273 _mm_cvtps_pd(__m128 __a)
1275 return (__m128d) __builtin_convertvector(
1276 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1279 /// \brief Converts the lower two integer elements of a 128-bit vector of
1280 /// [4 x i32] into two double-precision floating-point values, returned in a
1281 /// 128-bit vector of [2 x double]. The upper two elements of the input
1282 /// vector are unused.
1284 /// \headerfile <x86intrin.h>
1286 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1289 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1290 /// converted to double-precision values. The upper two elements are unused.
1291 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1292 static __inline__ __m128d __DEFAULT_FN_ATTRS
1293 _mm_cvtepi32_pd(__m128i __a)
1295 return (__m128d) __builtin_convertvector(
1296 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1299 /// \brief Converts the two double-precision floating-point elements of a
1300 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1301 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1302 /// 64 bits of the result vector are set to zero.
1304 /// \headerfile <x86intrin.h>
1306 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1309 /// A 128-bit vector of [2 x double].
1310 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1311 /// converted values. The upper 64 bits are set to zero.
1312 static __inline__ __m128i __DEFAULT_FN_ATTRS
1313 _mm_cvtpd_epi32(__m128d __a)
1315 return __builtin_ia32_cvtpd2dq((__v2df)__a);
1318 /// \brief Converts the low-order element of a 128-bit vector of [2 x double]
1319 /// into a 32-bit signed integer value.
1321 /// \headerfile <x86intrin.h>
1323 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
/// conversion.
1328 /// \returns A 32-bit signed integer containing the converted value.
1329 static __inline__ int __DEFAULT_FN_ATTRS
1330 _mm_cvtsd_si32(__m128d __a)
1332 return __builtin_ia32_cvtsd2si((__v2df)__a);
1335 /// \brief Converts the lower double-precision floating-point element of a
1336 /// 128-bit vector of [2 x double], in the second parameter, into a
1337 /// single-precision floating-point value, returned in the lower 32 bits of a
1338 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1339 /// copied from the upper 96 bits of the first parameter.
1341 /// \headerfile <x86intrin.h>
1343 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1346 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1347 /// copied to the upper 96 bits of the result.
1349 /// A 128-bit vector of [2 x double]. The lower double-precision
1350 /// floating-point element is used in the conversion.
1351 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1352 /// converted value from the second parameter. The upper 96 bits are copied
1353 /// from the upper 96 bits of the first parameter.
1354 static __inline__ __m128 __DEFAULT_FN_ATTRS
1355 _mm_cvtsd_ss(__m128 __a, __m128d __b)
1357 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1360 /// \brief Converts a 32-bit signed integer value, in the second parameter, into
1361 /// a double-precision floating-point value, returned in the lower 64 bits of
1362 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1363 /// are copied from the upper 64 bits of the first parameter.
1365 /// \headerfile <x86intrin.h>
1367 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1370 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1371 /// copied to the upper 64 bits of the result.
1373 /// A 32-bit signed integer containing the value to be converted.
1374 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1375 /// converted value from the second parameter. The upper 64 bits are copied
1376 /// from the upper 64 bits of the first parameter.
1377 static __inline__ __m128d __DEFAULT_FN_ATTRS
1378 _mm_cvtsi32_sd(__m128d __a, int __b)
1384 /// \brief Converts the lower single-precision floating-point element of a
1385 /// 128-bit vector of [4 x float], in the second parameter, into a
1386 /// double-precision floating-point value, returned in the lower 64 bits of
1387 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1388 /// are copied from the upper 64 bits of the first parameter.
1390 /// \headerfile <x86intrin.h>
1392 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1395 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1396 /// copied to the upper 64 bits of the result.
1398 /// A 128-bit vector of [4 x float]. The lower single-precision
1399 /// floating-point element is used in the conversion.
1400 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1401 /// converted value from the second parameter. The upper 64 bits are copied
1402 /// from the upper 64 bits of the first parameter.
1403 static __inline__ __m128d __DEFAULT_FN_ATTRS
1404 _mm_cvtss_sd(__m128d __a, __m128 __b)
1410 /// \brief Converts the two double-precision floating-point elements of a
1411 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1412 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. If the
1413 /// result of either conversion is inexact, the result is truncated (rounded
1414 /// towards zero) regardless of the current MXCSR setting. The upper 64 bits
1415 /// of the result vector are set to zero.
1417 /// \headerfile <x86intrin.h>
/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
/// instruction.
1423 /// A 128-bit vector of [2 x double].
1424 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1425 /// converted values. The upper 64 bits are set to zero.
1426 static __inline__ __m128i __DEFAULT_FN_ATTRS
1427 _mm_cvttpd_epi32(__m128d __a)
1429 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1432 /// \brief Converts the low-order element of a [2 x double] vector into a 32-bit
1433 /// signed integer value, truncating the result when it is inexact.
1435 /// \headerfile <x86intrin.h>
/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
/// instruction.
/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
/// conversion.
1443 /// \returns A 32-bit signed integer containing the converted value.
1444 static __inline__ int __DEFAULT_FN_ATTRS
1445 _mm_cvttsd_si32(__m128d __a)
1447 return __builtin_ia32_cvttsd2si((__v2df)__a);
1450 /// \brief Converts the two double-precision floating-point elements of a
1451 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1452 /// returned in a 64-bit vector of [2 x i32].
1454 /// \headerfile <x86intrin.h>
1456 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1459 /// A 128-bit vector of [2 x double].
1460 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1461 static __inline__ __m64 __DEFAULT_FN_ATTRS
1462 _mm_cvtpd_pi32(__m128d __a)
1464 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1467 /// \brief Converts the two double-precision floating-point elements of a
1468 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1469 /// returned in a 64-bit vector of [2 x i32]. If the result of either
1470 /// conversion is inexact, the result is truncated (rounded towards zero)
1471 /// regardless of the current MXCSR setting.
1473 /// \headerfile <x86intrin.h>
1475 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1478 /// A 128-bit vector of [2 x double].
1479 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1480 static __inline__ __m64 __DEFAULT_FN_ATTRS
1481 _mm_cvttpd_pi32(__m128d __a)
1483 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1486 /// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of
1487 /// [2 x i32] into two double-precision floating-point values, returned in a
1488 /// 128-bit vector of [2 x double].
1490 /// \headerfile <x86intrin.h>
1492 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1495 /// A 64-bit vector of [2 x i32].
1496 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1497 static __inline__ __m128d __DEFAULT_FN_ATTRS
1498 _mm_cvtpi32_pd(__m64 __a)
1500 return __builtin_ia32_cvtpi2pd((__v2si)__a);
1503 /// \brief Returns the low-order element of a 128-bit vector of [2 x double] as
1504 /// a double-precision floating-point value.
1506 /// \headerfile <x86intrin.h>
1508 /// This intrinsic has no corresponding instruction.
1511 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
/// \returns A double-precision floating-point value copied from the lower 64
/// bits of \a __a.
1514 static __inline__ double __DEFAULT_FN_ATTRS
1515 _mm_cvtsd_f64(__m128d __a)
1520 /// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned
1521 /// memory location.
1523 /// \headerfile <x86intrin.h>
1525 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1528 /// A pointer to a 128-bit memory location. The address of the memory
1529 /// location has to be 16-byte aligned.
1530 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1531 static __inline__ __m128d __DEFAULT_FN_ATTRS
1532 _mm_load_pd(double const *__dp)
1534 return *(__m128d*)__dp;
1537 /// \brief Loads a double-precision floating-point value from a specified memory
/// location and duplicates it to both vector elements of a 128-bit vector of
/// [2 x double].
1541 /// \headerfile <x86intrin.h>
1543 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1546 /// A pointer to a memory location containing a double-precision value.
1547 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1548 /// duplicated values.
1549 static __inline__ __m128d __DEFAULT_FN_ATTRS
1550 _mm_load1_pd(double const *__dp)
1552 struct __mm_load1_pd_struct {
1554 } __attribute__((__packed__, __may_alias__));
1555 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
1556 return (__m128d){ __u, __u };
/* Alternate spelling of _mm_load1_pd; expands to the same call. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
1561 /// \brief Loads two double-precision values, in reverse order, from an aligned
1562 /// memory location into a 128-bit vector of [2 x double].
1564 /// \headerfile <x86intrin.h>
1566 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1567 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1568 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1571 /// A 16-byte aligned pointer to an array of double-precision values to be
1572 /// loaded in reverse order.
/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
/// values.
1575 static __inline__ __m128d __DEFAULT_FN_ATTRS
1576 _mm_loadr_pd(double const *__dp)
1578 __m128d __u = *(__m128d*)__dp;
1579 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1582 /// \brief Loads a 128-bit floating-point vector of [2 x double] from an
1583 /// unaligned memory location.
1585 /// \headerfile <x86intrin.h>
1587 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1590 /// A pointer to a 128-bit memory location. The address of the memory
1591 /// location does not have to be aligned.
1592 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1593 static __inline__ __m128d __DEFAULT_FN_ATTRS
1594 _mm_loadu_pd(double const *__dp)
1598 } __attribute__((__packed__, __may_alias__));
1599 return ((struct __loadu_pd*)__dp)->__v;
1602 static __inline__ __m128i __DEFAULT_FN_ATTRS
1603 _mm_loadu_si64(void const *__a)
1605 struct __loadu_si64 {
1607 } __attribute__((__packed__, __may_alias__));
1608 long long __u = ((struct __loadu_si64*)__a)->__v;
1609 return (__m128i){__u, 0L};
1612 static __inline__ __m128d __DEFAULT_FN_ATTRS
1613 _mm_load_sd(double const *__dp)
1615 struct __mm_load_sd_struct {
1617 } __attribute__((__packed__, __may_alias__));
1618 double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
1619 return (__m128d){ __u, 0 };
1622 /// \brief Loads a double-precision value into the high-order bits of a 128-bit
1623 /// vector of [2 x double]. The low-order bits are copied from the low-order
1624 /// bits of the first operand.
1626 /// \headerfile <x86intrin.h>
1628 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1631 /// A 128-bit vector of [2 x double]. \n
1632 /// Bits [63:0] are written to bits [63:0] of the result.
1634 /// A pointer to a 64-bit memory location containing a double-precision
1635 /// floating-point value that is loaded. The loaded value is written to bits
1636 /// [127:64] of the result. The address of the memory location does not have
1638 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1639 static __inline__ __m128d __DEFAULT_FN_ATTRS
1640 _mm_loadh_pd(__m128d __a, double const *__dp)
1642 struct __mm_loadh_pd_struct {
1644 } __attribute__((__packed__, __may_alias__));
1645 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
1646 return (__m128d){ __a[0], __u };
1649 /// \brief Loads a double-precision value into the low-order bits of a 128-bit
1650 /// vector of [2 x double]. The high-order bits are copied from the
1651 /// high-order bits of the first operand.
1653 /// \headerfile <x86intrin.h>
1655 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1658 /// A 128-bit vector of [2 x double]. \n
1659 /// Bits [127:64] are written to bits [127:64] of the result.
1661 /// A pointer to a 64-bit memory location containing a double-precision
1662 /// floating-point value that is loaded. The loaded value is written to bits
1663 /// [63:0] of the result. The address of the memory location does not have to
1665 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1666 static __inline__ __m128d __DEFAULT_FN_ATTRS
1667 _mm_loadl_pd(__m128d __a, double const *__dp)
1669 struct __mm_loadl_pd_struct {
1671 } __attribute__((__packed__, __may_alias__));
1672 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
1673 return (__m128d){ __u, __a[1] };
1676 /// \brief Constructs a 128-bit floating-point vector of [2 x double] with
1677 /// unspecified content. This could be used as an argument to another
1678 /// intrinsic function where the argument is required but the value is not
1681 /// \headerfile <x86intrin.h>
1683 /// This intrinsic has no corresponding instruction.
1685 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1687 static __inline__ __m128d __DEFAULT_FN_ATTRS
1688 _mm_undefined_pd(void)
1690 return (__m128d)__builtin_ia32_undef128();
1693 /// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
1694 /// 64 bits of the vector are initialized with the specified double-precision
1695 /// floating-point value. The upper 64 bits are set to zero.
1697 /// \headerfile <x86intrin.h>
1699 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1702 /// A double-precision floating-point value used to initialize the lower 64
1703 /// bits of the result.
1704 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1705 /// lower 64 bits contain the value of the parameter. The upper 64 bits are
1707 static __inline__ __m128d __DEFAULT_FN_ATTRS
1708 _mm_set_sd(double __w)
1710 return (__m128d){ __w, 0 };
1713 /// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
1714 /// of the two double-precision floating-point vector elements set to the
1715 /// specified double-precision floating-point value.
1717 /// \headerfile <x86intrin.h>
1719 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1722 /// A double-precision floating-point value used to initialize each vector
1723 /// element of the result.
1724 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1725 static __inline__ __m128d __DEFAULT_FN_ATTRS
1726 _mm_set1_pd(double __w)
1728 return (__m128d){ __w, __w };
1731 /// \brief Constructs a 128-bit floating-point vector of [2 x double]
1732 /// initialized with the specified double-precision floating-point values.
1734 /// \headerfile <x86intrin.h>
1736 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1739 /// A double-precision floating-point value used to initialize the upper 64
1740 /// bits of the result.
1742 /// A double-precision floating-point value used to initialize the lower 64
1743 /// bits of the result.
1744 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1745 static __inline__ __m128d __DEFAULT_FN_ATTRS
1746 _mm_set_pd(double __w, double __x)
1748 return (__m128d){ __x, __w };
1751 /// \brief Constructs a 128-bit floating-point vector of [2 x double],
1752 /// initialized in reverse order with the specified double-precision
1753 /// floating-point values.
1755 /// \headerfile <x86intrin.h>
1757 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1760 /// A double-precision floating-point value used to initialize the lower 64
1761 /// bits of the result.
1763 /// A double-precision floating-point value used to initialize the upper 64
1764 /// bits of the result.
1765 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1766 static __inline__ __m128d __DEFAULT_FN_ATTRS
1767 _mm_setr_pd(double __w, double __x)
1769 return (__m128d){ __w, __x };
1772 /// \brief Constructs a 128-bit floating-point vector of [2 x double]
1773 /// initialized to zero.
1775 /// \headerfile <x86intrin.h>
1777 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1779 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1780 /// all elements set to zero.
1781 static __inline__ __m128d __DEFAULT_FN_ATTRS
1782 _mm_setzero_pd(void)
1784 return (__m128d){ 0, 0 };
1787 /// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
1788 /// 64 bits are set to the lower 64 bits of the second parameter. The upper
1789 /// 64 bits are set to the upper 64 bits of the first parameter.
1791 /// \headerfile <x86intrin.h>
1793 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1796 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1797 /// upper 64 bits of the result.
1799 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1800 /// lower 64 bits of the result.
1801 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1802 static __inline__ __m128d __DEFAULT_FN_ATTRS
1803 _mm_move_sd(__m128d __a, __m128d __b)
1805 return (__m128d){ __b[0], __a[1] };
1808 /// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1809 /// memory location.
1811 /// \headerfile <x86intrin.h>
1813 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1816 /// A pointer to a 64-bit memory location.
1818 /// A 128-bit vector of [2 x double] containing the value to be stored.
1819 static __inline__ void __DEFAULT_FN_ATTRS
1820 _mm_store_sd(double *__dp, __m128d __a)
1822 struct __mm_store_sd_struct {
1824 } __attribute__((__packed__, __may_alias__));
1825 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
1828 static __inline__ void __DEFAULT_FN_ATTRS
1829 _mm_store_pd(double *__dp, __m128d __a)
1831 *(__m128d*)__dp = __a;
1834 static __inline__ void __DEFAULT_FN_ATTRS
1835 _mm_store1_pd(double *__dp, __m128d __a)
1837 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1838 _mm_store_pd(__dp, __a);
1841 /// \brief Stores a 128-bit vector of [2 x double] into an aligned memory
1844 /// \headerfile <x86intrin.h>
1846 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1849 /// A pointer to a 128-bit memory location. The address of the memory
1850 /// location has to be 16-byte aligned.
1852 /// A 128-bit vector of [2 x double] containing the values to be stored.
1853 static __inline__ void __DEFAULT_FN_ATTRS
1854 _mm_store_pd1(double *__dp, __m128d __a)
1856 return _mm_store1_pd(__dp, __a);
1859 /// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory
1862 /// \headerfile <x86intrin.h>
1864 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1867 /// A pointer to a 128-bit memory location. The address of the memory
1868 /// location does not have to be aligned.
1870 /// A 128-bit vector of [2 x double] containing the values to be stored.
1871 static __inline__ void __DEFAULT_FN_ATTRS
1872 _mm_storeu_pd(double *__dp, __m128d __a)
1874 struct __storeu_pd {
1876 } __attribute__((__packed__, __may_alias__));
1877 ((struct __storeu_pd*)__dp)->__v = __a;
1880 /// \brief Stores two double-precision values, in reverse order, from a 128-bit
1881 /// vector of [2 x double] to a 16-byte aligned memory location.
1883 /// \headerfile <x86intrin.h>
1885 /// This intrinsic corresponds to a shuffling instruction followed by a
1886 /// <c> VMOVAPD / MOVAPD </c> instruction.
1889 /// A pointer to a 16-byte aligned memory location that can store two
1890 /// double-precision values.
1892 /// A 128-bit vector of [2 x double] containing the values to be reversed and
1894 static __inline__ void __DEFAULT_FN_ATTRS
1895 _mm_storer_pd(double *__dp, __m128d __a)
1897 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1898 *(__m128d *)__dp = __a;
1901 /// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1902 /// memory location.
1904 /// \headerfile <x86intrin.h>
1906 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1909 /// A pointer to a 64-bit memory location.
1911 /// A 128-bit vector of [2 x double] containing the value to be stored.
1912 static __inline__ void __DEFAULT_FN_ATTRS
1913 _mm_storeh_pd(double *__dp, __m128d __a)
1915 struct __mm_storeh_pd_struct {
1917 } __attribute__((__packed__, __may_alias__));
1918 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
1921 /// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1922 /// memory location.
1924 /// \headerfile <x86intrin.h>
1926 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1929 /// A pointer to a 64-bit memory location.
1931 /// A 128-bit vector of [2 x double] containing the value to be stored.
1932 static __inline__ void __DEFAULT_FN_ATTRS
1933 _mm_storel_pd(double *__dp, __m128d __a)
1935 struct __mm_storeh_pd_struct {
1937 } __attribute__((__packed__, __may_alias__));
1938 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
1941 /// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8],
1942 /// saving the lower 8 bits of each sum in the corresponding element of a
1943 /// 128-bit result vector of [16 x i8]. The integer elements of both
1944 /// parameters can be either signed or unsigned.
1946 /// \headerfile <x86intrin.h>
1948 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
1951 /// A 128-bit vector of [16 x i8].
1953 /// A 128-bit vector of [16 x i8].
1954 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
1956 static __inline__ __m128i __DEFAULT_FN_ATTRS
1957 _mm_add_epi8(__m128i __a, __m128i __b)
1959 return (__m128i)((__v16qu)__a + (__v16qu)__b);
1962 /// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16],
1963 /// saving the lower 16 bits of each sum in the corresponding element of a
1964 /// 128-bit result vector of [8 x i16]. The integer elements of both
1965 /// parameters can be either signed or unsigned.
1967 /// \headerfile <x86intrin.h>
1969 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
1972 /// A 128-bit vector of [8 x i16].
1974 /// A 128-bit vector of [8 x i16].
1975 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
1977 static __inline__ __m128i __DEFAULT_FN_ATTRS
1978 _mm_add_epi16(__m128i __a, __m128i __b)
1980 return (__m128i)((__v8hu)__a + (__v8hu)__b);
1983 /// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32],
1984 /// saving the lower 32 bits of each sum in the corresponding element of a
1985 /// 128-bit result vector of [4 x i32]. The integer elements of both
1986 /// parameters can be either signed or unsigned.
1988 /// \headerfile <x86intrin.h>
1990 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
1993 /// A 128-bit vector of [4 x i32].
1995 /// A 128-bit vector of [4 x i32].
1996 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
1998 static __inline__ __m128i __DEFAULT_FN_ATTRS
1999 _mm_add_epi32(__m128i __a, __m128i __b)
2001 return (__m128i)((__v4su)__a + (__v4su)__b);
2004 /// \brief Adds two signed or unsigned 64-bit integer values, returning the
2005 /// lower 64 bits of the sum.
2007 /// \headerfile <x86intrin.h>
2009 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2012 /// A 64-bit integer.
2014 /// A 64-bit integer.
2015 /// \returns A 64-bit integer containing the sum of both parameters.
2016 static __inline__ __m64 __DEFAULT_FN_ATTRS
2017 _mm_add_si64(__m64 __a, __m64 __b)
2019 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2022 /// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2023 /// saving the lower 64 bits of each sum in the corresponding element of a
2024 /// 128-bit result vector of [2 x i64]. The integer elements of both
2025 /// parameters can be either signed or unsigned.
2027 /// \headerfile <x86intrin.h>
2029 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2032 /// A 128-bit vector of [2 x i64].
2034 /// A 128-bit vector of [2 x i64].
2035 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2037 static __inline__ __m128i __DEFAULT_FN_ATTRS
2038 _mm_add_epi64(__m128i __a, __m128i __b)
2040 return (__m128i)((__v2du)__a + (__v2du)__b);
2043 /// \brief Adds, with saturation, the corresponding elements of two 128-bit
2044 /// signed [16 x i8] vectors, saving each sum in the corresponding element of
2045 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are
2046 /// saturated to 7Fh. Negative sums less than 80h are saturated to 80h.
2048 /// \headerfile <x86intrin.h>
2050 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2053 /// A 128-bit signed [16 x i8] vector.
2055 /// A 128-bit signed [16 x i8] vector.
2056 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2057 /// both parameters.
2058 static __inline__ __m128i __DEFAULT_FN_ATTRS
2059 _mm_adds_epi8(__m128i __a, __m128i __b)
2061 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
2064 /// \brief Adds, with saturation, the corresponding elements of two 128-bit
2065 /// signed [8 x i16] vectors, saving each sum in the corresponding element of
2066 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh
2067 /// are saturated to 7FFFh. Negative sums less than 8000h are saturated to
2070 /// \headerfile <x86intrin.h>
2072 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2075 /// A 128-bit signed [8 x i16] vector.
2077 /// A 128-bit signed [8 x i16] vector.
2078 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2079 /// both parameters.
2080 static __inline__ __m128i __DEFAULT_FN_ATTRS
2081 _mm_adds_epi16(__m128i __a, __m128i __b)
2083 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
2086 /// \brief Adds, with saturation, the corresponding elements of two 128-bit
2087 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2088 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than FFh
2089 /// are saturated to FFh. Negative sums are saturated to 00h.
2091 /// \headerfile <x86intrin.h>
2093 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2096 /// A 128-bit unsigned [16 x i8] vector.
2098 /// A 128-bit unsigned [16 x i8] vector.
2099 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2100 /// of both parameters.
2101 static __inline__ __m128i __DEFAULT_FN_ATTRS
2102 _mm_adds_epu8(__m128i __a, __m128i __b)
2104 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
2107 /// \brief Adds, with saturation, the corresponding elements of two 128-bit
2108 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2109 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than FFFFh
2110 /// are saturated to FFFFh. Negative sums are saturated to 0000h.
2112 /// \headerfile <x86intrin.h>
/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2117 /// A 128-bit unsigned [8 x i16] vector.
2119 /// A 128-bit unsigned [8 x i16] vector.
2120 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2121 /// of both parameters.
2122 static __inline__ __m128i __DEFAULT_FN_ATTRS
2123 _mm_adds_epu16(__m128i __a, __m128i __b)
2125 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
/// \brief Computes the rounded averages of corresponding elements of two
2129 /// 128-bit unsigned [16 x i8] vectors, saving each result in the
2130 /// corresponding element of a 128-bit result vector of [16 x i8].
2132 /// \headerfile <x86intrin.h>
2134 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2137 /// A 128-bit unsigned [16 x i8] vector.
2139 /// A 128-bit unsigned [16 x i8] vector.
2140 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2141 /// averages of both parameters.
2142 static __inline__ __m128i __DEFAULT_FN_ATTRS
2143 _mm_avg_epu8(__m128i __a, __m128i __b)
2145 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
/// \brief Computes the rounded averages of corresponding elements of two
2149 /// 128-bit unsigned [8 x i16] vectors, saving each result in the
2150 /// corresponding element of a 128-bit result vector of [8 x i16].
2152 /// \headerfile <x86intrin.h>
2154 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2157 /// A 128-bit unsigned [8 x i16] vector.
2159 /// A 128-bit unsigned [8 x i16] vector.
2160 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2161 /// averages of both parameters.
2162 static __inline__ __m128i __DEFAULT_FN_ATTRS
2163 _mm_avg_epu16(__m128i __a, __m128i __b)
2165 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2168 /// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2169 /// vectors, producing eight intermediate 32-bit signed integer products, and
2170 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2171 /// [4 x i32] vector. For example, bits [15:0] of both parameters are
2172 /// multiplied producing a 32-bit product, bits [31:16] of both parameters
2173 /// are multiplied producing a 32-bit product, and the sum of those two
2174 /// products becomes bits [31:0] of the result.
2176 /// \headerfile <x86intrin.h>
2178 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2181 /// A 128-bit signed [8 x i16] vector.
2183 /// A 128-bit signed [8 x i16] vector.
2184 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2185 /// of both parameters.
2186 static __inline__ __m128i __DEFAULT_FN_ATTRS
2187 _mm_madd_epi16(__m128i __a, __m128i __b)
2189 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2192 /// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
2193 /// vectors, saving the greater value from each comparison in the
2194 /// corresponding element of a 128-bit result vector of [8 x i16].
2196 /// \headerfile <x86intrin.h>
2198 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2201 /// A 128-bit signed [8 x i16] vector.
2203 /// A 128-bit signed [8 x i16] vector.
2204 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2205 /// each comparison.
2206 static __inline__ __m128i __DEFAULT_FN_ATTRS
2207 _mm_max_epi16(__m128i __a, __m128i __b)
2209 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
2212 /// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
2213 /// vectors, saving the greater value from each comparison in the
2214 /// corresponding element of a 128-bit result vector of [16 x i8].
2216 /// \headerfile <x86intrin.h>
2218 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2221 /// A 128-bit unsigned [16 x i8] vector.
2223 /// A 128-bit unsigned [16 x i8] vector.
2224 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2225 /// each comparison.
2226 static __inline__ __m128i __DEFAULT_FN_ATTRS
2227 _mm_max_epu8(__m128i __a, __m128i __b)
2229 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
2232 /// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
2233 /// vectors, saving the smaller value from each comparison in the
2234 /// corresponding element of a 128-bit result vector of [8 x i16].
2236 /// \headerfile <x86intrin.h>
2238 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2241 /// A 128-bit signed [8 x i16] vector.
2243 /// A 128-bit signed [8 x i16] vector.
2244 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2245 /// each comparison.
2246 static __inline__ __m128i __DEFAULT_FN_ATTRS
2247 _mm_min_epi16(__m128i __a, __m128i __b)
2249 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
2252 /// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
2253 /// vectors, saving the smaller value from each comparison in the
2254 /// corresponding element of a 128-bit result vector of [16 x i8].
2256 /// \headerfile <x86intrin.h>
2258 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2261 /// A 128-bit unsigned [16 x i8] vector.
2263 /// A 128-bit unsigned [16 x i8] vector.
2264 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2265 /// each comparison.
2266 static __inline__ __m128i __DEFAULT_FN_ATTRS
2267 _mm_min_epu8(__m128i __a, __m128i __b)
2269 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
2272 /// \brief Multiplies the corresponding elements of two signed [8 x i16]
2273 /// vectors, saving the upper 16 bits of each 32-bit product in the
2274 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2276 /// \headerfile <x86intrin.h>
2278 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2281 /// A 128-bit signed [8 x i16] vector.
2283 /// A 128-bit signed [8 x i16] vector.
2284 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2285 /// each of the eight 32-bit products.
2286 static __inline__ __m128i __DEFAULT_FN_ATTRS
2287 _mm_mulhi_epi16(__m128i __a, __m128i __b)
2289 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2292 /// \brief Multiplies the corresponding elements of two unsigned [8 x i16]
2293 /// vectors, saving the upper 16 bits of each 32-bit product in the
2294 /// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2296 /// \headerfile <x86intrin.h>
2298 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2301 /// A 128-bit unsigned [8 x i16] vector.
2303 /// A 128-bit unsigned [8 x i16] vector.
2304 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2305 /// of each of the eight 32-bit products.
2306 static __inline__ __m128i __DEFAULT_FN_ATTRS
2307 _mm_mulhi_epu16(__m128i __a, __m128i __b)
2309 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2312 /// \brief Multiplies the corresponding elements of two signed [8 x i16]
2313 /// vectors, saving the lower 16 bits of each 32-bit product in the
2314 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2316 /// \headerfile <x86intrin.h>
2318 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2321 /// A 128-bit signed [8 x i16] vector.
2323 /// A 128-bit signed [8 x i16] vector.
2324 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2325 /// each of the eight 32-bit products.
2326 static __inline__ __m128i __DEFAULT_FN_ATTRS
2327 _mm_mullo_epi16(__m128i __a, __m128i __b)
2329 return (__m128i)((__v8hu)__a * (__v8hu)__b);
2332 /// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
2333 /// of the two 64-bit integer vectors and returns the 64-bit unsigned
2336 /// \headerfile <x86intrin.h>
2338 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2341 /// A 64-bit integer containing one of the source operands.
2343 /// A 64-bit integer containing one of the source operands.
2344 /// \returns A 64-bit integer vector containing the product of both operands.
2345 static __inline__ __m64 __DEFAULT_FN_ATTRS
2346 _mm_mul_su32(__m64 __a, __m64 __b)
2348 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2351 /// \brief Multiplies 32-bit unsigned integer values contained in the lower
2352 /// bits of the corresponding elements of two [2 x i64] vectors, and returns
2353 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2355 /// \headerfile <x86intrin.h>
2357 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2360 /// A [2 x i64] vector containing one of the source operands.
2362 /// A [2 x i64] vector containing one of the source operands.
2363 /// \returns A [2 x i64] vector containing the product of both operands.
2364 static __inline__ __m128i __DEFAULT_FN_ATTRS
2365 _mm_mul_epu32(__m128i __a, __m128i __b)
2367 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2370 /// \brief Computes the absolute differences of corresponding 8-bit integer
2371 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
/// separately sums the second 8 absolute differences. Packs these two
2373 /// unsigned 16-bit integer sums into the upper and lower elements of a
2374 /// [2 x i64] vector.
2376 /// \headerfile <x86intrin.h>
2378 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2381 /// A 128-bit integer vector containing one of the source operands.
2383 /// A 128-bit integer vector containing one of the source operands.
2384 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2385 /// differences between both operands.
2386 static __inline__ __m128i __DEFAULT_FN_ATTRS
2387 _mm_sad_epu8(__m128i __a, __m128i __b)
2389 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2392 /// \brief Subtracts the corresponding 8-bit integer values in the operands.
2394 /// \headerfile <x86intrin.h>
2396 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2399 /// A 128-bit integer vector containing the minuends.
2401 /// A 128-bit integer vector containing the subtrahends.
2402 /// \returns A 128-bit integer vector containing the differences of the values
2403 /// in the operands.
2404 static __inline__ __m128i __DEFAULT_FN_ATTRS
2405 _mm_sub_epi8(__m128i __a, __m128i __b)
2407 return (__m128i)((__v16qu)__a - (__v16qu)__b);
2410 /// \brief Subtracts the corresponding 16-bit integer values in the operands.
2412 /// \headerfile <x86intrin.h>
2414 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2417 /// A 128-bit integer vector containing the minuends.
2419 /// A 128-bit integer vector containing the subtrahends.
2420 /// \returns A 128-bit integer vector containing the differences of the values
2421 /// in the operands.
2422 static __inline__ __m128i __DEFAULT_FN_ATTRS
2423 _mm_sub_epi16(__m128i __a, __m128i __b)
2425 return (__m128i)((__v8hu)__a - (__v8hu)__b);
2428 /// \brief Subtracts the corresponding 32-bit integer values in the operands.
2430 /// \headerfile <x86intrin.h>
2432 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2435 /// A 128-bit integer vector containing the minuends.
2437 /// A 128-bit integer vector containing the subtrahends.
2438 /// \returns A 128-bit integer vector containing the differences of the values
2439 /// in the operands.
2440 static __inline__ __m128i __DEFAULT_FN_ATTRS
2441 _mm_sub_epi32(__m128i __a, __m128i __b)
2443 return (__m128i)((__v4su)__a - (__v4su)__b);
2446 /// \brief Subtracts signed or unsigned 64-bit integer values and writes the
2447 /// difference to the corresponding bits in the destination.
2449 /// \headerfile <x86intrin.h>
2451 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2454 /// A 64-bit integer vector containing the minuend.
2456 /// A 64-bit integer vector containing the subtrahend.
2457 /// \returns A 64-bit integer vector containing the difference of the values in
2459 static __inline__ __m64 __DEFAULT_FN_ATTRS
2460 _mm_sub_si64(__m64 __a, __m64 __b)
2462 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2465 /// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
2467 /// \headerfile <x86intrin.h>
2469 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2472 /// A 128-bit integer vector containing the minuends.
2474 /// A 128-bit integer vector containing the subtrahends.
2475 /// \returns A 128-bit integer vector containing the differences of the values
2476 /// in the operands.
2477 static __inline__ __m128i __DEFAULT_FN_ATTRS
2478 _mm_sub_epi64(__m128i __a, __m128i __b)
2480 return (__m128i)((__v2du)__a - (__v2du)__b);
2483 /// \brief Subtracts corresponding 8-bit signed integer values in the input and
2484 /// returns the differences in the corresponding bytes in the destination.
2485 /// Differences greater than 7Fh are saturated to 7Fh, and differences less
2486 /// than 80h are saturated to 80h.
2488 /// \headerfile <x86intrin.h>
2490 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2493 /// A 128-bit integer vector containing the minuends.
2495 /// A 128-bit integer vector containing the subtrahends.
2496 /// \returns A 128-bit integer vector containing the differences of the values
2497 /// in the operands.
2498 static __inline__ __m128i __DEFAULT_FN_ATTRS
2499 _mm_subs_epi8(__m128i __a, __m128i __b)
2501 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
2504 /// \brief Subtracts corresponding 16-bit signed integer values in the input and
2505 /// returns the differences in the corresponding bytes in the destination.
2506 /// Differences greater than 7FFFh are saturated to 7FFFh, and values less
2507 /// than 8000h are saturated to 8000h.
2509 /// \headerfile <x86intrin.h>
2511 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2514 /// A 128-bit integer vector containing the minuends.
2516 /// A 128-bit integer vector containing the subtrahends.
2517 /// \returns A 128-bit integer vector containing the differences of the values
2518 /// in the operands.
2519 static __inline__ __m128i __DEFAULT_FN_ATTRS
2520 _mm_subs_epi16(__m128i __a, __m128i __b)
2522 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
2525 /// \brief Subtracts corresponding 8-bit unsigned integer values in the input
2526 /// and returns the differences in the corresponding bytes in the
2527 /// destination. Differences less than 00h are saturated to 00h.
2529 /// \headerfile <x86intrin.h>
2531 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2534 /// A 128-bit integer vector containing the minuends.
2536 /// A 128-bit integer vector containing the subtrahends.
2537 /// \returns A 128-bit integer vector containing the unsigned integer
2538 /// differences of the values in the operands.
2539 static __inline__ __m128i __DEFAULT_FN_ATTRS
2540 _mm_subs_epu8(__m128i __a, __m128i __b)
2542 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
2545 /// \brief Subtracts corresponding 16-bit unsigned integer values in the input
2546 /// and returns the differences in the corresponding bytes in the
2547 /// destination. Differences less than 0000h are saturated to 0000h.
2549 /// \headerfile <x86intrin.h>
2551 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2554 /// A 128-bit integer vector containing the minuends.
2556 /// A 128-bit integer vector containing the subtrahends.
2557 /// \returns A 128-bit integer vector containing the unsigned integer
2558 /// differences of the values in the operands.
2559 static __inline__ __m128i __DEFAULT_FN_ATTRS
2560 _mm_subs_epu16(__m128i __a, __m128i __b)
2562 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
2565 /// \brief Performs a bitwise AND of two 128-bit integer vectors.
2567 /// \headerfile <x86intrin.h>
2569 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2572 /// A 128-bit integer vector containing one of the source operands.
2574 /// A 128-bit integer vector containing one of the source operands.
2575 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2576 /// in both operands.
2577 static __inline__ __m128i __DEFAULT_FN_ATTRS
2578 _mm_and_si128(__m128i __a, __m128i __b)
2580 return (__m128i)((__v2du)__a & (__v2du)__b);
2583 /// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
2584 /// one's complement of the values contained in the first source operand.
2586 /// \headerfile <x86intrin.h>
2588 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2591 /// A 128-bit vector containing the left source operand. The one's complement
2592 /// of this value is used in the bitwise AND.
2594 /// A 128-bit vector containing the right source operand.
2595 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2596 /// complement of the first operand and the values in the second operand.
2597 static __inline__ __m128i __DEFAULT_FN_ATTRS
2598 _mm_andnot_si128(__m128i __a, __m128i __b)
2600 return (__m128i)(~(__v2du)__a & (__v2du)__b);
2602 /// \brief Performs a bitwise OR of two 128-bit integer vectors.
2604 /// \headerfile <x86intrin.h>
2606 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2609 /// A 128-bit integer vector containing one of the source operands.
2611 /// A 128-bit integer vector containing one of the source operands.
2612 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2613 /// in both operands.
2614 static __inline__ __m128i __DEFAULT_FN_ATTRS
2615 _mm_or_si128(__m128i __a, __m128i __b)
2617 return (__m128i)((__v2du)__a | (__v2du)__b);
2620 /// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
2622 /// \headerfile <x86intrin.h>
2624 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2627 /// A 128-bit integer vector containing one of the source operands.
2629 /// A 128-bit integer vector containing one of the source operands.
2630 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2631 /// values in both operands.
2632 static __inline__ __m128i __DEFAULT_FN_ATTRS
2633 _mm_xor_si128(__m128i __a, __m128i __b)
2635 return (__m128i)((__v2du)__a ^ (__v2du)__b);
/// \brief Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a. When any of bits [7:4] of the value (narrowed to char) is set,
///    every byte of the result is taken from the zero vector.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) __extension__ ({ \
  /* Shuffle indices 0-15 select zero bytes, 16-31 select bytes of (a). */ \
  (__m128i)__builtin_shufflevector( \
      (__v16qi)_mm_setzero_si128(), \
      (__v16qi)(__m128i)(a), \
      (((char)(imm)) & 0xF0) ? 0 : (16 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 1 : (17 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 2 : (18 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 3 : (19 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 4 : (20 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 5 : (21 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 6 : (22 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 7 : (23 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 8 : (24 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 9 : (25 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 10 : (26 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 11 : (27 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 12 : (28 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 13 : (29 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 14 : (30 - (char)(imm)), \
      (((char)(imm)) & 0xF0) ? 15 : (31 - (char)(imm))); })
/* Alternate spelling of _mm_slli_si128; forwards both arguments unchanged. */
#define _mm_bslli_si128(a, imm) _mm_slli_si128((a), (imm))
2679 /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
2680 /// by the specified number of bits. Low-order bits are cleared.
2682 /// \headerfile <x86intrin.h>
2684 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2687 /// A 128-bit integer vector containing the source operand.
2689 /// An integer value specifying the number of bits to left-shift each value
2690 /// in operand \a __a.
2691 /// \returns A 128-bit integer vector containing the left-shifted values.
2692 static __inline__ __m128i __DEFAULT_FN_ATTRS
2693 _mm_slli_epi16(__m128i __a, int __count)
2695 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2698 /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
2699 /// by the specified number of bits. Low-order bits are cleared.
2701 /// \headerfile <x86intrin.h>
2703 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2706 /// A 128-bit integer vector containing the source operand.
2708 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2709 /// to left-shift each value in operand \a __a.
2710 /// \returns A 128-bit integer vector containing the left-shifted values.
2711 static __inline__ __m128i __DEFAULT_FN_ATTRS
2712 _mm_sll_epi16(__m128i __a, __m128i __count)
2714 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2717 /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
2718 /// by the specified number of bits. Low-order bits are cleared.
2720 /// \headerfile <x86intrin.h>
2722 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2725 /// A 128-bit integer vector containing the source operand.
2727 /// An integer value specifying the number of bits to left-shift each value
2728 /// in operand \a __a.
2729 /// \returns A 128-bit integer vector containing the left-shifted values.
2730 static __inline__ __m128i __DEFAULT_FN_ATTRS
2731 _mm_slli_epi32(__m128i __a, int __count)
2733 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2736 /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
2737 /// by the specified number of bits. Low-order bits are cleared.
2739 /// \headerfile <x86intrin.h>
2741 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2744 /// A 128-bit integer vector containing the source operand.
2746 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2747 /// to left-shift each value in operand \a __a.
2748 /// \returns A 128-bit integer vector containing the left-shifted values.
2749 static __inline__ __m128i __DEFAULT_FN_ATTRS
2750 _mm_sll_epi32(__m128i __a, __m128i __count)
2752 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2755 /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
2756 /// by the specified number of bits. Low-order bits are cleared.
2758 /// \headerfile <x86intrin.h>
2760 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2763 /// A 128-bit integer vector containing the source operand.
2765 /// An integer value specifying the number of bits to left-shift each value
2766 /// in operand \a __a.
2767 /// \returns A 128-bit integer vector containing the left-shifted values.
2768 static __inline__ __m128i __DEFAULT_FN_ATTRS
2769 _mm_slli_epi64(__m128i __a, int __count)
2771 return __builtin_ia32_psllqi128((__v2di)__a, __count);
2774 /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
2775 /// by the specified number of bits. Low-order bits are cleared.
2777 /// \headerfile <x86intrin.h>
2779 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2782 /// A 128-bit integer vector containing the source operand.
2784 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2785 /// to left-shift each value in operand \a __a.
2786 /// \returns A 128-bit integer vector containing the left-shifted values.
2787 static __inline__ __m128i __DEFAULT_FN_ATTRS
2788 _mm_sll_epi64(__m128i __a, __m128i __count)
2790 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2793 /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
2794 /// by the specified number of bits. High-order bits are filled with the sign
2795 /// bit of the initial value.
2797 /// \headerfile <x86intrin.h>
2799 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2802 /// A 128-bit integer vector containing the source operand.
2804 /// An integer value specifying the number of bits to right-shift each value
2805 /// in operand \a __a.
2806 /// \returns A 128-bit integer vector containing the right-shifted values.
2807 static __inline__ __m128i __DEFAULT_FN_ATTRS
2808 _mm_srai_epi16(__m128i __a, int __count)
2810 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2813 /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
2814 /// by the specified number of bits. High-order bits are filled with the sign
2815 /// bit of the initial value.
2817 /// \headerfile <x86intrin.h>
2819 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2822 /// A 128-bit integer vector containing the source operand.
2824 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2825 /// to right-shift each value in operand \a __a.
2826 /// \returns A 128-bit integer vector containing the right-shifted values.
2827 static __inline__ __m128i __DEFAULT_FN_ATTRS
2828 _mm_sra_epi16(__m128i __a, __m128i __count)
2830 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2833 /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
2834 /// by the specified number of bits. High-order bits are filled with the sign
2835 /// bit of the initial value.
2837 /// \headerfile <x86intrin.h>
2839 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2842 /// A 128-bit integer vector containing the source operand.
2844 /// An integer value specifying the number of bits to right-shift each value
2845 /// in operand \a __a.
2846 /// \returns A 128-bit integer vector containing the right-shifted values.
2847 static __inline__ __m128i __DEFAULT_FN_ATTRS
2848 _mm_srai_epi32(__m128i __a, int __count)
2850 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2853 /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
2854 /// by the specified number of bits. High-order bits are filled with the sign
2855 /// bit of the initial value.
2857 /// \headerfile <x86intrin.h>
2859 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2862 /// A 128-bit integer vector containing the source operand.
2864 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2865 /// to right-shift each value in operand \a __a.
2866 /// \returns A 128-bit integer vector containing the right-shifted values.
2867 static __inline__ __m128i __DEFAULT_FN_ATTRS
2868 _mm_sra_epi32(__m128i __a, __m128i __count)
2870 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
/// \brief Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a. When any of bits [7:4] of the value (narrowed to char) is set,
///    every byte of the result is taken from the zero vector.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) __extension__ ({ \
  /* Shuffle indices 0-15 select bytes of (a), 16-31 select zero bytes. */ \
  (__m128i)__builtin_shufflevector( \
      (__v16qi)(__m128i)(a), \
      (__v16qi)_mm_setzero_si128(), \
      (((char)(imm)) & 0xF0) ? 16 : ((char)(imm) + 0), \
      (((char)(imm)) & 0xF0) ? 17 : ((char)(imm) + 1), \
      (((char)(imm)) & 0xF0) ? 18 : ((char)(imm) + 2), \
      (((char)(imm)) & 0xF0) ? 19 : ((char)(imm) + 3), \
      (((char)(imm)) & 0xF0) ? 20 : ((char)(imm) + 4), \
      (((char)(imm)) & 0xF0) ? 21 : ((char)(imm) + 5), \
      (((char)(imm)) & 0xF0) ? 22 : ((char)(imm) + 6), \
      (((char)(imm)) & 0xF0) ? 23 : ((char)(imm) + 7), \
      (((char)(imm)) & 0xF0) ? 24 : ((char)(imm) + 8), \
      (((char)(imm)) & 0xF0) ? 25 : ((char)(imm) + 9), \
      (((char)(imm)) & 0xF0) ? 26 : ((char)(imm) + 10), \
      (((char)(imm)) & 0xF0) ? 27 : ((char)(imm) + 11), \
      (((char)(imm)) & 0xF0) ? 28 : ((char)(imm) + 12), \
      (((char)(imm)) & 0xF0) ? 29 : ((char)(imm) + 13), \
      (((char)(imm)) & 0xF0) ? 30 : ((char)(imm) + 14), \
      (((char)(imm)) & 0xF0) ? 31 : ((char)(imm) + 15)); })
/* Alternate spelling of _mm_srli_si128; forwards both arguments unchanged. */
#define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
2914 /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
2915 /// operand by the specified number of bits. High-order bits are cleared.
2917 /// \headerfile <x86intrin.h>
2919 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2922 /// A 128-bit integer vector containing the source operand.
2924 /// An integer value specifying the number of bits to right-shift each value
2925 /// in operand \a __a.
2926 /// \returns A 128-bit integer vector containing the right-shifted values.
2927 static __inline__ __m128i __DEFAULT_FN_ATTRS
2928 _mm_srli_epi16(__m128i __a, int __count)
2930 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2933 /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
2934 /// operand by the specified number of bits. High-order bits are cleared.
2936 /// \headerfile <x86intrin.h>
2938 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2941 /// A 128-bit integer vector containing the source operand.
2943 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2944 /// to right-shift each value in operand \a __a.
2945 /// \returns A 128-bit integer vector containing the right-shifted values.
2946 static __inline__ __m128i __DEFAULT_FN_ATTRS
2947 _mm_srl_epi16(__m128i __a, __m128i __count)
2949 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2952 /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
2953 /// operand by the specified number of bits. High-order bits are cleared.
2955 /// \headerfile <x86intrin.h>
2957 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2960 /// A 128-bit integer vector containing the source operand.
2962 /// An integer value specifying the number of bits to right-shift each value
2963 /// in operand \a __a.
2964 /// \returns A 128-bit integer vector containing the right-shifted values.
2965 static __inline__ __m128i __DEFAULT_FN_ATTRS
2966 _mm_srli_epi32(__m128i __a, int __count)
2968 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2971 /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
2972 /// operand by the specified number of bits. High-order bits are cleared.
2974 /// \headerfile <x86intrin.h>
2976 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2979 /// A 128-bit integer vector containing the source operand.
2981 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2982 /// to right-shift each value in operand \a __a.
2983 /// \returns A 128-bit integer vector containing the right-shifted values.
2984 static __inline__ __m128i __DEFAULT_FN_ATTRS
2985 _mm_srl_epi32(__m128i __a, __m128i __count)
2987 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
2990 /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
2991 /// operand by the specified number of bits. High-order bits are cleared.
2993 /// \headerfile <x86intrin.h>
2995 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2998 /// A 128-bit integer vector containing the source operand.
3000 /// An integer value specifying the number of bits to right-shift each value
3001 /// in operand \a __a.
3002 /// \returns A 128-bit integer vector containing the right-shifted values.
3003 static __inline__ __m128i __DEFAULT_FN_ATTRS
3004 _mm_srli_epi64(__m128i __a, int __count)
3006 return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3009 /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
3010 /// operand by the specified number of bits. High-order bits are cleared.
3012 /// \headerfile <x86intrin.h>
3014 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3017 /// A 128-bit integer vector containing the source operand.
3019 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3020 /// to right-shift each value in operand \a __a.
3021 /// \returns A 128-bit integer vector containing the right-shifted values.
3022 static __inline__ __m128i __DEFAULT_FN_ATTRS
3023 _mm_srl_epi64(__m128i __a, __m128i __count)
3025 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3028 /// \brief Compares each of the corresponding 8-bit values of the 128-bit
3029 /// integer vectors for equality. Each comparison yields 0h for false, FFh
3032 /// \headerfile <x86intrin.h>
3034 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3037 /// A 128-bit integer vector.
3039 /// A 128-bit integer vector.
3040 /// \returns A 128-bit integer vector containing the comparison results.
3041 static __inline__ __m128i __DEFAULT_FN_ATTRS
3042 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
3044 return (__m128i)((__v16qi)__a == (__v16qi)__b);
3047 /// \brief Compares each of the corresponding 16-bit values of the 128-bit
3048 /// integer vectors for equality. Each comparison yields 0h for false, FFFFh
3051 /// \headerfile <x86intrin.h>
3053 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3056 /// A 128-bit integer vector.
3058 /// A 128-bit integer vector.
3059 /// \returns A 128-bit integer vector containing the comparison results.
3060 static __inline__ __m128i __DEFAULT_FN_ATTRS
3061 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
3063 return (__m128i)((__v8hi)__a == (__v8hi)__b);
3066 /// \brief Compares each of the corresponding 32-bit values of the 128-bit
3067 /// integer vectors for equality. Each comparison yields 0h for false,
3068 /// FFFFFFFFh for true.
3070 /// \headerfile <x86intrin.h>
3072 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3075 /// A 128-bit integer vector.
3077 /// A 128-bit integer vector.
3078 /// \returns A 128-bit integer vector containing the comparison results.
3079 static __inline__ __m128i __DEFAULT_FN_ATTRS
3080 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
3082 return (__m128i)((__v4si)__a == (__v4si)__b);
3085 /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
3086 /// integer vectors to determine if the values in the first operand are
3087 /// greater than those in the second operand. Each comparison yields 0h for
3088 /// false, FFh for true.
3090 /// \headerfile <x86intrin.h>
3092 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3095 /// A 128-bit integer vector.
3097 /// A 128-bit integer vector.
3098 /// \returns A 128-bit integer vector containing the comparison results.
3099 static __inline__ __m128i __DEFAULT_FN_ATTRS
3100 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
3102 /* This function always performs a signed comparison, but __v16qi is a char
3103 which may be signed or unsigned, so use __v16qs. */
3104 return (__m128i)((__v16qs)__a > (__v16qs)__b);
3107 /// \brief Compares each of the corresponding signed 16-bit values of the
3108 /// 128-bit integer vectors to determine if the values in the first operand
3109 /// are greater than those in the second operand. Each comparison yields 0h
3110 /// for false, FFFFh for true.
3112 /// \headerfile <x86intrin.h>
3114 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3117 /// A 128-bit integer vector.
3119 /// A 128-bit integer vector.
3120 /// \returns A 128-bit integer vector containing the comparison results.
3121 static __inline__ __m128i __DEFAULT_FN_ATTRS
3122 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
3124 return (__m128i)((__v8hi)__a > (__v8hi)__b);
3127 /// \brief Compares each of the corresponding signed 32-bit values of the
3128 /// 128-bit integer vectors to determine if the values in the first operand
3129 /// are greater than those in the second operand. Each comparison yields 0h
3130 /// for false, FFFFFFFFh for true.
3132 /// \headerfile <x86intrin.h>
3134 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3137 /// A 128-bit integer vector.
3139 /// A 128-bit integer vector.
3140 /// \returns A 128-bit integer vector containing the comparison results.
3141 static __inline__ __m128i __DEFAULT_FN_ATTRS
3142 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
3144 return (__m128i)((__v4si)__a > (__v4si)__b);
3147 /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
3148 /// integer vectors to determine if the values in the first operand are less
3149 /// than those in the second operand. Each comparison yields 0h for false,
3152 /// \headerfile <x86intrin.h>
3154 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3157 /// A 128-bit integer vector.
3159 /// A 128-bit integer vector.
3160 /// \returns A 128-bit integer vector containing the comparison results.
3161 static __inline__ __m128i __DEFAULT_FN_ATTRS
3162 _mm_cmplt_epi8(__m128i __a, __m128i __b)
3164 return _mm_cmpgt_epi8(__b, __a);
3167 /// \brief Compares each of the corresponding signed 16-bit values of the
3168 /// 128-bit integer vectors to determine if the values in the first operand
3169 /// are less than those in the second operand. Each comparison yields 0h for
3170 /// false, FFFFh for true.
3172 /// \headerfile <x86intrin.h>
3174 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3177 /// A 128-bit integer vector.
3179 /// A 128-bit integer vector.
3180 /// \returns A 128-bit integer vector containing the comparison results.
3181 static __inline__ __m128i __DEFAULT_FN_ATTRS
3182 _mm_cmplt_epi16(__m128i __a, __m128i __b)
3184 return _mm_cmpgt_epi16(__b, __a);
3187 /// \brief Compares each of the corresponding signed 32-bit values of the
3188 /// 128-bit integer vectors to determine if the values in the first operand
3189 /// are less than those in the second operand. Each comparison yields 0h for
3190 /// false, FFFFFFFFh for true.
3192 /// \headerfile <x86intrin.h>
3194 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3197 /// A 128-bit integer vector.
3199 /// A 128-bit integer vector.
3200 /// \returns A 128-bit integer vector containing the comparison results.
3201 static __inline__ __m128i __DEFAULT_FN_ATTRS
3202 _mm_cmplt_epi32(__m128i __a, __m128i __b)
3204 return _mm_cmpgt_epi32(__b, __a);
3208 /// \brief Converts a 64-bit signed integer value from the second operand into a
3209 /// double-precision value and returns it in the lower element of a [2 x
3210 /// double] vector; the upper element of the returned vector is copied from
3211 /// the upper element of the first operand.
3213 /// \headerfile <x86intrin.h>
3215 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3218 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3219 /// copied to the upper 64 bits of the destination.
3221 /// A 64-bit signed integer operand containing the value to be converted.
3222 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3223 /// converted value of the second operand. The upper 64 bits are copied from
3224 /// the upper 64 bits of the first operand.
3225 static __inline__ __m128d __DEFAULT_FN_ATTRS
3226 _mm_cvtsi64_sd(__m128d __a, long long __b)
3232 /// \brief Converts the first (lower) element of a vector of [2 x double] into a
3233 /// 64-bit signed integer value, according to the current rounding mode.
3235 /// \headerfile <x86intrin.h>
3237 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3240 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3242 /// \returns A 64-bit signed integer containing the converted value.
3243 static __inline__ long long __DEFAULT_FN_ATTRS
3244 _mm_cvtsd_si64(__m128d __a)
3246 return __builtin_ia32_cvtsd2si64((__v2df)__a);
3249 /// \brief Converts the first (lower) element of a vector of [2 x double] into a
3250 /// 64-bit signed integer value, truncating the result when it is inexact.
3252 /// \headerfile <x86intrin.h>
3254 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3258 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3260 /// \returns A 64-bit signed integer containing the converted value.
3261 static __inline__ long long __DEFAULT_FN_ATTRS
3262 _mm_cvttsd_si64(__m128d __a)
3264 return __builtin_ia32_cvttsd2si64((__v2df)__a);
3268 /// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
3270 /// \headerfile <x86intrin.h>
3272 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3275 /// A 128-bit integer vector.
3276 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3277 static __inline__ __m128 __DEFAULT_FN_ATTRS
3278 _mm_cvtepi32_ps(__m128i __a)
3280 return __builtin_ia32_cvtdq2ps((__v4si)__a);
3283 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
3285 /// \headerfile <x86intrin.h>
3287 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3290 /// A 128-bit vector of [4 x float].
3291 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3293 static __inline__ __m128i __DEFAULT_FN_ATTRS
3294 _mm_cvtps_epi32(__m128 __a)
3296 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3299 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
3300 /// truncating the result when it is inexact.
3302 /// \headerfile <x86intrin.h>
3304 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3308 /// A 128-bit vector of [4 x float].
3309 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3310 static __inline__ __m128i __DEFAULT_FN_ATTRS
3311 _mm_cvttps_epi32(__m128 __a)
3313 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3316 /// \brief Returns a vector of [4 x i32] where the lowest element is the input
3317 /// operand and the remaining elements are zero.
3319 /// \headerfile <x86intrin.h>
3321 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3324 /// A 32-bit signed integer operand.
3325 /// \returns A 128-bit vector of [4 x i32].
3326 static __inline__ __m128i __DEFAULT_FN_ATTRS
3327 _mm_cvtsi32_si128(int __a)
3329 return (__m128i)(__v4si){ __a, 0, 0, 0 };
3333 /// \brief Returns a vector of [2 x i64] where the lower element is the input
3334 /// operand and the upper element is zero.
3336 /// \headerfile <x86intrin.h>
3338 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3341 /// A 64-bit signed integer operand containing the value to be converted.
3342 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3343 static __inline__ __m128i __DEFAULT_FN_ATTRS
3344 _mm_cvtsi64_si128(long long __a)
3346 return (__m128i){ __a, 0 };
3350 /// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
3351 /// 32-bit signed integer value.
3353 /// \headerfile <x86intrin.h>
3355 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3358 /// A vector of [4 x i32]. The least significant 32 bits are moved to the
3360 /// \returns A 32-bit signed integer containing the moved value.
3361 static __inline__ int __DEFAULT_FN_ATTRS
3362 _mm_cvtsi128_si32(__m128i __a)
3364 __v4si __b = (__v4si)__a;
3369 /// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
3370 /// 64-bit signed integer value.
3372 /// \headerfile <x86intrin.h>
3374 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3377 /// A vector of [2 x i64]. The least significant 64 bits are moved to the
3379 /// \returns A 64-bit signed integer containing the moved value.
3380 static __inline__ long long __DEFAULT_FN_ATTRS
3381 _mm_cvtsi128_si64(__m128i __a)
3387 /// \brief Moves packed integer values from an aligned 128-bit memory location
3388 /// to elements in a 128-bit integer vector.
3390 /// \headerfile <x86intrin.h>
3392 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3395 /// An aligned pointer to a memory location containing integer values.
3396 /// \returns A 128-bit integer vector containing the moved values.
3397 static __inline__ __m128i __DEFAULT_FN_ATTRS
3398 _mm_load_si128(__m128i const *__p)
3403 /// \brief Moves packed integer values from an unaligned 128-bit memory location
3404 /// to elements in a 128-bit integer vector.
3406 /// \headerfile <x86intrin.h>
3408 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3411 /// A pointer to a memory location containing integer values.
3412 /// \returns A 128-bit integer vector containing the moved values.
3413 static __inline__ __m128i __DEFAULT_FN_ATTRS
3414 _mm_loadu_si128(__m128i const *__p)
3416 struct __loadu_si128 {
3418 } __attribute__((__packed__, __may_alias__));
3419 return ((struct __loadu_si128*)__p)->__v;
3422 /// \brief Returns a vector of [2 x i64] where the lower element is taken from
3423 /// the lower element of the operand, and the upper element is zero.
3425 /// \headerfile <x86intrin.h>
3427 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3430 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3431 /// the destination.
3432 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3433 /// moved value. The higher order bits are cleared.
3434 static __inline__ __m128i __DEFAULT_FN_ATTRS
3435 _mm_loadl_epi64(__m128i const *__p)
3437 struct __mm_loadl_epi64_struct {
3439 } __attribute__((__packed__, __may_alias__));
3440 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
3443 /// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
3444 /// This could be used as an argument to another intrinsic function where the
3445 /// argument is required but the value is not actually used.
3447 /// \headerfile <x86intrin.h>
3449 /// This intrinsic has no corresponding instruction.
3451 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3452 static __inline__ __m128i __DEFAULT_FN_ATTRS
3453 _mm_undefined_si128(void)
3455 return (__m128i)__builtin_ia32_undef128();
3458 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3459 /// the specified 64-bit integer values.
3461 /// \headerfile <x86intrin.h>
3463 /// This intrinsic is a utility function and does not correspond to a specific
3467 /// A 64-bit integer value used to initialize the upper 64 bits of the
3468 /// destination vector of [2 x i64].
3470 /// A 64-bit integer value used to initialize the lower 64 bits of the
3471 /// destination vector of [2 x i64].
3472 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3473 /// provided in the operands.
3474 static __inline__ __m128i __DEFAULT_FN_ATTRS
3475 _mm_set_epi64x(long long __q1, long long __q0)
3477 return (__m128i){ __q0, __q1 };
3480 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3481 /// the specified 64-bit integer values.
3483 /// \headerfile <x86intrin.h>
3485 /// This intrinsic is a utility function and does not correspond to a specific
3489 /// A 64-bit integer value used to initialize the upper 64 bits of the
3490 /// destination vector of [2 x i64].
3492 /// A 64-bit integer value used to initialize the lower 64 bits of the
3493 /// destination vector of [2 x i64].
3494 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3495 /// provided in the operands.
3496 static __inline__ __m128i __DEFAULT_FN_ATTRS
3497 _mm_set_epi64(__m64 __q1, __m64 __q0)
3499 return (__m128i){ (long long)__q0, (long long)__q1 };
3502 /// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3503 /// the specified 32-bit integer values.
3505 /// \headerfile <x86intrin.h>
3507 /// This intrinsic is a utility function and does not correspond to a specific
3511 /// A 32-bit integer value used to initialize bits [127:96] of the
3512 /// destination vector.
3514 /// A 32-bit integer value used to initialize bits [95:64] of the destination
3517 /// A 32-bit integer value used to initialize bits [63:32] of the destination
3520 /// A 32-bit integer value used to initialize bits [31:0] of the destination
3522 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3523 /// provided in the operands.
3524 static __inline__ __m128i __DEFAULT_FN_ATTRS
3525 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
3527 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
3530 /// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3531 /// the specified 16-bit integer values.
3533 /// \headerfile <x86intrin.h>
3535 /// This intrinsic is a utility function and does not correspond to a specific
3539 /// A 16-bit integer value used to initialize bits [127:112] of the
3540 /// destination vector.
3542 /// A 16-bit integer value used to initialize bits [111:96] of the
3543 /// destination vector.
3545 /// A 16-bit integer value used to initialize bits [95:80] of the destination
3548 /// A 16-bit integer value used to initialize bits [79:64] of the destination
3551 /// A 16-bit integer value used to initialize bits [63:48] of the destination
3554 /// A 16-bit integer value used to initialize bits [47:32] of the destination
3557 /// A 16-bit integer value used to initialize bits [31:16] of the destination
3560 /// A 16-bit integer value used to initialize bits [15:0] of the destination
3562 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3563 /// provided in the operands.
3564 static __inline__ __m128i __DEFAULT_FN_ATTRS
3565 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
3567 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
3570 /// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3571 /// the specified 8-bit integer values.
3573 /// \headerfile <x86intrin.h>
3575 /// This intrinsic is a utility function and does not correspond to a specific
3579 /// Initializes bits [127:120] of the destination vector.
3581 /// Initializes bits [119:112] of the destination vector.
3583 /// Initializes bits [111:104] of the destination vector.
3585 /// Initializes bits [103:96] of the destination vector.
3587 /// Initializes bits [95:88] of the destination vector.
3589 /// Initializes bits [87:80] of the destination vector.
3591 /// Initializes bits [79:72] of the destination vector.
3593 /// Initializes bits [71:64] of the destination vector.
3595 /// Initializes bits [63:56] of the destination vector.
3597 /// Initializes bits [55:48] of the destination vector.
3599 /// Initializes bits [47:40] of the destination vector.
3601 /// Initializes bits [39:32] of the destination vector.
3603 /// Initializes bits [31:24] of the destination vector.
3605 /// Initializes bits [23:16] of the destination vector.
3607 /// Initializes bits [15:8] of the destination vector.
3609 /// Initializes bits [7:0] of the destination vector.
3610 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3611 /// provided in the operands.
3612 static __inline__ __m128i __DEFAULT_FN_ATTRS
3613 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
3615 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
3618 /// \brief Initializes both values in a 128-bit integer vector with the
3619 /// specified 64-bit integer value.
3621 /// \headerfile <x86intrin.h>
3623 /// This intrinsic is a utility function and does not correspond to a specific
3627 /// Integer value used to initialize the elements of the destination integer
3629 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3630 /// elements containing the value provided in the operand.
3631 static __inline__ __m128i __DEFAULT_FN_ATTRS
3632 _mm_set1_epi64x(long long __q)
3634 return (__m128i){ __q, __q };
3637 /// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
3638 /// specified 64-bit value.
3640 /// \headerfile <x86intrin.h>
3642 /// This intrinsic is a utility function and does not correspond to a specific
3646 /// A 64-bit value used to initialize the elements of the destination integer
3648 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3649 /// containing the value provided in the operand.
3650 static __inline__ __m128i __DEFAULT_FN_ATTRS
3651 _mm_set1_epi64(__m64 __q)
3653 return (__m128i){ (long long)__q, (long long)__q };
3656 /// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
3657 /// specified 32-bit value.
3659 /// \headerfile <x86intrin.h>
3661 /// This intrinsic is a utility function and does not correspond to a specific
3665 /// A 32-bit value used to initialize the elements of the destination integer
3667 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3668 /// containing the value provided in the operand.
3669 static __inline__ __m128i __DEFAULT_FN_ATTRS
3670 _mm_set1_epi32(int __i)
3672 return (__m128i)(__v4si){ __i, __i, __i, __i };
3675 /// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
3676 /// specified 16-bit value.
3678 /// \headerfile <x86intrin.h>
3680 /// This intrinsic is a utility function and does not correspond to a specific
3684 /// A 16-bit value used to initialize the elements of the destination integer
3686 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3687 /// containing the value provided in the operand.
3688 static __inline__ __m128i __DEFAULT_FN_ATTRS
3689 _mm_set1_epi16(short __w)
3691 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
3694 /// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
3695 /// specified 8-bit value.
3697 /// \headerfile <x86intrin.h>
3699 /// This intrinsic is a utility function and does not correspond to a specific
3703 /// An 8-bit value used to initialize the elements of the destination integer
3705 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3706 /// containing the value provided in the operand.
3707 static __inline__ __m128i __DEFAULT_FN_ATTRS
3708 _mm_set1_epi8(char __b)
3710 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
3713 /// \brief Constructs a 128-bit integer vector, initialized in reverse order
3714 /// with the specified 64-bit integral values.
3716 /// \headerfile <x86intrin.h>
3718 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
3722 /// A 64-bit integral value used to initialize the lower 64 bits of the
3725 /// A 64-bit integral value used to initialize the upper 64 bits of the
3727 /// \returns An initialized 128-bit integer vector.
3728 static __inline__ __m128i __DEFAULT_FN_ATTRS
3729 _mm_setr_epi64(__m64 __q0, __m64 __q1)
3731 return (__m128i){ (long long)__q0, (long long)__q1 };
3734 /// \brief Constructs a 128-bit integer vector, initialized in reverse order
3735 /// with the specified 32-bit integral values.
3737 /// \headerfile <x86intrin.h>
3739 /// This intrinsic is a utility function and does not correspond to a specific
3743 /// A 32-bit integral value used to initialize bits [31:0] of the result.
3745 /// A 32-bit integral value used to initialize bits [63:32] of the result.
3747 /// A 32-bit integral value used to initialize bits [95:64] of the result.
3749 /// A 32-bit integral value used to initialize bits [127:96] of the result.
3750 /// \returns An initialized 128-bit integer vector.
3751 static __inline__ __m128i __DEFAULT_FN_ATTRS
3752 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
3754 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
3757 /// \brief Constructs a 128-bit integer vector, initialized in reverse order
3758 /// with the specified 16-bit integral values.
3760 /// \headerfile <x86intrin.h>
3762 /// This intrinsic is a utility function and does not correspond to a specific
3766 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3768 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3770 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3772 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3774 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3776 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3778 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3780 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3781 /// \returns An initialized 128-bit integer vector.
3782 static __inline__ __m128i __DEFAULT_FN_ATTRS
3783 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
3785 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
3788 /// \brief Constructs a 128-bit integer vector, initialized in reverse order
3789 /// with the specified 8-bit integral values.
3791 /// \headerfile <x86intrin.h>
3793 /// This intrinsic is a utility function and does not correspond to a specific
3797 /// An 8-bit integral value used to initialize bits [7:0] of the result.
3799 /// An 8-bit integral value used to initialize bits [15:8] of the result.
3801 /// An 8-bit integral value used to initialize bits [23:16] of the result.
3803 /// An 8-bit integral value used to initialize bits [31:24] of the result.
3805 /// An 8-bit integral value used to initialize bits [39:32] of the result.
3807 /// An 8-bit integral value used to initialize bits [47:40] of the result.
3809 /// An 8-bit integral value used to initialize bits [55:48] of the result.
3811 /// An 8-bit integral value used to initialize bits [63:56] of the result.
3813 /// An 8-bit integral value used to initialize bits [71:64] of the result.
3815 /// An 8-bit integral value used to initialize bits [79:72] of the result.
3817 /// An 8-bit integral value used to initialize bits [87:80] of the result.
3819 /// An 8-bit integral value used to initialize bits [95:88] of the result.
3821 /// An 8-bit integral value used to initialize bits [103:96] of the result.
3823 /// An 8-bit integral value used to initialize bits [111:104] of the result.
3825 /// An 8-bit integral value used to initialize bits [119:112] of the result.
3827 /// An 8-bit integral value used to initialize bits [127:120] of the result.
3828 /// \returns An initialized 128-bit integer vector.
3829 static __inline__ __m128i __DEFAULT_FN_ATTRS
3830 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
3832 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
3835 /// \brief Creates a 128-bit integer vector initialized to zero.
3837 /// \headerfile <x86intrin.h>
3839 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3841 /// \returns An initialized 128-bit integer vector with all elements set to
3843 static __inline__ __m128i __DEFAULT_FN_ATTRS
3844 _mm_setzero_si128(void)
3846 return (__m128i){ 0LL, 0LL };
3849 /// \brief Stores a 128-bit integer vector to a memory location aligned on a
3850 /// 128-bit boundary.
3852 /// \headerfile <x86intrin.h>
3854 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3857 /// A pointer to an aligned memory location that will receive the integer
3860 /// A 128-bit integer vector containing the values to be moved.
3861 static __inline__ void __DEFAULT_FN_ATTRS
3862 _mm_store_si128(__m128i *__p, __m128i __b)
3867 /// \brief Stores a 128-bit integer vector to an unaligned memory location.
3869 /// \headerfile <x86intrin.h>
3871 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3874 /// A pointer to a memory location that will receive the integer values.
3876 /// A 128-bit integer vector containing the values to be moved.
3877 static __inline__ void __DEFAULT_FN_ATTRS
3878 _mm_storeu_si128(__m128i *__p, __m128i __b)
3880 struct __storeu_si128 {
3882 } __attribute__((__packed__, __may_alias__));
3883 ((struct __storeu_si128*)__p)->__v = __b;
3886 /// \brief Moves bytes selected by the mask from the first operand to the
3887 /// specified unaligned memory location. When a mask bit is 1, the
3888 /// corresponding byte is written, otherwise it is not written. To minimize
/// caching, the data is flagged as non-temporal (unlikely to be used again
3890 /// soon). Exception and trap behavior for elements not selected for storage
3891 /// to memory are implementation dependent.
3893 /// \headerfile <x86intrin.h>
3895 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3899 /// A 128-bit integer vector containing the values to be moved.
3901 /// A 128-bit integer vector containing the mask. The most significant bit of
3902 /// each byte represents the mask bits.
3904 /// A pointer to an unaligned 128-bit memory location where the specified
3905 /// values are moved.
3906 static __inline__ void __DEFAULT_FN_ATTRS
3907 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
3909 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3912 /// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3913 /// a memory location.
3915 /// \headerfile <x86intrin.h>
3917 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3920 /// A pointer to a 64-bit memory location that will receive the lower 64 bits
3921 /// of the integer vector parameter.
3923 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3924 /// value to be stored.
3925 static __inline__ void __DEFAULT_FN_ATTRS
3926 _mm_storel_epi64(__m128i *__p, __m128i __a)
3928 struct __mm_storel_epi64_struct {
3930 } __attribute__((__packed__, __may_alias__));
3931 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
3934 /// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3935 /// aligned memory location. To minimize caching, the data is flagged as
3936 /// non-temporal (unlikely to be used again soon).
3938 /// \headerfile <x86intrin.h>
3940 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3943 /// A pointer to the 128-bit aligned memory location used to store the value.
3945 /// A vector of [2 x double] containing the 64-bit values to be stored.
3946 static __inline__ void __DEFAULT_FN_ATTRS
3947 _mm_stream_pd(double *__p, __m128d __a)
3949 __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
3952 /// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
3953 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3954 /// used again soon).
3956 /// \headerfile <x86intrin.h>
3958 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3961 /// A pointer to the 128-bit aligned memory location used to store the value.
3963 /// A 128-bit integer vector containing the values to be stored.
3964 static __inline__ void __DEFAULT_FN_ATTRS
3965 _mm_stream_si128(__m128i *__p, __m128i __a)
3967 __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
3970 /// \brief Stores a 32-bit integer value in the specified memory location. To
3971 /// minimize caching, the data is flagged as non-temporal (unlikely to be
3972 /// used again soon).
3974 /// \headerfile <x86intrin.h>
3976 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
3979 /// A pointer to the 32-bit memory location used to store the value.
3981 /// A 32-bit integer containing the value to be stored.
3982 static __inline__ void __DEFAULT_FN_ATTRS
3983 _mm_stream_si32(int *__p, int __a)
3985 __builtin_ia32_movnti(__p, __a);
3989 /// \brief Stores a 64-bit integer value in the specified memory location. To
3990 /// minimize caching, the data is flagged as non-temporal (unlikely to be
3991 /// used again soon).
3993 /// \headerfile <x86intrin.h>
3995 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
3998 /// A pointer to the 64-bit memory location used to store the value.
4000 /// A 64-bit integer containing the value to be stored.
4001 static __inline__ void __DEFAULT_FN_ATTRS
4002 _mm_stream_si64(long long *__p, long long __a)
4004 __builtin_ia32_movnti64(__p, __a);
#if defined(__cplusplus)
extern "C" {
#endif

/// \brief The cache line containing \a __p is flushed and invalidated from all
///    caches in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location used to identify the cache line to be
///    flushed.
void _mm_clflush(void const *__p);

/// \brief Forces strong memory ordering (serialization) between load
///    instructions preceding this instruction and load instructions following
///    this instruction, ensuring the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// \brief Forces strong memory ordering (serialization) between load and store
///    instructions preceding this instruction and load and store instructions
///    following this instruction, ensuring that the system completes all
///    previous memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
4050 /// \brief Converts 16-bit signed integers from both 128-bit integer vector
4051 /// operands into 8-bit signed integers, and packs the results into the
4052 /// destination. Positive values greater than 0x7F are saturated to 0x7F.
4053 /// Negative values less than 0x80 are saturated to 0x80.
4055 /// \headerfile <x86intrin.h>
4057 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4060 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4061 /// a signed integer and is converted to a 8-bit signed integer with
4062 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4063 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4064 /// written to the lower 64 bits of the result.
4066 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4067 /// a signed integer and is converted to a 8-bit signed integer with
4068 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4069 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4070 /// written to the higher 64 bits of the result.
4071 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4072 static __inline__ __m128i __DEFAULT_FN_ATTRS
4073 _mm_packs_epi16(__m128i __a, __m128i __b)
4075 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4078 /// \brief Converts 32-bit signed integers from both 128-bit integer vector
4079 /// operands into 16-bit signed integers, and packs the results into the
4080 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4081 /// Negative values less than 0x8000 are saturated to 0x8000.
4083 /// \headerfile <x86intrin.h>
4085 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4088 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4089 /// a signed integer and is converted to a 16-bit signed integer with
4090 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4091 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4092 /// are written to the lower 64 bits of the result.
4094 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4095 /// a signed integer and is converted to a 16-bit signed integer with
4096 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4097 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4098 /// are written to the higher 64 bits of the result.
4099 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4100 static __inline__ __m128i __DEFAULT_FN_ATTRS
4101 _mm_packs_epi32(__m128i __a, __m128i __b)
4103 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4106 /// \brief Converts 16-bit signed integers from both 128-bit integer vector
4107 /// operands into 8-bit unsigned integers, and packs the results into the
4108 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less
4109 /// than 0x00 are saturated to 0x00.
4111 /// \headerfile <x86intrin.h>
4113 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4116 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4117 /// a signed integer and is converted to an 8-bit unsigned integer with
4118 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4119 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4120 /// written to the lower 64 bits of the result.
4122 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4123 /// a signed integer and is converted to an 8-bit unsigned integer with
4124 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4125 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4126 /// written to the higher 64 bits of the result.
4127 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4128 static __inline__ __m128i __DEFAULT_FN_ATTRS
4129 _mm_packus_epi16(__m128i __a, __m128i __b)
4131 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4134 /// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4135 /// the immediate-value parameter as a selector.
4137 /// \headerfile <x86intrin.h>
4139 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4142 /// A 128-bit integer vector.
/// An immediate value. Bits [2:0] select the value from \a __a to be assigned
4145 /// to bits[15:0] of the result. \n
4146 /// 000: assign values from bits [15:0] of \a __a. \n
4147 /// 001: assign values from bits [31:16] of \a __a. \n
4148 /// 010: assign values from bits [47:32] of \a __a. \n
4149 /// 011: assign values from bits [63:48] of \a __a. \n
4150 /// 100: assign values from bits [79:64] of \a __a. \n
4151 /// 101: assign values from bits [95:80] of \a __a. \n
4152 /// 110: assign values from bits [111:96] of \a __a. \n
4153 /// 111: assign values from bits [127:112] of \a __a.
4154 /// \returns An integer, whose lower 16 bits are selected from the 128-bit
4155 /// integer vector parameter and the remaining bits are assigned zeros.
4156 static __inline__ int __DEFAULT_FN_ATTRS
4157 _mm_extract_epi16(__m128i __a, int __imm)
4159 __v8hi __b = (__v8hi)__a;
4160 return (unsigned short)__b[__imm & 7];
4163 /// \brief Constructs a 128-bit integer vector by first making a copy of the
4164 /// 128-bit integer vector parameter, and then inserting the lower 16 bits
4165 /// of an integer parameter into an offset specified by the immediate-value
4168 /// \headerfile <x86intrin.h>
4170 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4173 /// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4174 /// result and then one of the eight elements in the result is replaced by
4175 /// the lower 16 bits of \a __b.
4177 /// An integer. The lower 16 bits of this parameter are written to the
4178 /// result beginning at an offset specified by \a __imm.
/// An immediate value. Bits [2:0] select which of the eight 16-bit elements
/// of the result is replaced by the lower 16 bits of \a __b.
4182 /// \returns A 128-bit integer vector containing the constructed values.
4183 static __inline__ __m128i __DEFAULT_FN_ATTRS
4184 _mm_insert_epi16(__m128i __a, int __b, int __imm)
4186 __v8hi __c = (__v8hi)__a;
4187 __c[__imm & 7] = __b;
4188 return (__m128i)__c;
4191 /// \brief Copies the values of the most significant bits from each 8-bit
4192 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4193 /// value, zero-extends the value, and writes it to the destination.
4195 /// \headerfile <x86intrin.h>
4197 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4200 /// A 128-bit integer vector containing the values with bits to be extracted.
4201 /// \returns The most significant bits from each 8-bit element in \a __a,
4202 /// written to bits [15:0]. The other bits are assigned zeros.
4203 static __inline__ int __DEFAULT_FN_ATTRS
4204 _mm_movemask_epi8(__m128i __a)
4206 return __builtin_ia32_pmovmskb128((__v16qi)__a);
4209 /// \brief Constructs a 128-bit integer vector by shuffling four 32-bit
4210 /// elements of a 128-bit integer vector parameter, using the immediate-value
4211 /// parameter as a specifier.
4213 /// \headerfile <x86intrin.h>
4216 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4219 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4222 /// A 128-bit integer vector containing the values to be copied.
4224 /// An immediate value containing an 8-bit value specifying which elements to
/// copy from \a a. The destinations within the 128-bit destination are assigned
4226 /// values as follows: \n
4227 /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4228 /// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4229 /// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4230 /// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4231 /// Bit value assignments: \n
4232 /// 00: assign values from bits [31:0] of \a a. \n
4233 /// 01: assign values from bits [63:32] of \a a. \n
4234 /// 10: assign values from bits [95:64] of \a a. \n
4235 /// 11: assign values from bits [127:96] of \a a.
4236 /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v4si)(__m128i)(a), (__v4si)_mm_undefined_si128(), \
      /* one 2-bit selector per result element, lowest element first */ \
      ((imm) >> 0) & 0x3, \
      ((imm) >> 2) & 0x3, \
      ((imm) >> 4) & 0x3, \
      ((imm) >> 6) & 0x3); })
4243 /// \brief Constructs a 128-bit integer vector by shuffling four lower 16-bit
4244 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4245 /// value parameter as a specifier.
4247 /// \headerfile <x86intrin.h>
4250 /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4253 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4256 /// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4257 /// [127:64] of the result.
4259 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4260 /// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4261 /// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4262 /// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4263 /// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4264 /// Bit value assignments: \n
4265 /// 00: assign values from bits [15:0] of \a a. \n
4266 /// 01: assign values from bits [31:16] of \a a. \n
4267 /// 10: assign values from bits [47:32] of \a a. \n
4268 /// 11: assign values from bits [63:48] of \a a. \n
4269 /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   /* low four elements: 2-bit selectors */ \
                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
                                   /* high four elements pass through */ \
                                   4, 5, 6, 7); })
4277 /// \brief Constructs a 128-bit integer vector by shuffling four upper 16-bit
4278 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4279 /// value parameter as a specifier.
4281 /// \headerfile <x86intrin.h>
4284 /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4287 /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4290 /// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4291 /// [63:0] of the result.
4293 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4294 /// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4295 /// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4296 /// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4297 /// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4298 /// Bit value assignments: \n
4299 /// 00: assign values from bits [79:64] of \a a. \n
4300 /// 01: assign values from bits [95:80] of \a a. \n
4301 /// 10: assign values from bits [111:96] of \a a. \n
4302 /// 11: assign values from bits [127:112] of \a a. \n
4303 /// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   /* low four elements pass through */ \
                                   0, 1, 2, 3, \
                                   /* high four elements: 2-bit selectors */ \
                                   4 + (((imm) >> 0) & 0x3), \
                                   4 + (((imm) >> 2) & 0x3), \
                                   4 + (((imm) >> 4) & 0x3), \
                                   4 + (((imm) >> 6) & 0x3)); })
4313 /// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors
4314 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4316 /// \headerfile <x86intrin.h>
4318 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4322 /// A 128-bit vector of [16 x i8]. \n
4323 /// Bits [71:64] are written to bits [7:0] of the result. \n
4324 /// Bits [79:72] are written to bits [23:16] of the result. \n
4325 /// Bits [87:80] are written to bits [39:32] of the result. \n
4326 /// Bits [95:88] are written to bits [55:48] of the result. \n
4327 /// Bits [103:96] are written to bits [71:64] of the result. \n
4328 /// Bits [111:104] are written to bits [87:80] of the result. \n
4329 /// Bits [119:112] are written to bits [103:96] of the result. \n
4330 /// Bits [127:120] are written to bits [119:112] of the result.
4332 /// A 128-bit vector of [16 x i8]. \n
4333 /// Bits [71:64] are written to bits [15:8] of the result. \n
4334 /// Bits [79:72] are written to bits [31:24] of the result. \n
4335 /// Bits [87:80] are written to bits [47:40] of the result. \n
4336 /// Bits [95:88] are written to bits [63:56] of the result. \n
4337 /// Bits [103:96] are written to bits [79:72] of the result. \n
4338 /// Bits [111:104] are written to bits [95:88] of the result. \n
4339 /// Bits [119:112] are written to bits [111:104] of the result. \n
4340 /// Bits [127:120] are written to bits [127:120] of the result.
4341 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4342 static __inline__ __m128i __DEFAULT_FN_ATTRS
4343 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
4345 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); /* result order: a[8], b[8], a[9], b[9], ..., a[15], b[15] */
4348 /// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4349 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4351 /// \headerfile <x86intrin.h>
4353 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4357 /// A 128-bit vector of [8 x i16]. \n
4358 /// Bits [79:64] are written to bits [15:0] of the result. \n
4359 /// Bits [95:80] are written to bits [47:32] of the result. \n
4360 /// Bits [111:96] are written to bits [79:64] of the result. \n
4361 /// Bits [127:112] are written to bits [111:96] of the result.
4363 /// A 128-bit vector of [8 x i16]. \n
4364 /// Bits [79:64] are written to bits [31:16] of the result. \n
4365 /// Bits [95:80] are written to bits [63:48] of the result. \n
4366 /// Bits [111:96] are written to bits [95:80] of the result. \n
4367 /// Bits [127:112] are written to bits [127:112] of the result.
4368 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4369 static __inline__ __m128i __DEFAULT_FN_ATTRS
4370 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
4372 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
4375 /// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4376 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4378 /// \headerfile <x86intrin.h>
4380 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4384 /// A 128-bit vector of [4 x i32]. \n
4385 /// Bits [95:64] are written to bits [31:0] of the destination. \n
4386 /// Bits [127:96] are written to bits [95:64] of the destination.
4388 /// A 128-bit vector of [4 x i32]. \n
4389 /// Bits [95:64] are written to bits [63:32] of the destination. \n
4390 /// Bits [127:96] are written to bits [127:96] of the destination.
4391 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4392 static __inline__ __m128i __DEFAULT_FN_ATTRS
4393 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
4395 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
4398 /// \brief Unpacks the high-order (index 1) values from two 128-bit vectors
4399 /// of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4401 /// \headerfile <x86intrin.h>
4403 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4407 /// A 128-bit vector of [2 x i64]. \n
4408 /// Bits [127:64] are written to bits [63:0] of the destination.
4410 /// A 128-bit vector of [2 x i64]. \n
4411 /// Bits [127:64] are written to bits [127:64] of the destination.
4412 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4413 static __inline__ __m128i __DEFAULT_FN_ATTRS
4414 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
4416 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
4419 /// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4420 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4422 /// \headerfile <x86intrin.h>
4424 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4428 /// A 128-bit vector of [16 x i8]. \n
4429 /// Bits [7:0] are written to bits [7:0] of the result. \n
4430 /// Bits [15:8] are written to bits [23:16] of the result. \n
4431 /// Bits [23:16] are written to bits [39:32] of the result. \n
4432 /// Bits [31:24] are written to bits [55:48] of the result. \n
4433 /// Bits [39:32] are written to bits [71:64] of the result. \n
4434 /// Bits [47:40] are written to bits [87:80] of the result. \n
4435 /// Bits [55:48] are written to bits [103:96] of the result. \n
4436 /// Bits [63:56] are written to bits [119:112] of the result.
4438 /// A 128-bit vector of [16 x i8]. \n
4439 /// Bits [7:0] are written to bits [15:8] of the result. \n
4440 /// Bits [15:8] are written to bits [31:24] of the result. \n
4441 /// Bits [23:16] are written to bits [47:40] of the result. \n
4442 /// Bits [31:24] are written to bits [63:56] of the result. \n
4443 /// Bits [39:32] are written to bits [79:72] of the result. \n
4444 /// Bits [47:40] are written to bits [95:88] of the result. \n
4445 /// Bits [55:48] are written to bits [111:104] of the result. \n
4446 /// Bits [63:56] are written to bits [127:120] of the result.
4447 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4448 static __inline__ __m128i __DEFAULT_FN_ATTRS
4449 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
4451 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); /* result order: a[0], b[0], a[1], b[1], ..., a[7], b[7] */
4454 /// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit
4455 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4458 /// \headerfile <x86intrin.h>
4460 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4464 /// A 128-bit vector of [8 x i16]. \n
4465 /// Bits [15:0] are written to bits [15:0] of the result. \n
4466 /// Bits [31:16] are written to bits [47:32] of the result. \n
4467 /// Bits [47:32] are written to bits [79:64] of the result. \n
4468 /// Bits [63:48] are written to bits [111:96] of the result.
4470 /// A 128-bit vector of [8 x i16]. \n
4471 /// Bits [15:0] are written to bits [31:16] of the result. \n
4472 /// Bits [31:16] are written to bits [63:48] of the result. \n
4473 /// Bits [47:32] are written to bits [95:80] of the result. \n
4474 /// Bits [63:48] are written to bits [127:112] of the result.
4475 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4476 static __inline__ __m128i __DEFAULT_FN_ATTRS
4477 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
4479 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
4482 /// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4483 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4485 /// \headerfile <x86intrin.h>
4487 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4491 /// A 128-bit vector of [4 x i32]. \n
4492 /// Bits [31:0] are written to bits [31:0] of the destination. \n
4493 /// Bits [63:32] are written to bits [95:64] of the destination.
4495 /// A 128-bit vector of [4 x i32]. \n
4496 /// Bits [31:0] are written to bits [63:32] of the destination. \n
4497 /// Bits [63:32] are written to bits [127:96] of the destination.
4498 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4499 static __inline__ __m128i __DEFAULT_FN_ATTRS
4500 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
4502 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
4505 /// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of
4506 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4508 /// \headerfile <x86intrin.h>
4510 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4514 /// A 128-bit vector of [2 x i64]. \n
4515 /// Bits [63:0] are written to bits [63:0] of the destination.
4517 /// A 128-bit vector of [2 x i64]. \n
4518 /// Bits [63:0] are written to bits [127:64] of the destination.
4519 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4520 static __inline__ __m128i __DEFAULT_FN_ATTRS
4521 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
4523 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
4526 /// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4529 /// \headerfile <x86intrin.h>
4531 /// This intrinsic has no corresponding instruction.
4534 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4536 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4537 static __inline__ __m64 __DEFAULT_FN_ATTRS
4538 _mm_movepi64_pi64(__m128i __a)
4540 return (__m64)__a[0]; /* element 0 of the two-element long long vector, i.e. the lower 64 bits */
4543 /// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4546 /// \headerfile <x86intrin.h>
4548 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction.
4552 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4553 /// the operand. The upper 64 bits are assigned zeros.
4554 static __inline__ __m128i __DEFAULT_FN_ATTRS
4555 _mm_movpi64_epi64(__m64 __a)
4557 return (__m128i){ (long long)__a, 0 }; /* { lower element, upper element }: operand first, then zero */
4560 /// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4561 /// integer vector, zeroing the upper bits.
4563 /// \headerfile <x86intrin.h>
4565 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4568 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4570 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4571 /// the operand. The upper 64 bits are assigned zeros.
4572 static __inline__ __m128i __DEFAULT_FN_ATTRS
4573 _mm_move_epi64(__m128i __a)
4575 return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2); /* index 0: lower element of __a; index 2: first element of the zero vector */
4578 /// \brief Unpacks the high-order (index 1) values from two 128-bit vectors
4579 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4582 /// \headerfile <x86intrin.h>
4584 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4587 /// A 128-bit vector of [2 x double]. \n
4588 /// Bits [127:64] are written to bits [63:0] of the destination.
4590 /// A 128-bit vector of [2 x double]. \n
4591 /// Bits [127:64] are written to bits [127:64] of the destination.
4592 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4593 static __inline__ __m128d __DEFAULT_FN_ATTRS
4594 _mm_unpackhi_pd(__m128d __a, __m128d __b)
4596 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
4599 /// \brief Unpacks the low-order (index 0) values from two 128-bit vectors
4600 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4603 /// \headerfile <x86intrin.h>
4605 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4608 /// A 128-bit vector of [2 x double]. \n
4609 /// Bits [63:0] are written to bits [63:0] of the destination.
4611 /// A 128-bit vector of [2 x double]. \n
4612 /// Bits [63:0] are written to bits [127:64] of the destination.
4613 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4614 static __inline__ __m128d __DEFAULT_FN_ATTRS
4615 _mm_unpacklo_pd(__m128d __a, __m128d __b)
4617 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
4620 /// \brief Extracts the sign bits of the double-precision values in the 128-bit
4621 /// vector of [2 x double], zero-extends the resulting mask, and writes it to
4622 /// the low-order bits of the destination.
4624 /// \headerfile <x86intrin.h>
4626 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4629 /// A 128-bit vector of [2 x double] containing the values with sign bits to
4631 /// \returns The sign bits from each of the double-precision elements in \a __a,
4632 /// written to bits [1:0]. The remaining bits are assigned values of zero.
4633 static __inline__ int __DEFAULT_FN_ATTRS
4634 _mm_movemask_pd(__m128d __a)
4636 return __builtin_ia32_movmskpd((__v2df)__a);
4640 /// \brief Constructs a 128-bit floating-point vector of [2 x double] from two
4641 /// 128-bit vector parameters of [2 x double], using the immediate-value
4642 /// parameter as a specifier.
4644 /// \headerfile <x86intrin.h>
4647 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
4650 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
4653 /// A 128-bit vector of [2 x double].
4655 /// A 128-bit vector of [2 x double].
4657 /// An 8-bit immediate value. The least significant two bits specify which
4658 /// elements to copy from \a a and \a b: \n
4659 /// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
4660 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
4661 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
4662 /// Bit[1] = 1: upper element of \a b copied to upper element of result.
4663 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
4664 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \
4665 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4666 0 + (((i) >> 0) & 0x1), \
4667 2 + (((i) >> 1) & 0x1)); })
4669 /// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4670 /// floating-point vector of [4 x float].
4672 /// \headerfile <x86intrin.h>
4674 /// This intrinsic has no corresponding instruction.
4677 /// A 128-bit floating-point vector of [2 x double].
4678 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4679 /// bitwise pattern as the parameter (the element values are not converted).
4680 static __inline__ __m128 __DEFAULT_FN_ATTRS
4681 _mm_castpd_ps(__m128d __a)
4686 /// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4689 /// \headerfile <x86intrin.h>
4691 /// This intrinsic has no corresponding instruction.
4694 /// A 128-bit floating-point vector of [2 x double].
4695 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4697 static __inline__ __m128i __DEFAULT_FN_ATTRS
4698 _mm_castpd_si128(__m128d __a)
4700 return (__m128i)__a; /* same 128 bits viewed as an integer vector; values are not converted */
4703 /// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4704 /// floating-point vector of [2 x double].
4706 /// \headerfile <x86intrin.h>
4708 /// This intrinsic has no corresponding instruction.
4711 /// A 128-bit floating-point vector of [4 x float].
4712 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4713 /// bitwise pattern as the parameter.
4714 static __inline__ __m128d __DEFAULT_FN_ATTRS
4715 _mm_castps_pd(__m128 __a)
4717 return (__m128d)__a; /* same 128 bits viewed as [2 x double]; values are not converted */
4720 /// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4723 /// \headerfile <x86intrin.h>
4725 /// This intrinsic has no corresponding instruction.
4728 /// A 128-bit floating-point vector of [4 x float].
4729 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4731 static __inline__ __m128i __DEFAULT_FN_ATTRS
4732 _mm_castps_si128(__m128 __a)
4734 return (__m128i)__a; /* same 128 bits viewed as an integer vector; values are not converted */
4737 /// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
4740 /// \headerfile <x86intrin.h>
4742 /// This intrinsic has no corresponding instruction.
4745 /// A 128-bit integer vector.
4746 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4747 /// bitwise pattern as the parameter (the element values are not converted).
4748 static __inline__ __m128 __DEFAULT_FN_ATTRS
4749 _mm_castsi128_ps(__m128i __a)
4754 /// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
4755 /// of [2 x double].
4757 /// \headerfile <x86intrin.h>
4759 /// This intrinsic has no corresponding instruction.
4762 /// A 128-bit integer vector.
4763 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4764 /// bitwise pattern as the parameter.
4765 static __inline__ __m128d __DEFAULT_FN_ATTRS
4766 _mm_castsi128_pd(__m128i __a)
4768 return (__m128d)__a; /* same 128 bits viewed as [2 x double]; values are not converted */
4771 #if defined(__cplusplus)
4775 /// \brief Indicates that a spin loop is being executed for the purposes of
4776 /// optimizing power consumption during the loop.
4778 /// \headerfile <x86intrin.h>
4780 /// This intrinsic corresponds to the <c> PAUSE </c> instruction.
4782 void _mm_pause(void); /* prototype only: takes no arguments and returns no value */
4784 #if defined(__cplusplus)
4787 #undef __DEFAULT_FN_ATTRS /* the attribute helper macro is removed here, before the header ends */
4789 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) /* packs x into bit 1 and y into bit 0, the two selector bits read by _mm_shuffle_pd */
4791 #endif /* __EMMINTRIN_H */