/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
/* Internal 128-bit vector views used for casts inside the intrinsics. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));

/* Public 128-bit vector of [4 x float], aligned to 16 bytes. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same element type and size as __m128, but declared with 1-byte alignment. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned view, used by the bitwise intrinsics. */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));

/* This header should only be included in a hosted environment as it depends on
 * a standard library to provide allocation routines. */
#include <mm_malloc.h>

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse"),            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"),        \
                 __min_vector_width__(64)))
34 /// Adds the 32-bit float values in the low-order bits of the operands.
36 /// \headerfile <x86intrin.h>
38 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
41 /// A 128-bit vector of [4 x float] containing one of the source operands.
42 /// The lower 32 bits of this operand are used in the calculation.
44 /// A 128-bit vector of [4 x float] containing one of the source operands.
45 /// The lower 32 bits of this operand are used in the calculation.
46 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
47 /// of the lower 32 bits of both operands. The upper 96 bits are copied from
48 /// the upper 96 bits of the first source operand.
49 static __inline__ __m128 __DEFAULT_FN_ATTRS
50 _mm_add_ss(__m128 __a, __m128 __b)
56 /// Adds two 128-bit vectors of [4 x float], and returns the results of
59 /// \headerfile <x86intrin.h>
61 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
64 /// A 128-bit vector of [4 x float] containing one of the source operands.
66 /// A 128-bit vector of [4 x float] containing one of the source operands.
67 /// \returns A 128-bit vector of [4 x float] containing the sums of both
69 static __inline__ __m128 __DEFAULT_FN_ATTRS
70 _mm_add_ps(__m128 __a, __m128 __b)
72 return (__m128)((__v4sf)__a + (__v4sf)__b);
75 /// Subtracts the 32-bit float value in the low-order bits of the second
76 /// operand from the corresponding value in the first operand.
78 /// \headerfile <x86intrin.h>
80 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
83 /// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
84 /// of this operand are used in the calculation.
86 /// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
87 /// bits of this operand are used in the calculation.
88 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
89 /// difference of the lower 32 bits of both operands. The upper 96 bits are
90 /// copied from the upper 96 bits of the first source operand.
91 static __inline__ __m128 __DEFAULT_FN_ATTRS
92 _mm_sub_ss(__m128 __a, __m128 __b)
98 /// Subtracts each of the values of the second operand from the first
99 /// operand, both of which are 128-bit vectors of [4 x float] and returns
100 /// the results of the subtraction.
102 /// \headerfile <x86intrin.h>
104 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
107 /// A 128-bit vector of [4 x float] containing the minuend.
109 /// A 128-bit vector of [4 x float] containing the subtrahend.
110 /// \returns A 128-bit vector of [4 x float] containing the differences between
112 static __inline__ __m128 __DEFAULT_FN_ATTRS
113 _mm_sub_ps(__m128 __a, __m128 __b)
115 return (__m128)((__v4sf)__a - (__v4sf)__b);
118 /// Multiplies two 32-bit float values in the low-order bits of the
121 /// \headerfile <x86intrin.h>
123 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
126 /// A 128-bit vector of [4 x float] containing one of the source operands.
127 /// The lower 32 bits of this operand are used in the calculation.
129 /// A 128-bit vector of [4 x float] containing one of the source operands.
130 /// The lower 32 bits of this operand are used in the calculation.
131 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
132 /// 32 bits of both operands. The upper 96 bits are copied from the upper 96
133 /// bits of the first source operand.
134 static __inline__ __m128 __DEFAULT_FN_ATTRS
135 _mm_mul_ss(__m128 __a, __m128 __b)
141 /// Multiplies two 128-bit vectors of [4 x float] and returns the
142 /// results of the multiplication.
144 /// \headerfile <x86intrin.h>
146 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
149 /// A 128-bit vector of [4 x float] containing one of the source operands.
151 /// A 128-bit vector of [4 x float] containing one of the source operands.
152 /// \returns A 128-bit vector of [4 x float] containing the products of both
154 static __inline__ __m128 __DEFAULT_FN_ATTRS
155 _mm_mul_ps(__m128 __a, __m128 __b)
157 return (__m128)((__v4sf)__a * (__v4sf)__b);
160 /// Divides the value in the low-order 32 bits of the first operand by
161 /// the corresponding value in the second operand.
163 /// \headerfile <x86intrin.h>
165 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
168 /// A 128-bit vector of [4 x float] containing the dividend. The lower 32
169 /// bits of this operand are used in the calculation.
171 /// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
172 /// of this operand are used in the calculation.
173 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
174 /// lower 32 bits of both operands. The upper 96 bits are copied from the
175 /// upper 96 bits of the first source operand.
176 static __inline__ __m128 __DEFAULT_FN_ATTRS
177 _mm_div_ss(__m128 __a, __m128 __b)
183 /// Divides two 128-bit vectors of [4 x float].
185 /// \headerfile <x86intrin.h>
187 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
190 /// A 128-bit vector of [4 x float] containing the dividend.
192 /// A 128-bit vector of [4 x float] containing the divisor.
193 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
195 static __inline__ __m128 __DEFAULT_FN_ATTRS
196 _mm_div_ps(__m128 __a, __m128 __b)
198 return (__m128)((__v4sf)__a / (__v4sf)__b);
201 /// Calculates the square root of the value stored in the low-order bits
202 /// of a 128-bit vector of [4 x float].
204 /// \headerfile <x86intrin.h>
206 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
209 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
210 /// used in the calculation.
211 /// \returns A 128-bit vector of [4 x float] containing the square root of the
212 /// value in the low-order bits of the operand.
213 static __inline__ __m128 __DEFAULT_FN_ATTRS
214 _mm_sqrt_ss(__m128 __a)
216 return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
219 /// Calculates the square roots of the values stored in a 128-bit vector
222 /// \headerfile <x86intrin.h>
224 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
227 /// A 128-bit vector of [4 x float].
228 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
229 /// values in the operand.
230 static __inline__ __m128 __DEFAULT_FN_ATTRS
231 _mm_sqrt_ps(__m128 __a)
233 return __builtin_ia32_sqrtps((__v4sf)__a);
236 /// Calculates the approximate reciprocal of the value stored in the
237 /// low-order bits of a 128-bit vector of [4 x float].
239 /// \headerfile <x86intrin.h>
241 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
244 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
245 /// used in the calculation.
246 /// \returns A 128-bit vector of [4 x float] containing the approximate
247 /// reciprocal of the value in the low-order bits of the operand.
248 static __inline__ __m128 __DEFAULT_FN_ATTRS
249 _mm_rcp_ss(__m128 __a)
251 return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
254 /// Calculates the approximate reciprocals of the values stored in a
255 /// 128-bit vector of [4 x float].
257 /// \headerfile <x86intrin.h>
259 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
262 /// A 128-bit vector of [4 x float].
263 /// \returns A 128-bit vector of [4 x float] containing the approximate
264 /// reciprocals of the values in the operand.
265 static __inline__ __m128 __DEFAULT_FN_ATTRS
266 _mm_rcp_ps(__m128 __a)
268 return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
271 /// Calculates the approximate reciprocal of the square root of the value
272 /// stored in the low-order bits of a 128-bit vector of [4 x float].
274 /// \headerfile <x86intrin.h>
276 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
279 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
280 /// used in the calculation.
281 /// \returns A 128-bit vector of [4 x float] containing the approximate
282 /// reciprocal of the square root of the value in the low-order bits of the
284 static __inline__ __m128 __DEFAULT_FN_ATTRS
285 _mm_rsqrt_ss(__m128 __a)
287 return __builtin_ia32_rsqrtss((__v4sf)__a);
290 /// Calculates the approximate reciprocals of the square roots of the
291 /// values stored in a 128-bit vector of [4 x float].
293 /// \headerfile <x86intrin.h>
295 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
298 /// A 128-bit vector of [4 x float].
299 /// \returns A 128-bit vector of [4 x float] containing the approximate
300 /// reciprocals of the square roots of the values in the operand.
301 static __inline__ __m128 __DEFAULT_FN_ATTRS
302 _mm_rsqrt_ps(__m128 __a)
304 return __builtin_ia32_rsqrtps((__v4sf)__a);
307 /// Compares two 32-bit float values in the low-order bits of both
308 /// operands and returns the lesser value in the low-order bits of the
309 /// vector of [4 x float].
311 /// \headerfile <x86intrin.h>
313 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
316 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
317 /// 32 bits of this operand are used in the comparison.
319 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
320 /// 32 bits of this operand are used in the comparison.
321 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
322 /// minimum value between both operands. The upper 96 bits are copied from
323 /// the upper 96 bits of the first source operand.
324 static __inline__ __m128 __DEFAULT_FN_ATTRS
325 _mm_min_ss(__m128 __a, __m128 __b)
327 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
330 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
331 /// of each pair of values.
333 /// \headerfile <x86intrin.h>
335 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
338 /// A 128-bit vector of [4 x float] containing one of the operands.
340 /// A 128-bit vector of [4 x float] containing one of the operands.
341 /// \returns A 128-bit vector of [4 x float] containing the minimum values
342 /// between both operands.
343 static __inline__ __m128 __DEFAULT_FN_ATTRS
344 _mm_min_ps(__m128 __a, __m128 __b)
346 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
349 /// Compares two 32-bit float values in the low-order bits of both
350 /// operands and returns the greater value in the low-order bits of a 128-bit
351 /// vector of [4 x float].
353 /// \headerfile <x86intrin.h>
355 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
358 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
359 /// 32 bits of this operand are used in the comparison.
361 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
362 /// 32 bits of this operand are used in the comparison.
363 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
364 /// maximum value between both operands. The upper 96 bits are copied from
365 /// the upper 96 bits of the first source operand.
366 static __inline__ __m128 __DEFAULT_FN_ATTRS
367 _mm_max_ss(__m128 __a, __m128 __b)
369 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
372 /// Compares two 128-bit vectors of [4 x float] and returns the greater
373 /// of each pair of values.
375 /// \headerfile <x86intrin.h>
377 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
380 /// A 128-bit vector of [4 x float] containing one of the operands.
382 /// A 128-bit vector of [4 x float] containing one of the operands.
383 /// \returns A 128-bit vector of [4 x float] containing the maximum values
384 /// between both operands.
385 static __inline__ __m128 __DEFAULT_FN_ATTRS
386 _mm_max_ps(__m128 __a, __m128 __b)
388 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
391 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
393 /// \headerfile <x86intrin.h>
395 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
398 /// A 128-bit vector containing one of the source operands.
400 /// A 128-bit vector containing one of the source operands.
401 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
402 /// values between both operands.
403 static __inline__ __m128 __DEFAULT_FN_ATTRS
404 _mm_and_ps(__m128 __a, __m128 __b)
406 return (__m128)((__v4su)__a & (__v4su)__b);
409 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
410 /// the one's complement of the values contained in the first source
413 /// \headerfile <x86intrin.h>
415 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
418 /// A 128-bit vector of [4 x float] containing the first source operand. The
419 /// one's complement of this value is used in the bitwise AND.
421 /// A 128-bit vector of [4 x float] containing the second source operand.
422 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
423 /// one's complement of the first operand and the values in the second
425 static __inline__ __m128 __DEFAULT_FN_ATTRS
426 _mm_andnot_ps(__m128 __a, __m128 __b)
428 return (__m128)(~(__v4su)__a & (__v4su)__b);
431 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
433 /// \headerfile <x86intrin.h>
435 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
438 /// A 128-bit vector of [4 x float] containing one of the source operands.
440 /// A 128-bit vector of [4 x float] containing one of the source operands.
441 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
442 /// values between both operands.
443 static __inline__ __m128 __DEFAULT_FN_ATTRS
444 _mm_or_ps(__m128 __a, __m128 __b)
446 return (__m128)((__v4su)__a | (__v4su)__b);
449 /// Performs a bitwise exclusive OR of two 128-bit vectors of
452 /// \headerfile <x86intrin.h>
454 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
457 /// A 128-bit vector of [4 x float] containing one of the source operands.
459 /// A 128-bit vector of [4 x float] containing one of the source operands.
460 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
461 /// of the values between both operands.
462 static __inline__ __m128 __DEFAULT_FN_ATTRS
463 _mm_xor_ps(__m128 __a, __m128 __b)
465 return (__m128)((__v4su)__a ^ (__v4su)__b);
468 /// Compares two 32-bit float values in the low-order bits of both
469 /// operands for equality and returns the result of the comparison in the
470 /// low-order bits of a vector [4 x float].
472 /// \headerfile <x86intrin.h>
474 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
477 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
478 /// 32 bits of this operand are used in the comparison.
480 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
481 /// 32 bits of this operand are used in the comparison.
482 /// \returns A 128-bit vector of [4 x float] containing the comparison results
483 /// in the low-order bits.
484 static __inline__ __m128 __DEFAULT_FN_ATTRS
485 _mm_cmpeq_ss(__m128 __a, __m128 __b)
487 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
490 /// Compares each of the corresponding 32-bit float values of the
491 /// 128-bit vectors of [4 x float] for equality.
493 /// \headerfile <x86intrin.h>
495 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
498 /// A 128-bit vector of [4 x float].
500 /// A 128-bit vector of [4 x float].
501 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
502 static __inline__ __m128 __DEFAULT_FN_ATTRS
503 _mm_cmpeq_ps(__m128 __a, __m128 __b)
505 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
508 /// Compares two 32-bit float values in the low-order bits of both
509 /// operands to determine if the value in the first operand is less than the
510 /// corresponding value in the second operand and returns the result of the
511 /// comparison in the low-order bits of a vector of [4 x float].
513 /// \headerfile <x86intrin.h>
515 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
518 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
519 /// 32 bits of this operand are used in the comparison.
521 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
522 /// 32 bits of this operand are used in the comparison.
523 /// \returns A 128-bit vector of [4 x float] containing the comparison results
524 /// in the low-order bits.
525 static __inline__ __m128 __DEFAULT_FN_ATTRS
526 _mm_cmplt_ss(__m128 __a, __m128 __b)
528 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
531 /// Compares each of the corresponding 32-bit float values of the
532 /// 128-bit vectors of [4 x float] to determine if the values in the first
533 /// operand are less than those in the second operand.
535 /// \headerfile <x86intrin.h>
537 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
540 /// A 128-bit vector of [4 x float].
542 /// A 128-bit vector of [4 x float].
543 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
544 static __inline__ __m128 __DEFAULT_FN_ATTRS
545 _mm_cmplt_ps(__m128 __a, __m128 __b)
547 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
550 /// Compares two 32-bit float values in the low-order bits of both
551 /// operands to determine if the value in the first operand is less than or
552 /// equal to the corresponding value in the second operand and returns the
553 /// result of the comparison in the low-order bits of a vector of
556 /// \headerfile <x86intrin.h>
558 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
561 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
562 /// 32 bits of this operand are used in the comparison.
564 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
565 /// 32 bits of this operand are used in the comparison.
566 /// \returns A 128-bit vector of [4 x float] containing the comparison results
567 /// in the low-order bits.
568 static __inline__ __m128 __DEFAULT_FN_ATTRS
569 _mm_cmple_ss(__m128 __a, __m128 __b)
571 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
574 /// Compares each of the corresponding 32-bit float values of the
575 /// 128-bit vectors of [4 x float] to determine if the values in the first
576 /// operand are less than or equal to those in the second operand.
578 /// \headerfile <x86intrin.h>
580 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
583 /// A 128-bit vector of [4 x float].
585 /// A 128-bit vector of [4 x float].
586 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
587 static __inline__ __m128 __DEFAULT_FN_ATTRS
588 _mm_cmple_ps(__m128 __a, __m128 __b)
590 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
593 /// Compares two 32-bit float values in the low-order bits of both
594 /// operands to determine if the value in the first operand is greater than
595 /// the corresponding value in the second operand and returns the result of
596 /// the comparison in the low-order bits of a vector of [4 x float].
598 /// \headerfile <x86intrin.h>
600 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
603 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
604 /// 32 bits of this operand are used in the comparison.
606 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
607 /// 32 bits of this operand are used in the comparison.
608 /// \returns A 128-bit vector of [4 x float] containing the comparison results
609 /// in the low-order bits.
610 static __inline__ __m128 __DEFAULT_FN_ATTRS
611 _mm_cmpgt_ss(__m128 __a, __m128 __b)
613 return (__m128)__builtin_shufflevector((__v4sf)__a,
614 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
618 /// Compares each of the corresponding 32-bit float values of the
619 /// 128-bit vectors of [4 x float] to determine if the values in the first
620 /// operand are greater than those in the second operand.
622 /// \headerfile <x86intrin.h>
624 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
627 /// A 128-bit vector of [4 x float].
629 /// A 128-bit vector of [4 x float].
630 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
631 static __inline__ __m128 __DEFAULT_FN_ATTRS
632 _mm_cmpgt_ps(__m128 __a, __m128 __b)
634 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
637 /// Compares two 32-bit float values in the low-order bits of both
638 /// operands to determine if the value in the first operand is greater than
639 /// or equal to the corresponding value in the second operand and returns
640 /// the result of the comparison in the low-order bits of a vector of
643 /// \headerfile <x86intrin.h>
645 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
648 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
649 /// 32 bits of this operand are used in the comparison.
651 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
652 /// 32 bits of this operand are used in the comparison.
653 /// \returns A 128-bit vector of [4 x float] containing the comparison results
654 /// in the low-order bits.
655 static __inline__ __m128 __DEFAULT_FN_ATTRS
656 _mm_cmpge_ss(__m128 __a, __m128 __b)
658 return (__m128)__builtin_shufflevector((__v4sf)__a,
659 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
663 /// Compares each of the corresponding 32-bit float values of the
664 /// 128-bit vectors of [4 x float] to determine if the values in the first
665 /// operand are greater than or equal to those in the second operand.
667 /// \headerfile <x86intrin.h>
669 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
672 /// A 128-bit vector of [4 x float].
674 /// A 128-bit vector of [4 x float].
675 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
676 static __inline__ __m128 __DEFAULT_FN_ATTRS
677 _mm_cmpge_ps(__m128 __a, __m128 __b)
679 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
682 /// Compares two 32-bit float values in the low-order bits of both
683 /// operands for inequality and returns the result of the comparison in the
684 /// low-order bits of a vector of [4 x float].
686 /// \headerfile <x86intrin.h>
688 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
692 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
693 /// 32 bits of this operand are used in the comparison.
695 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
696 /// 32 bits of this operand are used in the comparison.
697 /// \returns A 128-bit vector of [4 x float] containing the comparison results
698 /// in the low-order bits.
699 static __inline__ __m128 __DEFAULT_FN_ATTRS
700 _mm_cmpneq_ss(__m128 __a, __m128 __b)
702 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
705 /// Compares each of the corresponding 32-bit float values of the
706 /// 128-bit vectors of [4 x float] for inequality.
708 /// \headerfile <x86intrin.h>
710 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
714 /// A 128-bit vector of [4 x float].
716 /// A 128-bit vector of [4 x float].
717 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
718 static __inline__ __m128 __DEFAULT_FN_ATTRS
719 _mm_cmpneq_ps(__m128 __a, __m128 __b)
721 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
724 /// Compares two 32-bit float values in the low-order bits of both
725 /// operands to determine if the value in the first operand is not less than
726 /// the corresponding value in the second operand and returns the result of
727 /// the comparison in the low-order bits of a vector of [4 x float].
729 /// \headerfile <x86intrin.h>
731 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
735 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
736 /// 32 bits of this operand are used in the comparison.
738 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
739 /// 32 bits of this operand are used in the comparison.
740 /// \returns A 128-bit vector of [4 x float] containing the comparison results
741 /// in the low-order bits.
742 static __inline__ __m128 __DEFAULT_FN_ATTRS
743 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
745 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
748 /// Compares each of the corresponding 32-bit float values of the
749 /// 128-bit vectors of [4 x float] to determine if the values in the first
750 /// operand are not less than those in the second operand.
752 /// \headerfile <x86intrin.h>
754 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
758 /// A 128-bit vector of [4 x float].
760 /// A 128-bit vector of [4 x float].
761 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
762 static __inline__ __m128 __DEFAULT_FN_ATTRS
763 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
765 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
768 /// Compares two 32-bit float values in the low-order bits of both
769 /// operands to determine if the value in the first operand is not less than
770 /// or equal to the corresponding value in the second operand and returns
771 /// the result of the comparison in the low-order bits of a vector of
774 /// \headerfile <x86intrin.h>
776 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
780 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
781 /// 32 bits of this operand are used in the comparison.
783 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
784 /// 32 bits of this operand are used in the comparison.
785 /// \returns A 128-bit vector of [4 x float] containing the comparison results
786 /// in the low-order bits.
787 static __inline__ __m128 __DEFAULT_FN_ATTRS
788 _mm_cmpnle_ss(__m128 __a, __m128 __b)
790 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
793 /// Compares each of the corresponding 32-bit float values of the
794 /// 128-bit vectors of [4 x float] to determine if the values in the first
795 /// operand are not less than or equal to those in the second operand.
797 /// \headerfile <x86intrin.h>
799 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
803 /// A 128-bit vector of [4 x float].
805 /// A 128-bit vector of [4 x float].
806 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
807 static __inline__ __m128 __DEFAULT_FN_ATTRS
808 _mm_cmpnle_ps(__m128 __a, __m128 __b)
810 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
813 /// Compares two 32-bit float values in the low-order bits of both
814 /// operands to determine if the value in the first operand is not greater
815 /// than the corresponding value in the second operand and returns the
816 /// result of the comparison in the low-order bits of a vector of
819 /// \headerfile <x86intrin.h>
821 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
825 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
826 /// 32 bits of this operand are used in the comparison.
828 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
829 /// 32 bits of this operand are used in the comparison.
830 /// \returns A 128-bit vector of [4 x float] containing the comparison results
831 /// in the low-order bits.
832 static __inline__ __m128 __DEFAULT_FN_ATTRS
833 _mm_cmpngt_ss(__m128 __a, __m128 __b)
835 return (__m128)__builtin_shufflevector((__v4sf)__a,
836 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
840 /// Compares each of the corresponding 32-bit float values of the
841 /// 128-bit vectors of [4 x float] to determine if the values in the first
842 /// operand are not greater than those in the second operand.
844 /// \headerfile <x86intrin.h>
846 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
850 /// A 128-bit vector of [4 x float].
852 /// A 128-bit vector of [4 x float].
853 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
854 static __inline__ __m128 __DEFAULT_FN_ATTRS
855 _mm_cmpngt_ps(__m128 __a, __m128 __b)
857 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
860 /// Compares two 32-bit float values in the low-order bits of both
861 /// operands to determine if the value in the first operand is not greater
862 /// than or equal to the corresponding value in the second operand and
863 /// returns the result of the comparison in the low-order bits of a vector
866 /// \headerfile <x86intrin.h>
868 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
872 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
873 /// 32 bits of this operand are used in the comparison.
875 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
876 /// 32 bits of this operand are used in the comparison.
877 /// \returns A 128-bit vector of [4 x float] containing the comparison results
878 /// in the low-order bits.
879 static __inline__ __m128 __DEFAULT_FN_ATTRS
880 _mm_cmpnge_ss(__m128 __a, __m128 __b)
882 return (__m128)__builtin_shufflevector((__v4sf)__a,
883 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
887 /// Compares each of the corresponding 32-bit float values of the
888 /// 128-bit vectors of [4 x float] to determine if the values in the first
889 /// operand are not greater than or equal to those in the second operand.
891 /// \headerfile <x86intrin.h>
893 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
897 /// A 128-bit vector of [4 x float].
899 /// A 128-bit vector of [4 x float].
900 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
901 static __inline__ __m128 __DEFAULT_FN_ATTRS
902 _mm_cmpnge_ps(__m128 __a, __m128 __b)
904 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
907 /// Compares two 32-bit float values in the low-order bits of both
908 /// operands to determine if the value in the first operand is ordered with
909 /// respect to the corresponding value in the second operand and returns the
/// result of the comparison in the low-order bits of a vector of
/// [4 x float].
913 /// \headerfile <x86intrin.h>
915 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
919 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
920 /// 32 bits of this operand are used in the comparison.
922 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
923 /// 32 bits of this operand are used in the comparison.
924 /// \returns A 128-bit vector of [4 x float] containing the comparison results
925 /// in the low-order bits.
926 static __inline__ __m128 __DEFAULT_FN_ATTRS
927 _mm_cmpord_ss(__m128 __a, __m128 __b)
929 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
932 /// Compares each of the corresponding 32-bit float values of the
933 /// 128-bit vectors of [4 x float] to determine if the values in the first
934 /// operand are ordered with respect to those in the second operand.
936 /// \headerfile <x86intrin.h>
938 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
942 /// A 128-bit vector of [4 x float].
944 /// A 128-bit vector of [4 x float].
945 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
946 static __inline__ __m128 __DEFAULT_FN_ATTRS
947 _mm_cmpord_ps(__m128 __a, __m128 __b)
949 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
952 /// Compares two 32-bit float values in the low-order bits of both
953 /// operands to determine if the value in the first operand is unordered
954 /// with respect to the corresponding value in the second operand and
/// returns the result of the comparison in the low-order bits of a vector
/// of [4 x float].
958 /// \headerfile <x86intrin.h>
960 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
964 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
965 /// 32 bits of this operand are used in the comparison.
967 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
968 /// 32 bits of this operand are used in the comparison.
969 /// \returns A 128-bit vector of [4 x float] containing the comparison results
970 /// in the low-order bits.
971 static __inline__ __m128 __DEFAULT_FN_ATTRS
972 _mm_cmpunord_ss(__m128 __a, __m128 __b)
974 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
977 /// Compares each of the corresponding 32-bit float values of the
978 /// 128-bit vectors of [4 x float] to determine if the values in the first
979 /// operand are unordered with respect to those in the second operand.
981 /// \headerfile <x86intrin.h>
983 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
987 /// A 128-bit vector of [4 x float].
989 /// A 128-bit vector of [4 x float].
990 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
991 static __inline__ __m128 __DEFAULT_FN_ATTRS
992 _mm_cmpunord_ps(__m128 __a, __m128 __b)
994 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
997 /// Compares two 32-bit float values in the low-order bits of both
998 /// operands for equality and returns the result of the comparison.
1000 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1002 /// \headerfile <x86intrin.h>
1004 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1008 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1009 /// used in the comparison.
1011 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1012 /// used in the comparison.
1013 /// \returns An integer containing the comparison results. If either of the
1014 /// two lower 32-bit values is NaN, 0 is returned.
1015 static __inline__ int __DEFAULT_FN_ATTRS
1016 _mm_comieq_ss(__m128 __a, __m128 __b)
1018 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1021 /// Compares two 32-bit float values in the low-order bits of both
1022 /// operands to determine if the first operand is less than the second
1023 /// operand and returns the result of the comparison.
1025 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1027 /// \headerfile <x86intrin.h>
1029 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1033 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1034 /// used in the comparison.
1036 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1037 /// used in the comparison.
1038 /// \returns An integer containing the comparison results. If either of the two
1039 /// lower 32-bit values is NaN, 0 is returned.
1040 static __inline__ int __DEFAULT_FN_ATTRS
1041 _mm_comilt_ss(__m128 __a, __m128 __b)
1043 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1046 /// Compares two 32-bit float values in the low-order bits of both
1047 /// operands to determine if the first operand is less than or equal to the
1048 /// second operand and returns the result of the comparison.
1050 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1052 /// \headerfile <x86intrin.h>
1054 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1057 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1058 /// used in the comparison.
1060 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1061 /// used in the comparison.
1062 /// \returns An integer containing the comparison results. If either of the two
1063 /// lower 32-bit values is NaN, 0 is returned.
1064 static __inline__ int __DEFAULT_FN_ATTRS
1065 _mm_comile_ss(__m128 __a, __m128 __b)
1067 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1070 /// Compares two 32-bit float values in the low-order bits of both
1071 /// operands to determine if the first operand is greater than the second
1072 /// operand and returns the result of the comparison.
1074 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1076 /// \headerfile <x86intrin.h>
1078 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1081 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1082 /// used in the comparison.
1084 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1085 /// used in the comparison.
1086 /// \returns An integer containing the comparison results. If either of the
1087 /// two lower 32-bit values is NaN, 0 is returned.
1088 static __inline__ int __DEFAULT_FN_ATTRS
1089 _mm_comigt_ss(__m128 __a, __m128 __b)
1091 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1094 /// Compares two 32-bit float values in the low-order bits of both
1095 /// operands to determine if the first operand is greater than or equal to
1096 /// the second operand and returns the result of the comparison.
1098 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1100 /// \headerfile <x86intrin.h>
1102 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1105 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1106 /// used in the comparison.
1108 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1109 /// used in the comparison.
1110 /// \returns An integer containing the comparison results. If either of the two
1111 /// lower 32-bit values is NaN, 0 is returned.
1112 static __inline__ int __DEFAULT_FN_ATTRS
1113 _mm_comige_ss(__m128 __a, __m128 __b)
1115 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1118 /// Compares two 32-bit float values in the low-order bits of both
1119 /// operands to determine if the first operand is not equal to the second
1120 /// operand and returns the result of the comparison.
1122 /// If either of the two lower 32-bit values is NaN, 1 is returned.
1124 /// \headerfile <x86intrin.h>
1126 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1129 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1130 /// used in the comparison.
1132 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1133 /// used in the comparison.
1134 /// \returns An integer containing the comparison results. If either of the
1135 /// two lower 32-bit values is NaN, 1 is returned.
1136 static __inline__ int __DEFAULT_FN_ATTRS
1137 _mm_comineq_ss(__m128 __a, __m128 __b)
1139 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1142 /// Performs an unordered comparison of two 32-bit float values using
1143 /// the low-order bits of both operands to determine equality and returns
1144 /// the result of the comparison.
1146 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1148 /// \headerfile <x86intrin.h>
1150 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1153 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1154 /// used in the comparison.
1156 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1157 /// used in the comparison.
1158 /// \returns An integer containing the comparison results. If either of the two
1159 /// lower 32-bit values is NaN, 0 is returned.
1160 static __inline__ int __DEFAULT_FN_ATTRS
1161 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1163 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1166 /// Performs an unordered comparison of two 32-bit float values using
1167 /// the low-order bits of both operands to determine if the first operand is
1168 /// less than the second operand and returns the result of the comparison.
1170 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1172 /// \headerfile <x86intrin.h>
1174 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1177 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1178 /// used in the comparison.
1180 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1181 /// used in the comparison.
1182 /// \returns An integer containing the comparison results. If either of the two
1183 /// lower 32-bit values is NaN, 0 is returned.
1184 static __inline__ int __DEFAULT_FN_ATTRS
1185 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1187 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1190 /// Performs an unordered comparison of two 32-bit float values using
1191 /// the low-order bits of both operands to determine if the first operand is
/// less than or equal to the second operand and returns the result of the
/// comparison.
1195 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1197 /// \headerfile <x86intrin.h>
1199 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1202 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1203 /// used in the comparison.
1205 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1206 /// used in the comparison.
1207 /// \returns An integer containing the comparison results. If either of the two
1208 /// lower 32-bit values is NaN, 0 is returned.
1209 static __inline__ int __DEFAULT_FN_ATTRS
1210 _mm_ucomile_ss(__m128 __a, __m128 __b)
1212 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1215 /// Performs an unordered comparison of two 32-bit float values using
1216 /// the low-order bits of both operands to determine if the first operand is
/// greater than the second operand and returns the result of the
/// comparison.
1220 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1222 /// \headerfile <x86intrin.h>
1224 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1227 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1228 /// used in the comparison.
1230 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1231 /// used in the comparison.
1232 /// \returns An integer containing the comparison results. If either of the two
1233 /// lower 32-bit values is NaN, 0 is returned.
1234 static __inline__ int __DEFAULT_FN_ATTRS
1235 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1237 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1240 /// Performs an unordered comparison of two 32-bit float values using
1241 /// the low-order bits of both operands to determine if the first operand is
/// greater than or equal to the second operand and returns the result of
/// the comparison.
1245 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1247 /// \headerfile <x86intrin.h>
1249 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1252 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1253 /// used in the comparison.
1255 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1256 /// used in the comparison.
1257 /// \returns An integer containing the comparison results. If either of the two
1258 /// lower 32-bit values is NaN, 0 is returned.
1259 static __inline__ int __DEFAULT_FN_ATTRS
1260 _mm_ucomige_ss(__m128 __a, __m128 __b)
1262 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1265 /// Performs an unordered comparison of two 32-bit float values using
1266 /// the low-order bits of both operands to determine inequality and returns
1267 /// the result of the comparison.
1269 /// If either of the two lower 32-bit values is NaN, 1 is returned.
1271 /// \headerfile <x86intrin.h>
1273 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1276 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1277 /// used in the comparison.
1279 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1280 /// used in the comparison.
1281 /// \returns An integer containing the comparison results. If either of the two
1282 /// lower 32-bit values is NaN, 1 is returned.
1283 static __inline__ int __DEFAULT_FN_ATTRS
1284 _mm_ucomineq_ss(__m128 __a, __m128 __b)
1286 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1289 /// Converts a float value contained in the lower 32 bits of a vector of
1290 /// [4 x float] into a 32-bit integer.
1292 /// \headerfile <x86intrin.h>
1294 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1298 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1299 /// used in the conversion.
1300 /// \returns A 32-bit integer containing the converted value.
1301 static __inline__ int __DEFAULT_FN_ATTRS
1302 _mm_cvtss_si32(__m128 __a)
1304 return __builtin_ia32_cvtss2si((__v4sf)__a);
1307 /// Converts a float value contained in the lower 32 bits of a vector of
1308 /// [4 x float] into a 32-bit integer.
1310 /// \headerfile <x86intrin.h>
1312 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1316 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1317 /// used in the conversion.
1318 /// \returns A 32-bit integer containing the converted value.
1319 static __inline__ int __DEFAULT_FN_ATTRS
1320 _mm_cvt_ss2si(__m128 __a)
1322 return _mm_cvtss_si32(__a);
1327 /// Converts a float value contained in the lower 32 bits of a vector of
1328 /// [4 x float] into a 64-bit integer.
1330 /// \headerfile <x86intrin.h>
1332 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1336 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1337 /// used in the conversion.
1338 /// \returns A 64-bit integer containing the converted value.
1339 static __inline__ long long __DEFAULT_FN_ATTRS
1340 _mm_cvtss_si64(__m128 __a)
1342 return __builtin_ia32_cvtss2si64((__v4sf)__a);
1347 /// Converts two low-order float values in a 128-bit vector of
1348 /// [4 x float] into a 64-bit vector of [2 x i32].
1350 /// \headerfile <x86intrin.h>
1352 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1355 /// A 128-bit vector of [4 x float].
1356 /// \returns A 64-bit integer vector containing the converted values.
1357 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1358 _mm_cvtps_pi32(__m128 __a)
1360 return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1363 /// Converts two low-order float values in a 128-bit vector of
1364 /// [4 x float] into a 64-bit vector of [2 x i32].
1366 /// \headerfile <x86intrin.h>
1368 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1371 /// A 128-bit vector of [4 x float].
1372 /// \returns A 64-bit integer vector containing the converted values.
1373 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1374 _mm_cvt_ps2pi(__m128 __a)
1376 return _mm_cvtps_pi32(__a);
1379 /// Converts a float value contained in the lower 32 bits of a vector of
/// [4 x float] into a 32-bit integer, truncating the result when it is
/// inexact.
1383 /// \headerfile <x86intrin.h>
1385 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1389 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1390 /// used in the conversion.
1391 /// \returns A 32-bit integer containing the converted value.
1392 static __inline__ int __DEFAULT_FN_ATTRS
1393 _mm_cvttss_si32(__m128 __a)
1395 return __builtin_ia32_cvttss2si((__v4sf)__a);
1398 /// Converts a float value contained in the lower 32 bits of a vector of
/// [4 x float] into a 32-bit integer, truncating the result when it is
/// inexact.
1402 /// \headerfile <x86intrin.h>
1404 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1408 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1409 /// used in the conversion.
1410 /// \returns A 32-bit integer containing the converted value.
1411 static __inline__ int __DEFAULT_FN_ATTRS
1412 _mm_cvtt_ss2si(__m128 __a)
1414 return _mm_cvttss_si32(__a);
1418 /// Converts a float value contained in the lower 32 bits of a vector of
/// [4 x float] into a 64-bit integer, truncating the result when it is
/// inexact.
1422 /// \headerfile <x86intrin.h>
1424 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1428 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1429 /// used in the conversion.
1430 /// \returns A 64-bit integer containing the converted value.
1431 static __inline__ long long __DEFAULT_FN_ATTRS
1432 _mm_cvttss_si64(__m128 __a)
1434 return __builtin_ia32_cvttss2si64((__v4sf)__a);
1438 /// Converts two low-order float values in a 128-bit vector of
1439 /// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1440 /// when it is inexact.
1442 /// \headerfile <x86intrin.h>
/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1448 /// A 128-bit vector of [4 x float].
1449 /// \returns A 64-bit integer vector containing the converted values.
1450 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1451 _mm_cvttps_pi32(__m128 __a)
1453 return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1456 /// Converts two low-order float values in a 128-bit vector of [4 x
/// float] into a 64-bit vector of [2 x i32], truncating the result when it
/// is inexact.
1460 /// \headerfile <x86intrin.h>
1462 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1465 /// A 128-bit vector of [4 x float].
1466 /// \returns A 64-bit integer vector containing the converted values.
1467 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1468 _mm_cvtt_ps2pi(__m128 __a)
1470 return _mm_cvttps_pi32(__a);
1473 /// Converts a 32-bit signed integer value into a floating point value
1474 /// and writes it to the lower 32 bits of the destination. The remaining
1475 /// higher order elements of the destination vector are copied from the
1476 /// corresponding elements in the first operand.
1478 /// \headerfile <x86intrin.h>
1480 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1483 /// A 128-bit vector of [4 x float].
1485 /// A 32-bit signed integer operand containing the value to be converted.
1486 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1487 /// converted value of the second operand. The upper 96 bits are copied from
1488 /// the upper 96 bits of the first operand.
1489 static __inline__ __m128 __DEFAULT_FN_ATTRS
1490 _mm_cvtsi32_ss(__m128 __a, int __b)
1496 /// Converts a 32-bit signed integer value into a floating point value
1497 /// and writes it to the lower 32 bits of the destination. The remaining
1498 /// higher order elements of the destination are copied from the
1499 /// corresponding elements in the first operand.
1501 /// \headerfile <x86intrin.h>
1503 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1506 /// A 128-bit vector of [4 x float].
1508 /// A 32-bit signed integer operand containing the value to be converted.
1509 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1510 /// converted value of the second operand. The upper 96 bits are copied from
1511 /// the upper 96 bits of the first operand.
1512 static __inline__ __m128 __DEFAULT_FN_ATTRS
1513 _mm_cvt_si2ss(__m128 __a, int __b)
1515 return _mm_cvtsi32_ss(__a, __b);
1520 /// Converts a 64-bit signed integer value into a floating point value
1521 /// and writes it to the lower 32 bits of the destination. The remaining
1522 /// higher order elements of the destination are copied from the
1523 /// corresponding elements in the first operand.
1525 /// \headerfile <x86intrin.h>
1527 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1530 /// A 128-bit vector of [4 x float].
1532 /// A 64-bit signed integer operand containing the value to be converted.
1533 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1534 /// converted value of the second operand. The upper 96 bits are copied from
1535 /// the upper 96 bits of the first operand.
1536 static __inline__ __m128 __DEFAULT_FN_ATTRS
1537 _mm_cvtsi64_ss(__m128 __a, long long __b)
1545 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1546 /// floating point values and writes them to the lower 64-bits of the
1547 /// destination. The remaining higher order elements of the destination are
1548 /// copied from the corresponding elements in the first operand.
1550 /// \headerfile <x86intrin.h>
1552 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1555 /// A 128-bit vector of [4 x float].
1557 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1558 /// and written to the corresponding low-order elements in the destination.
1559 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1560 /// converted value of the second operand. The upper 64 bits are copied from
1561 /// the upper 64 bits of the first operand.
1562 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1563 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
1565 return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1568 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1569 /// floating point values and writes them to the lower 64-bits of the
1570 /// destination. The remaining higher order elements of the destination are
1571 /// copied from the corresponding elements in the first operand.
1573 /// \headerfile <x86intrin.h>
1575 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1578 /// A 128-bit vector of [4 x float].
1580 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1581 /// and written to the corresponding low-order elements in the destination.
1582 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1583 /// converted value from the second operand. The upper 64 bits are copied
1584 /// from the upper 64 bits of the first operand.
1585 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1586 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
1588 return _mm_cvtpi32_ps(__a, __b);
/// Extracts a float value contained in the lower 32 bits of a vector of
/// [4 x float].
1594 /// \headerfile <x86intrin.h>
1596 /// This intrinsic has no corresponding instruction.
1599 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1600 /// used in the extraction.
1601 /// \returns A 32-bit float containing the extracted value.
1602 static __inline__ float __DEFAULT_FN_ATTRS
1603 _mm_cvtss_f32(__m128 __a)
1608 /// Loads two packed float values from the address \a __p into the
1609 /// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1610 /// are copied from the low-order bits of the first operand.
1612 /// \headerfile <x86intrin.h>
/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
1617 /// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1618 /// of the destination.
1620 /// A pointer to two packed float values. Bits [63:0] are written to bits
1621 /// [127:64] of the destination.
1622 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1623 static __inline__ __m128 __DEFAULT_FN_ATTRS
1624 _mm_loadh_pi(__m128 __a, const __m64 *__p)
1626 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1627 struct __mm_loadh_pi_struct {
1628 __mm_loadh_pi_v2f32 __u;
1629 } __attribute__((__packed__, __may_alias__));
1630 __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1631 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1632 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1635 /// Loads two packed float values from the address \a __p into the
1636 /// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1637 /// are copied from the high-order bits of the first operand.
1639 /// \headerfile <x86intrin.h>
/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1644 /// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1645 /// [127:64] of the destination.
1647 /// A pointer to two packed float values. Bits [63:0] are written to bits
1648 /// [63:0] of the destination.
1649 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1650 static __inline__ __m128 __DEFAULT_FN_ATTRS
1651 _mm_loadl_pi(__m128 __a, const __m64 *__p)
1653 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1654 struct __mm_loadl_pi_struct {
1655 __mm_loadl_pi_v2f32 __u;
1656 } __attribute__((__packed__, __may_alias__));
1657 __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1658 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1659 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1662 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1663 /// 32 bits of the vector are initialized with the single-precision
1664 /// floating-point value loaded from a specified memory location. The upper
1665 /// 96 bits are set to zero.
1667 /// \headerfile <x86intrin.h>
1669 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1672 /// A pointer to a 32-bit memory location containing a single-precision
1673 /// floating-point value.
1674 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1675 /// lower 32 bits contain the value loaded from the memory location. The
1676 /// upper 96 bits are set to zero.
1677 static __inline__ __m128 __DEFAULT_FN_ATTRS
1678 _mm_load_ss(const float *__p)
1680 struct __mm_load_ss_struct {
1682 } __attribute__((__packed__, __may_alias__));
1683 float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1684 return __extension__ (__m128){ __u, 0, 0, 0 };
1687 /// Loads a 32-bit float value and duplicates it to all four vector
1688 /// elements of a 128-bit vector of [4 x float].
1690 /// \headerfile <x86intrin.h>
1692 /// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1696 /// A pointer to a float value to be loaded and duplicated.
1697 /// \returns A 128-bit vector of [4 x float] containing the loaded and
1698 /// duplicated values.
1699 static __inline__ __m128 __DEFAULT_FN_ATTRS
1700 _mm_load1_ps(const float *__p)
1702 struct __mm_load1_ps_struct {
1704 } __attribute__((__packed__, __may_alias__));
1705 float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1706 return __extension__ (__m128){ __u, __u, __u, __u };
/* Alternate spelling of _mm_load1_ps: expands to the same call with the
   argument passed through unchanged. */
#define _mm_load_ps1(__p) _mm_load1_ps(__p)
1711 /// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1712 /// memory location.
1714 /// \headerfile <x86intrin.h>
1716 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1719 /// A pointer to a 128-bit memory location. The address of the memory
1720 /// location has to be 128-bit aligned.
1721 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1722 static __inline__ __m128 __DEFAULT_FN_ATTRS
1723 _mm_load_ps(const float *__p)
1725 return *(const __m128*)__p;
1728 /// Loads a 128-bit floating-point vector of [4 x float] from an
1729 /// unaligned memory location.
1731 /// \headerfile <x86intrin.h>
1733 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1736 /// A pointer to a 128-bit memory location. The address of the memory
1737 /// location does not have to be aligned.
1738 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1739 static __inline__ __m128 __DEFAULT_FN_ATTRS
1740 _mm_loadu_ps(const float *__p)
1744 } __attribute__((__packed__, __may_alias__));
1745 return ((const struct __loadu_ps*)__p)->__v;
1748 /// Loads four packed float values, in reverse order, from an aligned
1749 /// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1751 /// \headerfile <x86intrin.h>
1753 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1757 /// A pointer to a 128-bit memory location. The address of the memory
1758 /// location has to be 128-bit aligned.
1759 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1760 /// in reverse order.
1761 static __inline__ __m128 __DEFAULT_FN_ATTRS
1762 _mm_loadr_ps(const float *__p)
1764 __m128 __a = _mm_load_ps(__p);
1765 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1768 /// Create a 128-bit vector of [4 x float] with undefined values.
1770 /// \headerfile <x86intrin.h>
1772 /// This intrinsic has no corresponding instruction.
1774 /// \returns A 128-bit vector of [4 x float] containing undefined values.
1775 static __inline__ __m128 __DEFAULT_FN_ATTRS
1776 _mm_undefined_ps(void)
1778 return (__m128)__builtin_ia32_undef128();
1781 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1782 /// 32 bits of the vector are initialized with the specified single-precision
1783 /// floating-point value. The upper 96 bits are set to zero.
1785 /// \headerfile <x86intrin.h>
1787 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1790 /// A single-precision floating-point value used to initialize the lower 32
1791 /// bits of the result.
1792 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1793 /// lower 32 bits contain the value provided in the source operand. The
1794 /// upper 96 bits are set to zero.
1795 static __inline__ __m128 __DEFAULT_FN_ATTRS
1796 _mm_set_ss(float __w)
1798 return __extension__ (__m128){ __w, 0, 0, 0 };
1801 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1802 /// of the four single-precision floating-point vector elements set to the
1803 /// specified single-precision floating-point value.
1805 /// \headerfile <x86intrin.h>
1807 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1810 /// A single-precision floating-point value used to initialize each vector
1811 /// element of the result.
1812 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1813 static __inline__ __m128 __DEFAULT_FN_ATTRS
1814 _mm_set1_ps(float __w)
1816 return __extension__ (__m128){ __w, __w, __w, __w };
1819 /* Microsoft specific. */
1820 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1821 /// of the four single-precision floating-point vector elements set to the
1822 /// specified single-precision floating-point value.
1824 /// \headerfile <x86intrin.h>
1826 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1829 /// A single-precision floating-point value used to initialize each vector
1830 /// element of the result.
1831 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1832 static __inline__ __m128 __DEFAULT_FN_ATTRS
1833 _mm_set_ps1(float __w)
1835 return _mm_set1_ps(__w);
1838 /// Constructs a 128-bit floating-point vector of [4 x float]
1839 /// initialized with the specified single-precision floating-point values.
1841 /// \headerfile <x86intrin.h>
1843 /// This intrinsic is a utility function and does not correspond to a specific
1847 /// A single-precision floating-point value used to initialize bits [127:96]
1850 /// A single-precision floating-point value used to initialize bits [95:64]
1853 /// A single-precision floating-point value used to initialize bits [63:32]
1856 /// A single-precision floating-point value used to initialize bits [31:0]
1858 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1859 static __inline__ __m128 __DEFAULT_FN_ATTRS
1860 _mm_set_ps(float __z, float __y, float __x, float __w)
1862 return __extension__ (__m128){ __w, __x, __y, __z };
1865 /// Constructs a 128-bit floating-point vector of [4 x float],
1866 /// initialized in reverse order with the specified 32-bit single-precision
1867 /// float-point values.
1869 /// \headerfile <x86intrin.h>
1871 /// This intrinsic is a utility function and does not correspond to a specific
1875 /// A single-precision floating-point value used to initialize bits [31:0]
1878 /// A single-precision floating-point value used to initialize bits [63:32]
1881 /// A single-precision floating-point value used to initialize bits [95:64]
1884 /// A single-precision floating-point value used to initialize bits [127:96]
1886 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1887 static __inline__ __m128 __DEFAULT_FN_ATTRS
1888 _mm_setr_ps(float __z, float __y, float __x, float __w)
1890 return __extension__ (__m128){ __z, __y, __x, __w };
1893 /// Constructs a 128-bit floating-point vector of [4 x float] initialized
1896 /// \headerfile <x86intrin.h>
1898 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1900 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
1901 /// all elements set to zero.
1902 static __inline__ __m128 __DEFAULT_FN_ATTRS
1903 _mm_setzero_ps(void)
1905 return __extension__ (__m128){ 0, 0, 0, 0 };
1908 /// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1909 /// memory location.
1911 /// \headerfile <x86intrin.h>
1913 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
1916 /// A pointer to a 64-bit memory location.
1918 /// A 128-bit vector of [4 x float] containing the values to be stored.
1919 static __inline__ void __DEFAULT_FN_ATTRS
1920 _mm_storeh_pi(__m64 *__p, __m128 __a)
1922 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1923 struct __mm_storeh_pi_struct {
1924 __mm_storeh_pi_v2f32 __u;
1925 } __attribute__((__packed__, __may_alias__));
1926 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
1929 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1930 /// memory location.
1932 /// \headerfile <x86intrin.h>
1934 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1937 /// A pointer to a memory location that will receive the float values.
1939 /// A 128-bit vector of [4 x float] containing the values to be stored.
1940 static __inline__ void __DEFAULT_FN_ATTRS
1941 _mm_storel_pi(__m64 *__p, __m128 __a)
1943 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1944 struct __mm_storeh_pi_struct {
1945 __mm_storeh_pi_v2f32 __u;
1946 } __attribute__((__packed__, __may_alias__));
1947 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
1950 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1951 /// memory location.
1953 /// \headerfile <x86intrin.h>
1955 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1958 /// A pointer to a 32-bit memory location.
1960 /// A 128-bit vector of [4 x float] containing the value to be stored.
1961 static __inline__ void __DEFAULT_FN_ATTRS
1962 _mm_store_ss(float *__p, __m128 __a)
1964 struct __mm_store_ss_struct {
1966 } __attribute__((__packed__, __may_alias__));
1967 ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1970 /// Stores a 128-bit vector of [4 x float] to an unaligned memory
1973 /// \headerfile <x86intrin.h>
1975 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1978 /// A pointer to a 128-bit memory location. The address of the memory
1979 /// location does not have to be aligned.
1981 /// A 128-bit vector of [4 x float] containing the values to be stored.
1982 static __inline__ void __DEFAULT_FN_ATTRS
1983 _mm_storeu_ps(float *__p, __m128 __a)
1985 struct __storeu_ps {
1987 } __attribute__((__packed__, __may_alias__));
1988 ((struct __storeu_ps*)__p)->__v = __a;
1991 /// Stores a 128-bit vector of [4 x float] into an aligned memory
1994 /// \headerfile <x86intrin.h>
1996 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1999 /// A pointer to a 128-bit memory location. The address of the memory
2000 /// location has to be 16-byte aligned.
2002 /// A 128-bit vector of [4 x float] containing the values to be stored.
2003 static __inline__ void __DEFAULT_FN_ATTRS
2004 _mm_store_ps(float *__p, __m128 __a)
2006 *(__m128*)__p = __a;
2009 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2010 /// four contiguous elements in an aligned memory location.
2012 /// \headerfile <x86intrin.h>
2014 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2018 /// A pointer to a 128-bit memory location.
2020 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2021 /// of the four contiguous elements pointed by \a __p.
2022 static __inline__ void __DEFAULT_FN_ATTRS
2023 _mm_store1_ps(float *__p, __m128 __a)
2025 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2026 _mm_store_ps(__p, __a);
2029 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2030 /// four contiguous elements in an aligned memory location.
2032 /// \headerfile <x86intrin.h>
2034 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2038 /// A pointer to a 128-bit memory location.
2040 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2041 /// of the four contiguous elements pointed by \a __p.
2042 static __inline__ void __DEFAULT_FN_ATTRS
2043 _mm_store_ps1(float *__p, __m128 __a)
2045 _mm_store1_ps(__p, __a);
2048 /// Stores float values from a 128-bit vector of [4 x float] to an
2049 /// aligned memory location in reverse order.
2051 /// \headerfile <x86intrin.h>
2053 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2057 /// A pointer to a 128-bit memory location. The address of the memory
2058 /// location has to be 128-bit aligned.
2060 /// A 128-bit vector of [4 x float] containing the values to be stored.
2061 static __inline__ void __DEFAULT_FN_ATTRS
2062 _mm_storer_ps(float *__p, __m128 __a)
2064 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2065 _mm_store_ps(__p, __a);
/* Prefetch hint selectors for _mm_prefetch(). As that macro decodes them,
   bit 2 of a selector becomes the second argument of __builtin_prefetch and
   bits [1:0] become its third argument. */
#define _MM_HINT_ET0 7
#define _MM_HINT_ET1 6
#define _MM_HINT_T0  3
#define _MM_HINT_T1  2
#define _MM_HINT_T2  1
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated.
#define _mm_prefetch(a, sel) \
  (__builtin_prefetch((const void *)(a), ((sel) >> 2) & 1, (sel) & 0x3))
2107 /// Stores a 64-bit integer in the specified aligned memory location. To
2108 /// minimize caching, the data is flagged as non-temporal (unlikely to be
2109 /// used again soon).
2111 /// \headerfile <x86intrin.h>
2113 /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2116 /// A pointer to an aligned memory location used to store the register value.
2118 /// A 64-bit integer containing the value to be stored.
2119 static __inline__ void __DEFAULT_FN_ATTRS_MMX
2120 _mm_stream_pi(__m64 *__p, __m64 __a)
2122 __builtin_ia32_movntq(__p, __a);
2125 /// Moves packed float values from a 128-bit vector of [4 x float] to a
2126 /// 128-bit aligned memory location. To minimize caching, the data is flagged
2127 /// as non-temporal (unlikely to be used again soon).
2129 /// \headerfile <x86intrin.h>
2131 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2134 /// A pointer to a 128-bit aligned memory location that will receive the
2135 /// single-precision floating-point values.
2137 /// A 128-bit vector of [4 x float] containing the values to be moved.
2138 static __inline__ void __DEFAULT_FN_ATTRS
2139 _mm_stream_ps(float *__p, __m128 __a)
2141 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
/* Give the declaration below C linkage when this header is compiled as
   C++. */
#if defined(__cplusplus)
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between store
///    instructions preceding this instruction and store instructions following
///    this instruction, ensuring the system completes all previous stores
///    before executing subsequent stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Arguments and the whole expansion are parenthesized so that compound
   argument expressions and surrounding operators cannot re-associate with
   the casts. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the inserted value. \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* Arguments and the whole expansion are parenthesized so that compound
   argument expressions and surrounding operators cannot re-associate with
   the casts. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2217 /// Compares each of the corresponding packed 16-bit integer values of
2218 /// the 64-bit integer vectors, and writes the greater value to the
2219 /// corresponding bits in the destination.
2221 /// \headerfile <x86intrin.h>
2223 /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2226 /// A 64-bit integer vector containing one of the source operands.
2228 /// A 64-bit integer vector containing one of the source operands.
2229 /// \returns A 64-bit integer vector containing the comparison results.
2230 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2231 _mm_max_pi16(__m64 __a, __m64 __b)
2233 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2236 /// Compares each of the corresponding packed 8-bit unsigned integer
2237 /// values of the 64-bit integer vectors, and writes the greater value to the
2238 /// corresponding bits in the destination.
2240 /// \headerfile <x86intrin.h>
2242 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2245 /// A 64-bit integer vector containing one of the source operands.
2247 /// A 64-bit integer vector containing one of the source operands.
2248 /// \returns A 64-bit integer vector containing the comparison results.
2249 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2250 _mm_max_pu8(__m64 __a, __m64 __b)
2252 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2255 /// Compares each of the corresponding packed 16-bit integer values of
2256 /// the 64-bit integer vectors, and writes the lesser value to the
2257 /// corresponding bits in the destination.
2259 /// \headerfile <x86intrin.h>
2261 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2264 /// A 64-bit integer vector containing one of the source operands.
2266 /// A 64-bit integer vector containing one of the source operands.
2267 /// \returns A 64-bit integer vector containing the comparison results.
2268 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2269 _mm_min_pi16(__m64 __a, __m64 __b)
2271 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2274 /// Compares each of the corresponding packed 8-bit unsigned integer
2275 /// values of the 64-bit integer vectors, and writes the lesser value to the
2276 /// corresponding bits in the destination.
2278 /// \headerfile <x86intrin.h>
2280 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2283 /// A 64-bit integer vector containing one of the source operands.
2285 /// A 64-bit integer vector containing one of the source operands.
2286 /// \returns A 64-bit integer vector containing the comparison results.
2287 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2288 _mm_min_pu8(__m64 __a, __m64 __b)
2290 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2293 /// Takes the most significant bit from each 8-bit element in a 64-bit
2294 /// integer vector to create an 8-bit mask value. Zero-extends the value to
2295 /// 32-bit integer and writes it to the destination.
2297 /// \headerfile <x86intrin.h>
2299 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2302 /// A 64-bit integer vector containing the values with bits to be extracted.
2303 /// \returns The most significant bit from each 8-bit element in \a __a,
2304 /// written to bits [7:0].
2305 static __inline__ int __DEFAULT_FN_ATTRS_MMX
2306 _mm_movemask_pi8(__m64 __a)
2308 return __builtin_ia32_pmovmskb((__v8qi)__a);
2311 /// Multiplies packed 16-bit unsigned integer values and writes the
2312 /// high-order 16 bits of each 32-bit product to the corresponding bits in
2313 /// the destination.
2315 /// \headerfile <x86intrin.h>
2317 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2320 /// A 64-bit integer vector containing one of the source operands.
2322 /// A 64-bit integer vector containing one of the source operands.
2323 /// \returns A 64-bit integer vector containing the products of both operands.
2324 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2325 _mm_mulhi_pu16(__m64 __a, __m64 __b)
2327 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
///    destination, as specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 64-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [15:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [31:16] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [47:32] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [63:48] in the
///    destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that a postfix operator applied to
   the macro result binds to the cast value, not to the builtin call. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2364 /// Conditionally copies the values from each 8-bit element in the first
2365 /// 64-bit integer vector operand to the specified memory location, as
2366 /// specified by the most significant bit in the corresponding element in the
2367 /// second 64-bit integer vector operand.
2369 /// To minimize caching, the data is flagged as non-temporal
2370 /// (unlikely to be used again soon).
2372 /// \headerfile <x86intrin.h>
2374 /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2377 /// A 64-bit integer vector containing the values with elements to be copied.
2379 /// A 64-bit integer vector operand. The most significant bit from each 8-bit
2380 /// element determines whether the corresponding element in operand \a __d
2381 /// is copied. If the most significant bit of a given element is 1, the
2382 /// corresponding element in operand \a __d is copied.
2384 /// A pointer to a 64-bit memory location that will receive the conditionally
2385 /// copied integer values. The address of the memory location does not have
2387 static __inline__ void __DEFAULT_FN_ATTRS_MMX
2388 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2390 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2393 /// Computes the rounded averages of the packed unsigned 8-bit integer
2394 /// values and writes the averages to the corresponding bits in the
2397 /// \headerfile <x86intrin.h>
2399 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2402 /// A 64-bit integer vector containing one of the source operands.
2404 /// A 64-bit integer vector containing one of the source operands.
2405 /// \returns A 64-bit integer vector containing the averages of both operands.
2406 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2407 _mm_avg_pu8(__m64 __a, __m64 __b)
2409 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2412 /// Computes the rounded averages of the packed unsigned 16-bit integer
2413 /// values and writes the averages to the corresponding bits in the
2416 /// \headerfile <x86intrin.h>
2418 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2421 /// A 64-bit integer vector containing one of the source operands.
2423 /// A 64-bit integer vector containing one of the source operands.
2424 /// \returns A 64-bit integer vector containing the averages of both operands.
2425 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2426 _mm_avg_pu16(__m64 __a, __m64 __b)
2428 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2431 /// Subtracts the corresponding 8-bit unsigned integer values of the two
2432 /// 64-bit vector operands and computes the absolute value for each of the
2433 /// difference. Then sum of the 8 absolute differences is written to the
2434 /// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2436 /// \headerfile <x86intrin.h>
2438 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2441 /// A 64-bit integer vector containing one of the source operands.
2443 /// A 64-bit integer vector containing one of the source operands.
2444 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2445 /// sets of absolute differences between both operands. The upper bits are
2447 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2448 _mm_sad_pu8(__m64 __a, __m64 __b)
2450 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
/* Give the two declarations below C linkage when this header is compiled as
   C++. */
#if defined(__cplusplus)
extern "C" {
#endif

/// Returns the contents of the MXCSR register as a 32-bit unsigned
///    integer value.
///
///    There are several groups of macros associated with this
///    intrinsic, including:
///    <ul>
///    <li>
///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, the following expression checks if an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following expression gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);

/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up:
///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);

#if defined(__cplusplus)
} // extern "C"
#endif
/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///    destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
/* The whole expansion is parenthesized so that a postfix operator applied to
   the macro result binds to the cast value, not to the builtin call. */
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2607 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2608 /// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2610 /// \headerfile <x86intrin.h>
2612 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2615 /// A 128-bit vector of [4 x float]. \n
2616 /// Bits [95:64] are written to bits [31:0] of the destination. \n
2617 /// Bits [127:96] are written to bits [95:64] of the destination.
2619 /// A 128-bit vector of [4 x float].
2620 /// Bits [95:64] are written to bits [63:32] of the destination. \n
2621 /// Bits [127:96] are written to bits [127:96] of the destination.
2622 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2623 static __inline__ __m128 __DEFAULT_FN_ATTRS
2624 _mm_unpackhi_ps(__m128 __a, __m128 __b)
2626 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2629 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2630 /// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2632 /// \headerfile <x86intrin.h>
2634 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2637 /// A 128-bit vector of [4 x float]. \n
2638 /// Bits [31:0] are written to bits [31:0] of the destination. \n
2639 /// Bits [63:32] are written to bits [95:64] of the destination.
2641 /// A 128-bit vector of [4 x float]. \n
2642 /// Bits [31:0] are written to bits [63:32] of the destination. \n
2643 /// Bits [63:32] are written to bits [127:96] of the destination.
2644 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2645 static __inline__ __m128 __DEFAULT_FN_ATTRS
2646 _mm_unpacklo_ps(__m128 __a, __m128 __b)
2648 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2651 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2652 /// 32 bits are set to the lower 32 bits of the second parameter. The upper
2653 /// 96 bits are set to the upper 96 bits of the first parameter.
2655 /// \headerfile <x86intrin.h>
2657 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2661 /// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2662 /// written to the upper 96 bits of the result.
2664 /// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2665 /// written to the lower 32 bits of the result.
2666 /// \returns A 128-bit floating-point vector of [4 x float].
2667 static __inline__ __m128 __DEFAULT_FN_ATTRS
2668 _mm_move_ss(__m128 __a, __m128 __b)
2674 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2675 /// 64 bits are set to the upper 64 bits of the second parameter. The upper
2676 /// 64 bits are set to the upper 64 bits of the first parameter.
2678 /// \headerfile <x86intrin.h>
2680 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2683 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2684 /// written to the upper 64 bits of the result.
2686 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2687 /// written to the lower 64 bits of the result.
2688 /// \returns A 128-bit floating-point vector of [4 x float].
2689 static __inline__ __m128 __DEFAULT_FN_ATTRS
2690 _mm_movehl_ps(__m128 __a, __m128 __b)
2692 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2695 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2696 /// 64 bits are set to the lower 64 bits of the first parameter. The upper
2697 /// 64 bits are set to the lower 64 bits of the second parameter.
2699 /// \headerfile <x86intrin.h>
2701 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2704 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2705 /// written to the lower 64 bits of the result.
2707 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2708 /// written to the upper 64 bits of the result.
2709 /// \returns A 128-bit floating-point vector of [4 x float].
2710 static __inline__ __m128 __DEFAULT_FN_ATTRS
2711 _mm_movelh_ps(__m128 __a, __m128 __b)
2713 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2716 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2719 /// \headerfile <x86intrin.h>
2721 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2724 /// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2725 /// from the corresponding elements in this operand.
2726 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2727 /// values from the operand.
2728 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2729 _mm_cvtpi16_ps(__m64 __a)
2734 __b = _mm_setzero_si64();
2735 __b = _mm_cmpgt_pi16(__b, __a);
2736 __c = _mm_unpackhi_pi16(__a, __b);
2737 __r = _mm_setzero_ps();
2738 __r = _mm_cvtpi32_ps(__r, __c);
2739 __r = _mm_movelh_ps(__r, __r);
2740 __c = _mm_unpacklo_pi16(__a, __b);
2741 __r = _mm_cvtpi32_ps(__r, __c);
2746 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
2747 /// 128-bit vector of [4 x float].
2749 /// \headerfile <x86intrin.h>
2751 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2754 /// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2755 /// destination are copied from the corresponding elements in this operand.
2756 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2757 /// values from the operand.
2758 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2759 _mm_cvtpu16_ps(__m64 __a)
2764 __b = _mm_setzero_si64();
2765 __c = _mm_unpackhi_pi16(__a, __b);
2766 __r = _mm_setzero_ps();
2767 __r = _mm_cvtpi32_ps(__r, __c);
2768 __r = _mm_movelh_ps(__r, __r);
2769 __c = _mm_unpacklo_pi16(__a, __b);
2770 __r = _mm_cvtpi32_ps(__r, __c);
2775 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2776 /// into a 128-bit vector of [4 x float].
2778 /// \headerfile <x86intrin.h>
2780 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2783 /// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2784 /// from the corresponding lower 4 elements in this operand.
2785 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2786 /// values from the operand.
2787 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2788 _mm_cvtpi8_ps(__m64 __a)
2792 __b = _mm_setzero_si64();
2793 __b = _mm_cmpgt_pi8(__b, __a);
2794 __b = _mm_unpacklo_pi8(__a, __b);
2796 return _mm_cvtpi16_ps(__b);
2799 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
2800 /// vector of [8 x u8] into a 128-bit vector of [4 x float].
2802 /// \headerfile <x86intrin.h>
2804 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2807 /// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2808 /// destination are copied from the corresponding lower 4 elements in this
2810 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2811 /// values from the source operand.
2812 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2813 _mm_cvtpu8_ps(__m64 __a)
2817 __b = _mm_setzero_si64();
2818 __b = _mm_unpacklo_pi8(__a, __b);
2820 return _mm_cvtpi16_ps(__b);
2823 /// Converts the two 32-bit signed integer values from each 64-bit vector
2824 /// operand of [2 x i32] into a 128-bit vector of [4 x float].
2826 /// \headerfile <x86intrin.h>
2828 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2831 /// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2832 /// copied from the elements in this operand.
2834 /// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2835 /// copied from the elements in this operand.
2836 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2837 /// copied and converted values from the first operand. The upper 64 bits
2838 /// contain the copied and converted values from the second operand.
2839 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2840 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2844 __c = _mm_setzero_ps();
2845 __c = _mm_cvtpi32_ps(__c, __b);
2846 __c = _mm_movelh_ps(__c, __c);
2848 return _mm_cvtpi32_ps(__c, __a);
2851 /// Converts each single-precision floating-point element of a 128-bit
2852 /// floating-point vector of [4 x float] into a 16-bit signed integer, and
2853 /// packs the results into a 64-bit integer vector of [4 x i16].
2855 /// If the floating-point element is NaN or infinity, or if the
2856 /// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2857 /// it is converted to 0x8000. Otherwise if the floating-point element is
2858 /// greater than 0x7FFF, it is converted to 0x7FFF.
2860 /// \headerfile <x86intrin.h>
2862 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2865 /// A 128-bit floating-point vector of [4 x float].
2866 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2868 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2869 _mm_cvtps_pi16(__m128 __a)
2873 __b = _mm_cvtps_pi32(__a);
2874 __a = _mm_movehl_ps(__a, __a);
2875 __c = _mm_cvtps_pi32(__a);
2877 return _mm_packs_pi32(__b, __c);
2880 /// Converts each single-precision floating-point element of a 128-bit
2881 /// floating-point vector of [4 x float] into an 8-bit signed integer, and
2882 /// packs the results into the lower 32 bits of a 64-bit integer vector of
2883 /// [8 x i8]. The upper 32 bits of the vector are set to 0.
2885 /// If the floating-point element is NaN or infinity, or if the
2886 /// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2887 /// is converted to 0x80. Otherwise if the floating-point element is greater
2888 /// than 0x7F, it is converted to 0x7F.
2890 /// \headerfile <x86intrin.h>
2892 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2895 /// 128-bit floating-point vector of [4 x float].
2896 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2897 /// converted values and the uppper 32 bits are set to zero.
2898 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2899 _mm_cvtps_pi8(__m128 __a)
2903 __b = _mm_cvtps_pi16(__a);
2904 __c = _mm_setzero_si64();
2906 return _mm_packs_pi16(__b, __c);
2909 /// Extracts the sign bits from each single-precision floating-point
2910 /// element of a 128-bit floating-point vector of [4 x float] and returns the
2911 /// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2914 /// \headerfile <x86intrin.h>
2916 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
2919 /// A 128-bit floating-point vector of [4 x float].
2920 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2921 /// single-precision floating-point element of the parameter. Bits [31:4] are
2923 static __inline__ int __DEFAULT_FN_ATTRS
2924 _mm_movemask_ps(__m128 __a)
2926 return __builtin_ia32_movmskps((__v4sf)__a);
/* Declaration attribute requesting 16-byte alignment. The reserved
 * double-underscore spelling is used so that a user macro named `aligned`
 * cannot alter the expansion. */
#define _MM_ALIGN16 __attribute__((__aligned__(16)))
/* Packs four 2-bit selectors into one 8-bit immediate: z lands in bits
 * [7:6], y in bits [5:4], x in bits [3:2] and w in bits [1:0]. */
#define _MM_SHUFFLE(z, y, x, w) \
  (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
/* Bit fields of the control/status word (MXCSR) that this header reads with
 * _mm_getcsr() and writes with _mm_setcsr(). */

/* Exception state flags, bits [5:0]. */
#define _MM_EXCEPT_INVALID    (0x0001U)
#define _MM_EXCEPT_DENORM     (0x0002U)
#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
#define _MM_EXCEPT_OVERFLOW   (0x0008U)
#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
#define _MM_EXCEPT_INEXACT    (0x0020U)
#define _MM_EXCEPT_MASK       (_MM_EXCEPT_INVALID | _MM_EXCEPT_DENORM | \
                               _MM_EXCEPT_DIV_ZERO | _MM_EXCEPT_OVERFLOW | \
                               _MM_EXCEPT_UNDERFLOW | _MM_EXCEPT_INEXACT)

/* Exception mask bits, bits [12:7]. */
#define _MM_MASK_INVALID      (0x0080U)
#define _MM_MASK_DENORM       (0x0100U)
#define _MM_MASK_DIV_ZERO     (0x0200U)
#define _MM_MASK_OVERFLOW     (0x0400U)
#define _MM_MASK_UNDERFLOW    (0x0800U)
#define _MM_MASK_INEXACT      (0x1000U)
#define _MM_MASK_MASK         (_MM_MASK_INVALID | _MM_MASK_DENORM | \
                               _MM_MASK_DIV_ZERO | _MM_MASK_OVERFLOW | \
                               _MM_MASK_UNDERFLOW | _MM_MASK_INEXACT)

/* Rounding mode field, bits [14:13]. */
#define _MM_ROUND_NEAREST     (0x0000U)
#define _MM_ROUND_DOWN        (0x2000U)
#define _MM_ROUND_UP          (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK        (_MM_ROUND_DOWN | _MM_ROUND_UP)

/* Flush-to-zero mode, bit 15. */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (0x8000U)
#define _MM_FLUSH_ZERO_OFF    (0x0000U)
/* Readers: each yields the value of _mm_getcsr() restricted to one field. */
#define _MM_GET_EXCEPTION_MASK() \
  (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() \
  (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() \
  (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() \
  (_mm_getcsr() & _MM_ROUND_MASK)

/* Writers: each clears one field of the current _mm_getcsr() value, ORs in
 * (x), and passes the result to _mm_setcsr(). (x) is not masked, so any bits
 * it carries outside the field are written as well. */
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
/* Transposes, in place, the 4x4 matrix of floats whose rows are the four
 * __m128 arguments. Every argument is both read and assigned, so each must be
 * a modifiable lvalue, and because each is expanded more than once it should
 * have no side effects.
 *
 * The body is wrapped in do { ... } while (0) so the macro acts as a single
 * statement, and the temporaries use reserved (double-underscore) names so
 * they cannot shadow the caller's row variables. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
/* Compatibility aliases: each _m_* name expands to the corresponding _mm_*
 * intrinsic name. */
#define _m_pextrw   _mm_extract_pi16
#define _m_pinsrw   _mm_insert_pi16
#define _m_pmaxsw   _mm_max_pi16
#define _m_pmaxub   _mm_max_pu8
#define _m_pminsw   _mm_min_pi16
#define _m_pminub   _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw  _mm_mulhi_pu16
#define _m_pshufw   _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb    _mm_avg_pu8
#define _m_pavgw    _mm_avg_pu16
#define _m_psadbw   _mm_sad_pu8
/* The attribute helper macros are private to this header; drop them so they
 * do not leak into files that include it. */
#undef __DEFAULT_FN_ATTRS_MMX
#undef __DEFAULT_FN_ATTRS
3003 /* Ugly hack for backwards-compatibility (compatible with gcc) */
3004 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3005 #include <emmintrin.h>
3008 #endif /* __XMMINTRIN_H */