1 /*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 *===-----------------------------------------------------------------------===
27 #include <tmmintrin.h>
29 /* Define the default attributes for the functions in this file: every
 * function declared with __DEFAULT_FN_ATTRS carries the always_inline and
 * nodebug attributes and is compiled for the sse4.1 target feature. */
30 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
32 /* SSE4 Rounding macros: values for the rounding-control argument M of the
 * _mm_round_* macros. Bits [1:0] hold the rounding direction, bit [2] selects
 * the current MXCSR setting instead of bits [1:0], and bit [3] controls
 * whether the precision exception (PE) field is updated. */
33 #define _MM_FROUND_TO_NEAREST_INT 0x00 /* bits [1:0] = 00: to nearest */
34 #define _MM_FROUND_TO_NEG_INF 0x01 /* bits [1:0] = 01: downward */
35 #define _MM_FROUND_TO_POS_INF 0x02 /* bits [1:0] = 10: upward */
36 #define _MM_FROUND_TO_ZERO 0x03 /* bits [1:0] = 11: toward zero */
37 #define _MM_FROUND_CUR_DIRECTION 0x04 /* bit [2] set: use the current MXCSR setting */
39 #define _MM_FROUND_RAISE_EXC 0x00 /* bit [3] clear: a normal PE exception is used */
40 #define _MM_FROUND_NO_EXC 0x08 /* bit [3] set: the PE field is not updated */
42 #define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT) /* to nearest, PE reported */
43 #define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF) /* downward, PE reported */
44 #define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF) /* upward, PE reported */
45 #define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO) /* toward zero, PE reported */
46 #define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) /* MXCSR direction, PE reported */
47 #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) /* MXCSR direction, PE not updated */
49 /// \brief Rounds up each element of the 128-bit vector of [4 x float] to an
50 /// integer and returns the rounded values in a 128-bit vector of [4 x float].
53 /// \headerfile <x86intrin.h>
56 /// \code __m128 _mm_ceil_ps(__m128 X); \endcode
59 /// This intrinsic corresponds to the <c> <i> VROUNDPS / ROUNDPS </i> </c> instruction; it expands to _mm_round_ps() with _MM_FROUND_CEIL.
63 /// \param X A 128-bit vector of [4 x float] values to be rounded up.
64 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
65 #define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
67 /// \brief Rounds up each element of the 128-bit vector of [2 x double] to an
68 /// integer and returns the rounded values in a 128-bit vector of [2 x double].
71 /// \headerfile <x86intrin.h>
74 /// \code __m128d _mm_ceil_pd(__m128d X); \endcode
77 /// This intrinsic corresponds to the <c> <i> VROUNDPD / ROUNDPD </i> </c> instruction; it expands to _mm_round_pd() with _MM_FROUND_CEIL.
81 /// \param X A 128-bit vector of [2 x double] values to be rounded up.
82 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
83 #define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
85 /// \brief Copies three upper elements of the first 128-bit vector operand to
86 /// the corresponding three upper elements of the 128-bit result vector of
87 /// [4 x float]. Rounds up the lowest element of the second 128-bit vector
88 /// operand to an integer and copies it to the lowest element of the 128-bit
89 /// result vector of [4 x float].
91 /// \headerfile <x86intrin.h>
94 /// \code __m128 _mm_ceil_ss(__m128 X, __m128 Y); \endcode
97 /// This intrinsic corresponds to the <c> <i> VROUNDSS / ROUNDSS </i> </c> instruction; it expands to _mm_round_ss() with _MM_FROUND_CEIL.
101 /// \param X A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
102 /// copied to the corresponding bits of the result.
104 /// \param Y A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
105 /// rounded up to the nearest integer and copied to the corresponding bits of the result.
107 /// \returns A 128-bit vector of [4 x float] containing the copied and rounded values.
109 #define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
111 /// \brief Copies the upper element of the first 128-bit vector operand to the
112 /// corresponding upper element of the 128-bit result vector of [2 x double].
113 /// Rounds up the lower element of the second 128-bit vector operand to an
114 /// integer and copies it to the lower element of the 128-bit result vector of [2 x double].
117 /// \headerfile <x86intrin.h>
120 /// \code __m128d _mm_ceil_sd(__m128d X, __m128d Y); \endcode
123 /// This intrinsic corresponds to the <c> <i> VROUNDSD / ROUNDSD </i> </c> instruction; it expands to _mm_round_sd() with _MM_FROUND_CEIL.
127 /// \param X A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
128 /// copied to the corresponding bits of the result.
130 /// \param Y A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
131 /// rounded up to the nearest integer and copied to the corresponding bits of the result.
133 /// \returns A 128-bit vector of [2 x double] containing the copied and rounded values.
135 #define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
137 /// \brief Rounds down each element of the 128-bit vector of [4 x float] to an
138 /// integer and returns the rounded values in a 128-bit vector of [4 x float].
141 /// \headerfile <x86intrin.h>
144 /// \code __m128 _mm_floor_ps(__m128 X); \endcode
147 /// This intrinsic corresponds to the <c> <i> VROUNDPS / ROUNDPS </i> </c> instruction; it expands to _mm_round_ps() with _MM_FROUND_FLOOR.
151 /// \param X A 128-bit vector of [4 x float] values to be rounded down.
152 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
153 #define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
155 /// \brief Rounds down each element of the 128-bit vector of [2 x double] to an
156 /// integer and returns the rounded values in a 128-bit vector of [2 x double].
159 /// \headerfile <x86intrin.h>
162 /// \code __m128d _mm_floor_pd(__m128d X); \endcode
165 /// This intrinsic corresponds to the <c> <i> VROUNDPD / ROUNDPD </i> </c> instruction; it expands to _mm_round_pd() with _MM_FROUND_FLOOR.
169 /// \param X A 128-bit vector of [2 x double] values to be rounded down.
170 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
171 #define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
173 /// \brief Copies three upper elements of the first 128-bit vector operand to
174 /// the corresponding three upper elements of the 128-bit result vector of
175 /// [4 x float]. Rounds down the lowest element of the second 128-bit vector
176 /// operand to an integer and copies it to the lowest element of the 128-bit
177 /// result vector of [4 x float].
179 /// \headerfile <x86intrin.h>
182 /// \code __m128 _mm_floor_ss(__m128 X, __m128 Y); \endcode
185 /// This intrinsic corresponds to the <c> <i> VROUNDSS / ROUNDSS </i> </c> instruction; it expands to _mm_round_ss() with _MM_FROUND_FLOOR.
189 /// \param X A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
190 /// copied to the corresponding bits of the result.
192 /// \param Y A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
193 /// rounded down to the nearest integer and copied to the corresponding bits of the result.
195 /// \returns A 128-bit vector of [4 x float] containing the copied and rounded values.
197 #define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
199 /// \brief Copies the upper element of the first 128-bit vector operand to the
200 /// corresponding upper element of the 128-bit result vector of [2 x double].
201 /// Rounds down the lower element of the second 128-bit vector operand to an
202 /// integer and copies it to the lower element of the 128-bit result vector of [2 x double].
205 /// \headerfile <x86intrin.h>
208 /// \code __m128d _mm_floor_sd(__m128d X, __m128d Y); \endcode
211 /// This intrinsic corresponds to the <c> <i> VROUNDSD / ROUNDSD </i> </c> instruction; it expands to _mm_round_sd() with _MM_FROUND_FLOOR.
215 /// \param X A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
216 /// copied to the corresponding bits of the result.
218 /// \param Y A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
219 /// rounded down to the nearest integer and copied to the corresponding bits of the result.
221 /// \returns A 128-bit vector of [2 x double] containing the copied and rounded values.
223 #define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
225 /// \brief Rounds each element of the 128-bit vector of [4 x float] to an
226 /// integer value according to the rounding control specified by the second
227 /// argument and returns the rounded values in a 128-bit vector of [4 x float].
230 /// \headerfile <x86intrin.h>
233 /// \code __m128 _mm_round_ps(__m128 X, const int M); \endcode
236 /// This intrinsic corresponds to the <c> <i> VROUNDPS / ROUNDPS </i> </c> instruction.
240 /// \param X A 128-bit vector of [4 x float].
242 /// \param M An integer value that specifies the rounding operation. \n
243 /// Bits [7:4] are reserved. \n
244 /// Bit [3] is a precision exception value: \n
245 /// 0: A normal PE exception is used \n
246 /// 1: The PE field is not updated \n
247 /// Bit [2] is the rounding control source: \n
248 /// 0: Use bits [1:0] of \a M \n
249 /// 1: Use the current MXCSR setting \n
250 /// Bits [1:0] contain the rounding control definition: \n 00: Nearest \n
252 /// 01: Downward (toward negative infinity) \n
253 /// 10: Upward (toward positive infinity) \n 11: Truncated (toward zero)
255 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
256 #define _mm_round_ps(X, M) __extension__ ({ \
257 (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
259 /// \brief Copies three upper elements of the first 128-bit vector operand to
260 /// the corresponding three upper elements of the 128-bit result vector of
261 /// [4 x float]. Rounds the lowest element of the second 128-bit vector
262 /// operand to an integer value according to the rounding control specified
263 /// by the third argument and copies it to the lowest element of the 128-bit
264 /// result vector of [4 x float].
266 /// \headerfile <x86intrin.h>
269 /// \code __m128 _mm_round_ss(__m128 X, __m128 Y, const int M); \endcode
272 /// This intrinsic corresponds to the <c> <i> VROUNDSS / ROUNDSS </i> </c> instruction.
276 /// \param X A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
277 /// copied to the corresponding bits of the result.
279 /// \param Y A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
280 /// rounded to an integer value using the specified rounding control and
281 /// copied to the corresponding bits of the result.
283 /// \param M An integer value that specifies the rounding operation. \n
284 /// Bits [7:4] are reserved. \n
285 /// Bit [3] is a precision exception value: \n
286 /// 0: A normal PE exception is used \n
287 /// 1: The PE field is not updated \n
288 /// Bit [2] is the rounding control source: \n
289 /// 0: Use bits [1:0] of \a M \n
290 /// 1: Use the current MXCSR setting \n
291 /// Bits [1:0] contain the rounding control definition: \n 00: Nearest \n
293 /// 01: Downward (toward negative infinity) \n
294 /// 10: Upward (toward positive infinity) \n 11: Truncated (toward zero)
296 /// \returns A 128-bit vector of [4 x float] containing the copied and rounded values.
298 #define _mm_round_ss(X, Y, M) __extension__ ({ \
299 (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
300 (__v4sf)(__m128)(Y), (M)); })
302 /// \brief Rounds each element of the 128-bit vector of [2 x double] to an
303 /// integer value according to the rounding control specified by the second
304 /// argument and returns the rounded values in a 128-bit vector of [2 x double].
307 /// \headerfile <x86intrin.h>
310 /// \code __m128d _mm_round_pd(__m128d X, const int M); \endcode
313 /// This intrinsic corresponds to the <c> <i> VROUNDPD / ROUNDPD </i> </c> instruction.
317 /// \param X A 128-bit vector of [2 x double].
319 /// \param M An integer value that specifies the rounding operation. \n
320 /// Bits [7:4] are reserved. \n
321 /// Bit [3] is a precision exception value: \n
322 /// 0: A normal PE exception is used \n
323 /// 1: The PE field is not updated \n
324 /// Bit [2] is the rounding control source: \n
325 /// 0: Use bits [1:0] of \a M \n
326 /// 1: Use the current MXCSR setting \n
327 /// Bits [1:0] contain the rounding control definition: \n 00: Nearest \n
329 /// 01: Downward (toward negative infinity) \n
330 /// 10: Upward (toward positive infinity) \n 11: Truncated (toward zero)
332 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
333 #define _mm_round_pd(X, M) __extension__ ({ \
334 (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
337 /// \brief Copies the upper element of the first 128-bit vector operand to the
338 /// corresponding upper element of the 128-bit result vector of [2 x double].
339 /// Rounds the lower element of the second 128-bit vector operand to an
340 /// integer value according to the rounding control specified by the third
341 /// argument and copies it to the lower element of the 128-bit result vector of [2 x double].
344 /// \headerfile <x86intrin.h>
347 /// \code __m128d _mm_round_sd(__m128d X, __m128d Y, const int M); \endcode
350 /// This intrinsic corresponds to the <c> <i> VROUNDSD / ROUNDSD </i> </c> instruction.
354 /// \param X A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
355 /// copied to the corresponding bits of the result.
357 /// \param Y A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
358 /// rounded to an integer value using the specified rounding control and
359 /// copied to the corresponding bits of the result.
361 /// \param M An integer value that specifies the rounding operation. \n
362 /// Bits [7:4] are reserved. \n
363 /// Bit [3] is a precision exception value: \n
364 /// 0: A normal PE exception is used \n
365 /// 1: The PE field is not updated \n
366 /// Bit [2] is the rounding control source: \n
367 /// 0: Use bits [1:0] of \a M \n
368 /// 1: Use the current MXCSR setting \n
369 /// Bits [1:0] contain the rounding control definition: \n 00: Nearest \n
371 /// 01: Downward (toward negative infinity) \n
372 /// 10: Upward (toward positive infinity) \n 11: Truncated (toward zero)
374 /// \returns A 128-bit vector of [2 x double] containing the copied and rounded values.
376 #define _mm_round_sd(X, Y, M) __extension__ ({ \
377 (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
378 (__v2df)(__m128d)(Y), (M)); })
380 /* SSE4 Packed Blending Intrinsics. */
381 /// \brief Returns a 128-bit vector of [2 x double] where the values are
382 /// selected from either the first or second operand as specified by the
383 /// third operand, the control mask.
385 /// \headerfile <x86intrin.h>
388 /// \code __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M); \endcode
391 /// This intrinsic corresponds to the <c> <i> VBLENDPD / BLENDPD </i> </c> instruction; the macro is written in terms of __builtin_shufflevector().
395 /// \param V1 A 128-bit vector of [2 x double].
397 /// \param V2 A 128-bit vector of [2 x double].
399 /// \param M An immediate integer operand, with mask bits [1:0] specifying how the
400 /// values are to be copied. The position of the mask bit corresponds to the
401 /// index of a copied value. When a mask bit is 0, the corresponding 64-bit
402 /// element in operand \a V1 is copied to the same position in the result.
403 /// When a mask bit is 1, the corresponding 64-bit element in operand \a V2
404 /// is copied to the same position in the result.
405 /// \returns A 128-bit vector of [2 x double] containing the copied values.
406 #define _mm_blend_pd(V1, V2, M) __extension__ ({ \
407 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
408 (__v2df)(__m128d)(V2), \
409 (((M) & 0x01) ? 2 : 0), \
410 (((M) & 0x02) ? 3 : 1)); })
412 /// \brief Returns a 128-bit vector of [4 x float] where the values are selected
413 /// from either the first or second operand as specified by the third
414 /// operand, the control mask.
416 /// \headerfile <x86intrin.h>
419 /// \code __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M); \endcode
422 /// This intrinsic corresponds to the <c> <i> VBLENDPS / BLENDPS </i> </c> instruction; the macro is written in terms of __builtin_shufflevector().
426 /// \param V1 A 128-bit vector of [4 x float].
428 /// \param V2 A 128-bit vector of [4 x float].
430 /// \param M An immediate integer operand, with mask bits [3:0] specifying how the
431 /// values are to be copied. The position of the mask bit corresponds to the
432 /// index of a copied value. When a mask bit is 0, the corresponding 32-bit
433 /// element in operand \a V1 is copied to the same position in the result.
434 /// When a mask bit is 1, the corresponding 32-bit element in operand \a V2
435 /// is copied to the same position in the result.
436 /// \returns A 128-bit vector of [4 x float] containing the copied values.
437 #define _mm_blend_ps(V1, V2, M) __extension__ ({ \
438 (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
439 (((M) & 0x01) ? 4 : 0), \
440 (((M) & 0x02) ? 5 : 1), \
441 (((M) & 0x04) ? 6 : 2), \
442 (((M) & 0x08) ? 7 : 3)); })
444 /// \brief Returns a 128-bit vector of [2 x double] where the values are
445 /// selected from either the first or second operand as specified by the
446 /// third operand, the control mask.
448 /// \headerfile <x86intrin.h>
450 /// This intrinsic corresponds to the <c> <i> VBLENDVPD / BLENDVPD </i> </c> instruction.
454 /// \param __V1 A 128-bit vector of [2 x double].
456 /// \param __V2 A 128-bit vector of [2 x double].
458 /// \param __M A 128-bit vector operand, with mask bits 127 and 63 specifying how the
459 /// values are to be copied. The position of the mask bit corresponds to the
460 /// most significant bit of a copied value. When a mask bit is 0, the
461 /// corresponding 64-bit element in operand \a __V1 is copied to the same
462 /// position in the result. When a mask bit is 1, the corresponding 64-bit
463 /// element in operand \a __V2 is copied to the same position in the result.
464 /// \returns A 128-bit vector of [2 x double] containing the copied values.
465 static __inline__ __m128d __DEFAULT_FN_ATTRS
466 _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
468 return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
472 /// \brief Returns a 128-bit vector of [4 x float] where the values are
473 /// selected from either the first or second operand as specified by the
474 /// third operand, the control mask.
476 /// \headerfile <x86intrin.h>
478 /// This intrinsic corresponds to the <c> <i> VBLENDVPS / BLENDVPS </i> </c> instruction.
482 /// \param __V1 A 128-bit vector of [4 x float].
484 /// \param __V2 A 128-bit vector of [4 x float].
486 /// \param __M A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
487 /// how the values are to be copied. The position of the mask bit corresponds
488 /// to the most significant bit of a copied value. When a mask bit is 0, the
489 /// corresponding 32-bit element in operand \a __V1 is copied to the same
490 /// position in the result. When a mask bit is 1, the corresponding 32-bit
491 /// element in operand \a __V2 is copied to the same position in the result.
492 /// \returns A 128-bit vector of [4 x float] containing the copied values.
493 static __inline__ __m128 __DEFAULT_FN_ATTRS
494 _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
496 return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
500 /// \brief Returns a 128-bit vector of [16 x i8] where the values are selected
501 /// from either of the first or second operand as specified by the third
502 /// operand, the control mask.
504 /// \headerfile <x86intrin.h>
506 /// This intrinsic corresponds to the <c> <i> VPBLENDVB / PBLENDVB </i> </c> instruction.
510 /// \param __V1 A 128-bit vector of [16 x i8].
512 /// \param __V2 A 128-bit vector of [16 x i8].
514 /// \param __M A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying
515 /// how the values are to be copied. The position of the mask bit corresponds
516 /// to the most significant bit of a copied value. When a mask bit is 0, the
517 /// corresponding 8-bit element in operand \a __V1 is copied to the same
518 /// position in the result. When a mask bit is 1, the corresponding 8-bit
519 /// element in operand \a __V2 is copied to the same position in the result.
520 /// \returns A 128-bit vector of [16 x i8] containing the copied values.
521 static __inline__ __m128i __DEFAULT_FN_ATTRS
522 _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
524 return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
528 /// \brief Returns a 128-bit vector of [8 x i16] where the values are selected
529 /// from either of the first or second operand as specified by the third
530 /// operand, the control mask.
532 /// \headerfile <x86intrin.h>
535 /// \code __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M); \endcode
538 /// This intrinsic corresponds to the <c> <i> VPBLENDW / PBLENDW </i> </c> instruction; the macro is written in terms of __builtin_shufflevector().
542 /// \param V1 A 128-bit vector of [8 x i16].
544 /// \param V2 A 128-bit vector of [8 x i16].
546 /// \param M An immediate integer operand, with mask bits [7:0] specifying how the
547 /// values are to be copied. The position of the mask bit corresponds to the
548 /// index of a copied value. When a mask bit is 0, the corresponding 16-bit
549 /// element in operand \a V1 is copied to the same position in the result.
550 /// When a mask bit is 1, the corresponding 16-bit element in operand \a V2
551 /// is copied to the same position in the result.
552 /// \returns A 128-bit vector of [8 x i16] containing the copied values.
553 #define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
554 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
555 (__v8hi)(__m128i)(V2), \
556 (((M) & 0x01) ? 8 : 0), \
557 (((M) & 0x02) ? 9 : 1), \
558 (((M) & 0x04) ? 10 : 2), \
559 (((M) & 0x08) ? 11 : 3), \
560 (((M) & 0x10) ? 12 : 4), \
561 (((M) & 0x20) ? 13 : 5), \
562 (((M) & 0x40) ? 14 : 6), \
563 (((M) & 0x80) ? 15 : 7)); })
565 /* SSE4 Dword Multiply Instructions. */
566 /// \brief Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
567 /// and returns the lower 32 bits of each product in a 128-bit vector of [4 x i32].
570 /// \headerfile <x86intrin.h>
572 /// This intrinsic corresponds to the <c> <i> VPMULLD / PMULLD </i> </c> instruction.
576 /// \param __V1 A 128-bit integer vector.
578 /// \param __V2 A 128-bit integer vector.
579 /// \returns A 128-bit integer vector containing the products of both operands.
580 static __inline__ __m128i __DEFAULT_FN_ATTRS
581 _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
583 return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
586 /// \brief Multiplies corresponding even-indexed elements of two 128-bit
587 /// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
588 /// containing the products.
590 /// \headerfile <x86intrin.h>
592 /// This intrinsic corresponds to the <c> <i> VPMULDQ / PMULDQ </i> </c> instruction.
596 /// \param __V1 A 128-bit vector of [4 x i32].
598 /// \param __V2 A 128-bit vector of [4 x i32].
599 /// \returns A 128-bit vector of [2 x i64] containing the products of both operands.
601 static __inline__ __m128i __DEFAULT_FN_ATTRS
602 _mm_mul_epi32 (__m128i __V1, __m128i __V2)
604 return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
607 /* SSE4 Floating Point Dot Product Instructions. */
608 /// \brief Computes the dot product of the two 128-bit vectors of [4 x float]
609 /// and returns it in the elements of the 128-bit result vector of
610 /// [4 x float]. The immediate integer operand controls which input elements
611 /// will contribute to the dot product, and where the final results are returned.
614 /// \headerfile <x86intrin.h>
617 /// \code __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); \endcode
620 /// This intrinsic corresponds to the <c> <i> VDPPS / DPPS </i> </c> instruction.
624 /// \param X A 128-bit vector of [4 x float].
626 /// \param Y A 128-bit vector of [4 x float].
628 /// \param M An immediate integer operand. Mask bits [7:4] determine which elements
629 /// of the input vectors are used, with bit [4] corresponding to the lowest
630 /// element and bit [7] corresponding to the highest element of each [4 x
631 /// float] vector. If a bit is set, the corresponding elements from the two
632 /// input vectors are used as an input for dot product; otherwise that input
633 /// is treated as zero. Bits [3:0] determine which elements of the result
634 /// will receive a copy of the final dot product, with bit [0] corresponding
635 /// to the lowest element and bit [3] corresponding to the highest element of
636 /// the [4 x float] result vector. If a bit is set, the dot product is returned
637 /// in the corresponding element; otherwise that element is set to zero.
638 /// \returns A 128-bit vector of [4 x float] containing the dot product.
639 #define _mm_dp_ps(X, Y, M) __extension__ ({ \
640 (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
641 (__v4sf)(__m128)(Y), (M)); })
643 /// \brief Computes the dot product of the two 128-bit vectors of [2 x double]
644 /// and returns it in the elements of the 128-bit result vector of
645 /// [2 x double]. The immediate integer operand controls which input
646 /// elements will contribute to the dot product, and where the final results are returned.
649 /// \headerfile <x86intrin.h>
652 /// \code __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M); \endcode
655 /// This intrinsic corresponds to the <c> <i> VDPPD / DPPD </i> </c> instruction.
659 /// \param X A 128-bit vector of [2 x double].
661 /// \param Y A 128-bit vector of [2 x double].
663 /// \param M An immediate integer operand. Mask bits [5:4] determine which elements
664 /// of the input vectors are used, with bit [4] corresponding to the lowest
665 /// element and bit [5] corresponding to the highest element of each [2 x
666 /// double] vector. If a bit is set, the corresponding elements from the two
667 /// input vectors are used as an input for dot product; otherwise that input
668 /// is treated as zero. Bits [1:0] determine which elements of the result
669 /// will receive a copy of the final dot product, with bit [0] corresponding
670 /// to the lowest element and bit [1] corresponding to the highest element of
671 /// the [2 x double] result vector. If a bit is set, the dot product is returned in
672 /// the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
673 #define _mm_dp_pd(X, Y, M) __extension__ ({\
674 (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
675 (__v2df)(__m128d)(Y), (M)); })
677 /* SSE4 Streaming Load Hint Instruction. */
678 /// \brief Loads integer values from a 128-bit aligned memory location to a
679 /// 128-bit integer vector. The load is issued through __builtin_nontemporal_load().
681 /// \headerfile <x86intrin.h>
683 /// This intrinsic corresponds to the <c> <i> VMOVNTDQA / MOVNTDQA </i> </c> instruction.
687 /// \param __V A pointer to a 128-bit aligned memory location that contains the integer values.
689 /// \returns A 128-bit integer vector containing the data stored at the
690 /// specified memory location.
691 static __inline__ __m128i __DEFAULT_FN_ATTRS
692 _mm_stream_load_si128 (__m128i const *__V)
694 return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
697 /* SSE4 Packed Integer Min/Max Instructions. */
698 /// \brief Compares the corresponding elements of two 128-bit vectors of
699 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
700 /// of the two values.
702 /// \headerfile <x86intrin.h>
704 /// This intrinsic corresponds to the <c> <i> VPMINSB / PMINSB </i> </c> instruction.
708 /// \param __V1 A 128-bit vector of [16 x i8].
710 /// \param __V2 A 128-bit vector of [16 x i8].
711 /// \returns A 128-bit vector of [16 x i8] containing the lesser values.
712 static __inline__ __m128i __DEFAULT_FN_ATTRS
713 _mm_min_epi8 (__m128i __V1, __m128i __V2)
715 return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
718 /// \brief Compares the corresponding elements of two 128-bit vectors of
719 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
720 /// greater value of the two.
722 /// \headerfile <x86intrin.h>
724 /// This intrinsic corresponds to the <c> <i> VPMAXSB / PMAXSB </i> </c> instruction.
728 /// \param __V1 A 128-bit vector of [16 x i8].
730 /// \param __V2 A 128-bit vector of [16 x i8].
731 /// \returns A 128-bit vector of [16 x i8] containing the greater values.
732 static __inline__ __m128i __DEFAULT_FN_ATTRS
733 _mm_max_epi8 (__m128i __V1, __m128i __V2)
735 return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
738 /// \brief Compares the corresponding elements of two 128-bit vectors of
739 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
740 /// value of the two.
742 /// \headerfile <x86intrin.h>
744 /// This intrinsic corresponds to the <c> <i> VPMINUW / PMINUW </i> </c> instruction.
748 /// \param __V1 A 128-bit vector of [8 x u16].
750 /// \param __V2 A 128-bit vector of [8 x u16].
751 /// \returns A 128-bit vector of [8 x u16] containing the lesser values.
752 static __inline__ __m128i __DEFAULT_FN_ATTRS
753 _mm_min_epu16 (__m128i __V1, __m128i __V2)
755 return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
758 /// \brief Compares the corresponding elements of two 128-bit vectors of
759 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
760 /// greater value of the two.
762 /// \headerfile <x86intrin.h>
764 /// This intrinsic corresponds to the <c> <i> VPMAXUW / PMAXUW </i> </c> instruction.
768 /// \param __V1 A 128-bit vector of [8 x u16].
770 /// \param __V2 A 128-bit vector of [8 x u16].
771 /// \returns A 128-bit vector of [8 x u16] containing the greater values.
772 static __inline__ __m128i __DEFAULT_FN_ATTRS
773 _mm_max_epu16 (__m128i __V1, __m128i __V2)
775 return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
778 /// \brief Compares the corresponding elements of two 128-bit vectors of
779 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
780 /// value of the two.
782 /// \headerfile <x86intrin.h>
784 /// This intrinsic corresponds to the <c> <i> VPMINSD / PMINSD </i> </c> instruction.
788 /// \param __V1 A 128-bit vector of [4 x i32].
790 /// \param __V2 A 128-bit vector of [4 x i32].
791 /// \returns A 128-bit vector of [4 x i32] containing the lesser values.
792 static __inline__ __m128i __DEFAULT_FN_ATTRS
793 _mm_min_epi32 (__m128i __V1, __m128i __V2)
795 return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
798 /// \brief Compares the corresponding elements of two 128-bit vectors of
799 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
800 /// greater value of the two.
802 /// \headerfile <x86intrin.h>
804 /// This intrinsic corresponds to the <c> <i> VPMAXSD / PMAXSD </i> </c> instruction.
808 /// \param __V1 A 128-bit vector of [4 x i32].
810 /// \param __V2 A 128-bit vector of [4 x i32].
811 /// \returns A 128-bit vector of [4 x i32] containing the greater values.
812 static __inline__ __m128i __DEFAULT_FN_ATTRS
813 _mm_max_epi32 (__m128i __V1, __m128i __V2)
815 return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
818 /// \brief Compares the corresponding elements of two 128-bit vectors of
819 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
820 /// value of the two.
822 /// \headerfile <x86intrin.h>
824 /// This intrinsic corresponds to the <c> <i> VPMINUD / PMINUD </i> </c> instruction.
828 /// \param __V1 A 128-bit vector of [4 x u32].
830 /// \param __V2 A 128-bit vector of [4 x u32].
831 /// \returns A 128-bit vector of [4 x u32] containing the lesser values.
832 static __inline__ __m128i __DEFAULT_FN_ATTRS
833 _mm_min_epu32 (__m128i __V1, __m128i __V2)
835 return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
838 /// \brief Compares the corresponding elements of two 128-bit vectors of
839 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
840 /// greater value of the two.
842 /// \headerfile <x86intrin.h>
844 /// This intrinsic corresponds to the <c> <i> VPMAXUD / PMAXUD </i> </c> instruction.
848 /// \param __V1 A 128-bit vector of [4 x u32].
850 /// \param __V2 A 128-bit vector of [4 x u32].
851 /// \returns A 128-bit vector of [4 x u32] containing the greater values.
852 static __inline__ __m128i __DEFAULT_FN_ATTRS
853 _mm_max_epu32 (__m128i __V1, __m128i __V2)
855 return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
858 /* SSE4 Insertion and Extraction from XMM Register Instructions. */
859 /// \brief Takes the first argument \a X and inserts an element from the second
860 /// argument \a Y as selected by the third argument \a N. That result then
861 /// has elements zeroed out also as selected by the third argument \a N. The
862 /// resulting 128-bit vector of [4 x float] is then returned.
864 /// \headerfile <x86intrin.h>
867 /// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
870 /// This intrinsic corresponds to the <c> <i> VINSERTPS </i> </c> instruction.
873 /// A 128-bit vector source operand of [4 x float]. With the exception of
874 /// those bits in the result copied from parameter \a Y and zeroed by bits
875 /// [3:0] of \a N, all bits from this parameter are copied to the result.
877 /// A 128-bit vector source operand of [4 x float]. One single-precision
878 /// floating-point element from this source, as determined by the immediate
879 /// parameter, is copied to the result.
881 /// Specifies which bits from operand \a Y will be copied, which bits in the
882 /// result they will be copied to, and which bits in the result will be
883 /// cleared. The following assignments are made: \n
884 /// Bits [7:6] specify the bits to copy from operand \a Y: \n
885 /// 00: Selects bits [31:0] from operand \a Y. \n
886 /// 01: Selects bits [63:32] from operand \a Y. \n
887 /// 10: Selects bits [95:64] from operand \a Y. \n
888 /// 11: Selects bits [127:96] from operand \a Y. \n
889 /// Bits [5:4] specify the bits in the result to which the selected bits
890 /// from operand \a Y are copied: \n
891 /// 00: Copies the selected bits from \a Y to result bits [31:0]. \n
892 /// 01: Copies the selected bits from \a Y to result bits [63:32]. \n
893 /// 10: Copies the selected bits from \a Y to result bits [95:64]. \n
894 /// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
895 /// Bits [3:0]: If any of these bits are set, the corresponding result
896 /// element is cleared.
897 /// \returns A 128-bit vector of [4 x float] containing the copied single-
898 /// precision floating point elements from the operands.
899 #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
901 /// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
902 /// returns it, using the immediate value parameter \a N as a selector.
904 /// \headerfile <x86intrin.h>
907 /// int _mm_extract_ps(__m128 X, const int N);
910 /// This intrinsic corresponds to the <c> <i> VEXTRACTPS / EXTRACTPS </i> </c>
914 /// A 128-bit vector of [4 x float].
916 /// An immediate value. Bits [1:0] determine which bits from the argument
917 /// \a X are extracted and returned: \n
918 /// 00: Bits [31:0] of parameter \a X are returned. \n
919 /// 01: Bits [63:32] of parameter \a X are returned. \n
920 /// 10: Bits [95:64] of parameter \a X are returned. \n
921 /// 11: Bits [127:96] of parameter \a X are returned.
922 /// \returns A 32-bit integer containing the extracted 32 bits of float data.
923 #define _mm_extract_ps(X, N) (__extension__ \
924 ({ union { int __i; float __f; } __t; \
925 __v4sf __a = (__v4sf)(__m128)(X); \
926 __t.__f = __a[(N) & 3]; \
929 /* Miscellaneous insert and extract macros. */
930 /* Extract a single-precision float from X at index N into D. */
931 #define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
934 /* Build an immediate suitable for _mm_insert_ps: X is shifted into bits [7:6],
935 Y into bits [5:4], and the zeroing bits Z are ORed in unshifted. */
936 #define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
938 /* Insert element N of X into element 0 of a zeroed vector; zero mask 0x0e also clears result elements 1-3. */
939 #define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \
940 _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
942 /* Insert int into packed integer array at index. */
943 /// \brief Constructs a 128-bit vector of [16 x i8] by first making a copy of
944 /// the 128-bit integer vector parameter, and then inserting the lower 8 bits
945 /// of an integer parameter \a I into an offset specified by the immediate
946 /// value parameter \a N.
948 /// \headerfile <x86intrin.h>
951 /// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
954 /// This intrinsic corresponds to the <c> <i> VPINSRB / PINSRB </i> </c>
958 /// A 128-bit integer vector of [16 x i8]. This vector is copied to the
959 /// result and then one of the sixteen elements in the result vector is
960 /// replaced by the lower 8 bits of \a I.
962 /// An integer. The lower 8 bits of this operand are written to the result
963 /// beginning at the offset specified by \a N.
965 /// An immediate value. Bits [3:0] specify the bit offset in the result at
966 /// which the lower 8 bits of \a I are written. \n
967 /// 0000: Bits [7:0] of the result are used for insertion. \n
968 /// 0001: Bits [15:8] of the result are used for insertion. \n
969 /// 0010: Bits [23:16] of the result are used for insertion. \n
970 /// 0011: Bits [31:24] of the result are used for insertion. \n
971 /// 0100: Bits [39:32] of the result are used for insertion. \n
972 /// 0101: Bits [47:40] of the result are used for insertion. \n
973 /// 0110: Bits [55:48] of the result are used for insertion. \n
974 /// 0111: Bits [63:56] of the result are used for insertion. \n
975 /// 1000: Bits [71:64] of the result are used for insertion. \n
976 /// 1001: Bits [79:72] of the result are used for insertion. \n
977 /// 1010: Bits [87:80] of the result are used for insertion. \n
978 /// 1011: Bits [95:88] of the result are used for insertion. \n
979 /// 1100: Bits [103:96] of the result are used for insertion. \n
980 /// 1101: Bits [111:104] of the result are used for insertion. \n
981 /// 1110: Bits [119:112] of the result are used for insertion. \n
982 /// 1111: Bits [127:120] of the result are used for insertion.
983 /// \returns A 128-bit integer vector containing the constructed values.
984 #define _mm_insert_epi8(X, I, N) (__extension__ \
985 ({ __v16qi __a = (__v16qi)(__m128i)(X); \
986 __a[(N) & 15] = (I); \
989 /// \brief Constructs a 128-bit vector of [4 x i32] by first making a copy of
990 /// the 128-bit integer vector parameter, and then inserting the 32-bit
991 /// integer parameter \a I at the offset specified by the immediate value
994 /// \headerfile <x86intrin.h>
997 /// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
1000 /// This intrinsic corresponds to the <c> <i> VPINSRD / PINSRD </i> </c>
1004 /// A 128-bit integer vector of [4 x i32]. This vector is copied to the
1005 /// result and then one of the four elements in the result vector is
1006 /// replaced by \a I.
1008 /// A 32-bit integer that is written to the result beginning at the offset
1009 /// specified by \a N.
1011 /// An immediate value. Bits [1:0] specify the bit offset in the result at
1012 /// which the integer \a I is written. \n
1013 /// 00: Bits [31:0] of the result are used for insertion. \n
1014 /// 01: Bits [63:32] of the result are used for insertion. \n
1015 /// 10: Bits [95:64] of the result are used for insertion. \n
1016 /// 11: Bits [127:96] of the result are used for insertion.
1017 /// \returns A 128-bit integer vector containing the constructed values.
1018 #define _mm_insert_epi32(X, I, N) (__extension__ \
1019 ({ __v4si __a = (__v4si)(__m128i)(X); \
1020 __a[(N) & 3] = (I); \
1023 /// \brief Constructs a 128-bit vector of [2 x i64] by first making a copy of
1024 /// the 128-bit integer vector parameter, and then inserting the 64-bit
1025 /// integer parameter \a I, using the immediate value parameter \a N as an
1026 /// insertion location selector.
1028 /// \headerfile <x86intrin.h>
1031 /// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
1034 /// This intrinsic corresponds to the <c> <i> VPINSRQ / PINSRQ </i> </c>
1038 /// A 128-bit integer vector of [2 x i64]. This vector is copied to the
1039 /// result and then one of the two elements in the result vector is replaced
1042 /// A 64-bit integer that is written to the result beginning at the offset
1043 /// specified by \a N.
1045 /// An immediate value. Bit [0] specifies the bit offset in the result at
1046 /// which the integer \a I is written. \n
1047 /// 0: Bits [63:0] of the result are used for insertion. \n
1048 /// 1: Bits [127:64] of the result are used for insertion. \n
1049 /// \returns A 128-bit integer vector containing the constructed values.
1050 #define _mm_insert_epi64(X, I, N) (__extension__ \
1051 ({ __v2di __a = (__v2di)(__m128i)(X); \
1052 __a[(N) & 1] = (I); \
1054 #endif /* __x86_64__ */
1056 /* Extract int from packed integer array at index. This returns the element
1057 * as a zero extended value, so it is unsigned.
1059 /// \brief Extracts an 8-bit element from the 128-bit integer vector of
1060 /// [16 x i8], using the immediate value parameter \a N as a selector.
1062 /// \headerfile <x86intrin.h>
1065 /// int _mm_extract_epi8(__m128i X, const int N);
1068 /// This intrinsic corresponds to the <c> <i> VPEXTRB / PEXTRB </i> </c>
1072 /// A 128-bit integer vector.
1074 /// An immediate value. Bits [3:0] specify which 8-bit vector element
1075 /// from the argument \a X to extract and copy to the result. \n
1076 /// 0000: Bits [7:0] of parameter \a X are extracted. \n
1077 /// 0001: Bits [15:8] of the parameter \a X are extracted. \n
1078 /// 0010: Bits [23:16] of the parameter \a X are extracted. \n
1079 /// 0011: Bits [31:24] of the parameter \a X are extracted. \n
1080 /// 0100: Bits [39:32] of the parameter \a X are extracted. \n
1081 /// 0101: Bits [47:40] of the parameter \a X are extracted. \n
1082 /// 0110: Bits [55:48] of the parameter \a X are extracted. \n
1083 /// 0111: Bits [63:56] of the parameter \a X are extracted. \n
1084 /// 1000: Bits [71:64] of the parameter \a X are extracted. \n
1085 /// 1001: Bits [79:72] of the parameter \a X are extracted. \n
1086 /// 1010: Bits [87:80] of the parameter \a X are extracted. \n
1087 /// 1011: Bits [95:88] of the parameter \a X are extracted. \n
1088 /// 1100: Bits [103:96] of the parameter \a X are extracted. \n
1089 /// 1101: Bits [111:104] of the parameter \a X are extracted. \n
1090 /// 1110: Bits [119:112] of the parameter \a X are extracted. \n
1091 /// 1111: Bits [127:120] of the parameter \a X are extracted.
1092 /// \returns An unsigned integer, whose lower 8 bits are selected from the
1093 /// 128-bit integer vector parameter and the remaining bits are assigned
1095 #define _mm_extract_epi8(X, N) (__extension__ ({ /* X is expanded once */ \
1096 const __v16qi __bytes = (__v16qi)(__m128i)(X); \
1097 (int)(unsigned char)__bytes[(N) & 15]; /* zero-extend the selected byte */ }))
1099 /// \brief Extracts a 32-bit element from the 128-bit integer vector of
1100 /// [4 x i32], using the immediate value parameter \a N as a selector.
1102 /// \headerfile <x86intrin.h>
1105 /// int _mm_extract_epi32(__m128i X, const int N);
1108 /// This intrinsic corresponds to the <c> <i> VPEXTRD / PEXTRD </i> </c>
1112 /// A 128-bit integer vector.
1114 /// An immediate value. Bits [1:0] specify which 32-bit vector element
1115 /// from the argument \a X to extract and copy to the result. \n
1116 /// 00: Bits [31:0] of the parameter \a X are extracted. \n
1117 /// 01: Bits [63:32] of the parameter \a X are extracted. \n
1118 /// 10: Bits [95:64] of the parameter \a X are extracted. \n
1119 /// 11: Bits [127:96] of the parameter \a X are extracted.
1120 /// \returns A 32-bit integer containing the element selected from the
1121 /// 128-bit integer vector parameter.
1122 #define _mm_extract_epi32(X, N) (__extension__ ({ /* X is expanded once */ \
1123 const __v4si __dwords = (__v4si)(__m128i)(X); \
1124 (int)__dwords[(N) & 3]; /* only bits [1:0] of N select the element */ }))
1126 /// \brief Extracts a 64-bit element from the 128-bit integer vector of
1127 /// [2 x i64], using the immediate value parameter \a N as a selector.
1129 /// \headerfile <x86intrin.h>
1132 /// long long _mm_extract_epi64(__m128i X, const int N);
1135 /// This intrinsic corresponds to the <c> <i> VPEXTRQ / PEXTRQ </i> </c>
1139 /// A 128-bit integer vector.
1141 /// An immediate value. Bit [0] specifies which 64-bit vector element
1142 /// from the argument \a X to return. \n
1143 /// 0: Bits [63:0] are returned. \n
1144 /// 1: Bits [127:64] are returned. \n
1145 /// \returns A 64-bit integer.
1146 #define _mm_extract_epi64(X, N) (__extension__ ({ /* X is expanded once */ \
1147 const __v2di __qwords = (__v2di)(__m128i)(X); \
1148 (long long)__qwords[(N) & 1]; /* only bit [0] of N selects the element */ }))
1149 #endif /* __x86_64__ */
1151 /* SSE4 128-bit Packed Integer Comparisons. */
1152 /// \brief Tests whether the specified bits in a 128-bit integer vector are all
1155 /// \headerfile <x86intrin.h>
1157 /// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c>
1161 /// A 128-bit integer vector containing the bits to be tested.
1163 /// A 128-bit integer vector selecting which bits to test in operand \a __M.
1164 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1165 static __inline__ int __DEFAULT_FN_ATTRS
1166 _mm_testz_si128(__m128i __M, __m128i __V)
1168 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
1171 /// \brief Tests whether the specified bits in a 128-bit integer vector are all
1174 /// \headerfile <x86intrin.h>
1176 /// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c>
1180 /// A 128-bit integer vector containing the bits to be tested.
1182 /// A 128-bit integer vector selecting which bits to test in operand \a __M.
1183 /// \returns TRUE if the specified bits are all ones; FALSE otherwise.
1184 static __inline__ int __DEFAULT_FN_ATTRS
1185 _mm_testc_si128(__m128i __M, __m128i __V)
1187 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
1190 /// \brief Tests whether the specified bits in a 128-bit integer vector are
1191 /// neither all zeros nor all ones.
1193 /// \headerfile <x86intrin.h>
1195 /// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c>
1199 /// A 128-bit integer vector containing the bits to be tested.
1201 /// A 128-bit integer vector selecting which bits to test in operand \a __M.
1202 /// \returns TRUE if the specified bits are neither all zeros nor all ones;
1203 /// FALSE otherwise.
1204 static __inline__ int __DEFAULT_FN_ATTRS
1205 _mm_testnzc_si128(__m128i __M, __m128i __V)
1207 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
1210 /// \brief Tests whether the specified bits in a 128-bit integer vector are all
1213 /// \headerfile <x86intrin.h>
1216 /// int _mm_test_all_ones(__m128i V);
1219 /// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c>
1223 /// A 128-bit integer vector containing the bits to be tested.
1224 /// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
1226 #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1)) /* constant all-ones mask, so V is expanded only once */
1228 /// \brief Tests whether the specified bits in a 128-bit integer vector are
1229 /// neither all zeros nor all ones.
1231 /// \headerfile <x86intrin.h>
1234 /// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
1237 /// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c>
1241 /// A 128-bit integer vector containing the bits to be tested.
1243 /// A 128-bit integer vector selecting which bits to test in operand \a M.
1244 /// \returns TRUE if the specified bits are neither all zeros nor all ones;
1245 /// FALSE otherwise.
1246 #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
1248 /// \brief Tests whether the specified bits in a 128-bit integer vector are all
1251 /// \headerfile <x86intrin.h>
1254 /// int _mm_test_all_zeros(__m128i M, __m128i V);
1257 /// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c>
1261 /// A 128-bit integer vector containing the bits to be tested.
1263 /// A 128-bit integer vector selecting which bits to test in operand \a M.
1264 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1265 #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
1267 /* SSE4 64-bit Packed Integer Comparisons. */
1268 /// \brief Compares each of the corresponding 64-bit values of the 128-bit
1269 /// integer vectors for equality.
1271 /// \headerfile <x86intrin.h>
1273 /// This intrinsic corresponds to the <c> <i> VPCMPEQQ / PCMPEQQ </i> </c>
1277 /// A 128-bit integer vector.
1279 /// A 128-bit integer vector.
1280 /// \returns A 128-bit integer vector containing the comparison results.
1281 static __inline__ __m128i __DEFAULT_FN_ATTRS
1282 _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
1284 return (__m128i)((__v2di)__V1 == (__v2di)__V2);
1287 /* SSE4 Packed Integer Sign-Extension. */
1288 /// \brief Sign-extends each of the lower eight 8-bit integer elements of a
1289 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1290 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1293 /// \headerfile <x86intrin.h>
1295 /// This intrinsic corresponds to the <c> <i> VPMOVSXBW / PMOVSXBW </i> </c>
1299 /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
1300 /// extended to 16-bit values.
1301 /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
1302 static __inline__ __m128i __DEFAULT_FN_ATTRS
1303 _mm_cvtepi8_epi16(__m128i __V)
1305 /* This function always performs a signed extension, but __v16qi is a char
1306 which may be signed or unsigned, so use __v16qs. */
1307 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
1310 /// \brief Sign-extends each of the lower four 8-bit integer elements of a
1311 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1312 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1313 /// vector are unused.
1315 /// \headerfile <x86intrin.h>
1317 /// This intrinsic corresponds to the <c> <i> VPMOVSXBD / PMOVSXBD </i> </c>
1321 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign-
1322 /// extended to 32-bit values.
1323 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1324 static __inline__ __m128i __DEFAULT_FN_ATTRS
1325 _mm_cvtepi8_epi32(__m128i __V)
1327 /* This function always performs a signed extension, but __v16qi is a char
1328 which may be signed or unsigned, so use __v16qs. */
1329 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
1332 /// \brief Sign-extends each of the lower two 8-bit integer elements of a
1333 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1334 /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1335 /// vector are unused.
1337 /// \headerfile <x86intrin.h>
1339 /// This intrinsic corresponds to the <c> <i> VPMOVSXBQ / PMOVSXBQ </i> </c>
1343 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign-
1344 /// extended to 64-bit values.
1345 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1346 static __inline__ __m128i __DEFAULT_FN_ATTRS
1347 _mm_cvtepi8_epi64(__m128i __V)
1349 /* This function always performs a signed extension, but __v16qi is a char
1350 which may be signed or unsigned, so use __v16qs. */
1351 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
1354 /// \brief Sign-extends each of the lower four 16-bit integer elements of a
1355 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1356 /// a 128-bit vector of [4 x i32]. The upper four elements of the input
1357 /// vector are unused.
1359 /// \headerfile <x86intrin.h>
1361 /// This intrinsic corresponds to the <c> <i> VPMOVSXWD / PMOVSXWD </i> </c>
1365 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign-
1366 /// extended to 32-bit values.
1367 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1368 static __inline__ __m128i __DEFAULT_FN_ATTRS
1369 _mm_cvtepi16_epi32(__m128i __V)
1371 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
1374 /// \brief Sign-extends each of the lower two 16-bit integer elements of a
1375 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1376 /// a 128-bit vector of [2 x i64]. The upper six elements of the input
1377 /// vector are unused.
1379 /// \headerfile <x86intrin.h>
1381 /// This intrinsic corresponds to the <c> <i> VPMOVSXWQ / PMOVSXWQ </i> </c>
1385 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign-
1386 /// extended to 64-bit values.
1387 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1388 static __inline__ __m128i __DEFAULT_FN_ATTRS
1389 _mm_cvtepi16_epi64(__m128i __V)
1391 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
1394 /// \brief Sign-extends each of the lower two 32-bit integer elements of a
1395 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1396 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1399 /// \headerfile <x86intrin.h>
1401 /// This intrinsic corresponds to the <c> <i> VPMOVSXDQ / PMOVSXDQ </i> </c>
1405 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign-
1406 /// extended to 64-bit values.
1407 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1408 static __inline__ __m128i __DEFAULT_FN_ATTRS
1409 _mm_cvtepi32_epi64(__m128i __V)
1411 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
1414 /* SSE4 Packed Integer Zero-Extension. */
1415 /// \brief Zero-extends each of the lower eight 8-bit integer elements of a
1416 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1417 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1420 /// \headerfile <x86intrin.h>
1422 /// This intrinsic corresponds to the <c> <i> VPMOVZXBW / PMOVZXBW </i> </c>
1426 /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are zero-
1427 /// extended to 16-bit values.
1428 /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
1429 static __inline__ __m128i __DEFAULT_FN_ATTRS
1430 _mm_cvtepu8_epi16(__m128i __V)
1432 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
1435 /// \brief Zero-extends each of the lower four 8-bit integer elements of a
1436 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1437 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1438 /// vector are unused.
1440 /// \headerfile <x86intrin.h>
1442 /// This intrinsic corresponds to the <c> <i> VPMOVZXBD / PMOVZXBD </i> </c>
1446 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero-
1447 /// extended to 32-bit values.
1448 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1449 static __inline__ __m128i __DEFAULT_FN_ATTRS
1450 _mm_cvtepu8_epi32(__m128i __V)
1452 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
1455 /// \brief Zero-extends each of the lower two 8-bit integer elements of a
1456 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1457 /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1458 /// vector are unused.
1460 /// \headerfile <x86intrin.h>
1462 /// This intrinsic corresponds to the <c> <i> VPMOVZXBQ / PMOVZXBQ </i> </c>
1466 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero-
1467 /// extended to 64-bit values.
1468 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1469 static __inline__ __m128i __DEFAULT_FN_ATTRS
1470 _mm_cvtepu8_epi64(__m128i __V)
1472 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
1475 /// \brief Zero-extends each of the lower four 16-bit integer elements of a
1476 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1477 /// a 128-bit vector of [4 x i32]. The upper four elements of the input
1478 /// vector are unused.
1480 /// \headerfile <x86intrin.h>
1482 /// This intrinsic corresponds to the <c> <i> VPMOVZXWD / PMOVZXWD </i> </c>
1486 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero-
1487 /// extended to 32-bit values.
1488 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1489 static __inline__ __m128i __DEFAULT_FN_ATTRS
1490 _mm_cvtepu16_epi32(__m128i __V)
1492 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
1495 /// \brief Zero-extends each of the lower two 16-bit integer elements of a
1496 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1497 /// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
1500 /// \headerfile <x86intrin.h>
1502 /// This intrinsic corresponds to the <c> <i> VPMOVZXWQ / PMOVZXWQ </i> </c>
1506 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero-
1507 /// extended to 64-bit values.
1508 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1509 static __inline__ __m128i __DEFAULT_FN_ATTRS
1510 _mm_cvtepu16_epi64(__m128i __V)
1512 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
1515 /// \brief Zero-extends each of the lower two 32-bit integer elements of a
1516 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1517 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1520 /// \headerfile <x86intrin.h>
1522 /// This intrinsic corresponds to the <c> <i> VPMOVZXDQ / PMOVZXDQ </i> </c>
1526 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero-
1527 /// extended to 64-bit values.
1528 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1529 static __inline__ __m128i __DEFAULT_FN_ATTRS
1530 _mm_cvtepu32_epi64(__m128i __V)
1532 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
1535 /* SSE4 Pack with Unsigned Saturation. */
1536 /// \brief Converts 32-bit signed integers from both 128-bit integer vector
1537 /// operands into 16-bit unsigned integers, and returns the packed result.
1538 /// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
1539 /// 0x0000 are saturated to 0x0000.
1541 /// \headerfile <x86intrin.h>
1543 /// This intrinsic corresponds to the <c> <i> VPACKUSDW / PACKUSDW </i> </c>
1547 /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
1548 /// signed integer and is converted to a 16-bit unsigned integer with
1549 /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
1550 /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
1551 /// are written to the lower 64 bits of the result.
1553 /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
1554 /// signed integer and is converted to a 16-bit unsigned integer with
1555 /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
1556 /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
1557 /// are written to the higher 64 bits of the result.
1558 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
1559 static __inline__ __m128i __DEFAULT_FN_ATTRS
1560 _mm_packus_epi32(__m128i __V1, __m128i __V2)
1562 return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
1565 /* SSE4 Multiple Packed Sums of Absolute Difference. */
1566 /// \brief Subtracts 8-bit unsigned integer values and computes the absolute
1567 /// values of the differences to the corresponding bits in the destination.
1568 /// Then sums of the absolute differences are returned according to the bit
1569 /// fields in the immediate operand.
1571 /// \headerfile <x86intrin.h>
1574 /// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
1577 /// This intrinsic corresponds to the <c> <i> VMPSADBW / MPSADBW </i> </c>
1581 /// A 128-bit vector of [16 x i8].
1583 /// A 128-bit vector of [16 x i8].
1585 /// An 8-bit immediate operand specifying how the absolute differences are to
1586 /// be calculated, according to the following algorithm:
1588 /// // M2 represents bit 2 of the immediate operand
1589 /// // M10 represents bits [1:0] of the immediate operand
1592 /// for (k = 0; k < 8; k = k + 1) {
1593 /// d0 = abs(X[i + k + 0] - Y[j + 0])
1594 /// d1 = abs(X[i + k + 1] - Y[j + 1])
1595 /// d2 = abs(X[i + k + 2] - Y[j + 2])
1596 /// d3 = abs(X[i + k + 3] - Y[j + 3])
1597 /// r[k] = d0 + d1 + d2 + d3
1600 /// \returns A 128-bit integer vector containing the sums of the sets of
1601 /// absolute differences between both operands.
1602 #define _mm_mpsadbw_epu8(X, Y, M) /* plain, fully parenthesized expression */ \
1603 ((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
1604 (__v16qi)(__m128i)(Y), (M)))
1606 /// \brief Finds the minimum unsigned 16-bit element in the input 128-bit
1607 /// vector of [8 x u16] and returns it along with its index.
1609 /// \headerfile <x86intrin.h>
1611 /// This intrinsic corresponds to the <c> <i> VPHMINPOSUW / PHMINPOSUW </i> </c>
1615 /// A 128-bit vector of [8 x u16].
1616 /// \returns A 128-bit value where bits [15:0] contain the minimum value found
1617 /// in parameter \a __V, bits [18:16] contain the index of the minimum value
1618 /// and the remaining bits are set to 0.
1619 static __inline__ __m128i __DEFAULT_FN_ATTRS
1620 _mm_minpos_epu16(__m128i __V)
1622 return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
1625 /* Handle the sse4.2 definitions here. */
1627 /* These definitions are normally in nmmintrin.h, but gcc puts them in here
1628 so we'll do the same. */
1630 #undef __DEFAULT_FN_ATTRS
1631 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
1633 /* These specify the type of data that we're comparing. */
1634 #define _SIDD_UBYTE_OPS 0x00
1635 #define _SIDD_UWORD_OPS 0x01
1636 #define _SIDD_SBYTE_OPS 0x02
1637 #define _SIDD_SWORD_OPS 0x03
1639 /* These specify the type of comparison operation. */
1640 #define _SIDD_CMP_EQUAL_ANY 0x00
1641 #define _SIDD_CMP_RANGES 0x04
1642 #define _SIDD_CMP_EQUAL_EACH 0x08
1643 #define _SIDD_CMP_EQUAL_ORDERED 0x0c
1645 /* These macros specify the polarity of the operation. */
1646 #define _SIDD_POSITIVE_POLARITY 0x00
1647 #define _SIDD_NEGATIVE_POLARITY 0x10
1648 #define _SIDD_MASKED_POSITIVE_POLARITY 0x20
1649 #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
1651 /* These macros are used in _mm_cmpXstri() to specify the return. */
1652 #define _SIDD_LEAST_SIGNIFICANT 0x00
1653 #define _SIDD_MOST_SIGNIFICANT 0x40
1655 /* These macros are used in _mm_cmpXstrm() to specify the return. */
1656 #define _SIDD_BIT_MASK 0x00
1657 #define _SIDD_UNIT_MASK 0x40
1659 /* SSE4.2 Packed Comparison Intrinsics. */
1660 /// \brief Uses the immediate operand \a M to perform a comparison of string
1661 /// data with implicitly defined lengths that is contained in source operands
1662 /// \a A and \a B. Returns a 128-bit integer vector representing the result
1663 /// mask of the comparison.
1665 /// \headerfile <x86intrin.h>
1668 /// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
1671 /// This intrinsic corresponds to the <c> <i> VPCMPISTRM / PCMPISTRM </i> </c>
1675 /// A 128-bit integer vector containing one of the source operands to be
1678 /// A 128-bit integer vector containing one of the source operands to be
1681 /// An 8-bit immediate operand specifying whether the characters are bytes or
1682 /// words, the type of comparison to perform, and the format of the return
1684 /// Bits [1:0]: Determine source data format. \n
1685 /// 00: 16 unsigned bytes \n
1686 /// 01: 8 unsigned words \n
1687 /// 10: 16 signed bytes \n
1688 /// 11: 8 signed words \n
1689 /// Bits [3:2]: Determine comparison type and aggregation method. \n
1690 /// 00: Subset: Each character in \a B is compared for equality with all
1691 /// the characters in \a A. \n
1692 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1693 /// basis is greater than or equal for even-indexed elements in \a A,
1694 /// and less than or equal for odd-indexed elements in \a A. \n
1695 /// 10: Match: Compare each pair of corresponding characters in \a A and
1696 /// \a B for equality. \n
1697 /// 11: Substring: Search \a B for substring matches of \a A. \n
1698 /// Bits [5:4]: Determine whether to perform a one's complement on the bit
1699 /// mask of the comparison results. \n
1700 /// 00: No effect. \n
1701 /// 01: Negate the bit mask. \n
1702 /// 10: No effect. \n
1703 /// 11: Negate the bit mask only for bits with an index less than or equal
1704 /// to the size of \a A or \a B. \n
/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
/// bytes. \n
1707 /// 0: The result is zero-extended to 16 bytes. \n
1708 /// 1: The result is expanded to 16 bytes (this expansion is performed by
1709 /// repeating each bit 8 or 16 times).
/// \returns Returns a 128-bit integer vector representing the result mask of
/// the comparison.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. */
#define _mm_cmpistrm(A, B, M) \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M)))
1716 /// \brief Uses the immediate operand \a M to perform a comparison of string
1717 /// data with implicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns an integer representing the result index of the
/// comparison.
1721 /// \headerfile <x86intrin.h>
1724 /// int _mm_cmpistri(__m128i A, __m128i B, const int M);
1727 /// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
1731 /// A 128-bit integer vector containing one of the source operands to be
1734 /// A 128-bit integer vector containing one of the source operands to be
1737 /// An 8-bit immediate operand specifying whether the characters are bytes or
1738 /// words, the type of comparison to perform, and the format of the return
1740 /// Bits [1:0]: Determine source data format. \n
1741 /// 00: 16 unsigned bytes \n
1742 /// 01: 8 unsigned words \n
1743 /// 10: 16 signed bytes \n
1744 /// 11: 8 signed words \n
1745 /// Bits [3:2]: Determine comparison type and aggregation method. \n
1746 /// 00: Subset: Each character in \a B is compared for equality with all
1747 /// the characters in \a A. \n
1748 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1749 /// basis is greater than or equal for even-indexed elements in \a A,
1750 /// and less than or equal for odd-indexed elements in \a A. \n
1751 /// 10: Match: Compare each pair of corresponding characters in \a A and
1752 /// \a B for equality. \n
/// 11: Substring: Search \a B for substring matches of \a A. \n
1754 /// Bits [5:4]: Determine whether to perform a one's complement on the bit
1755 /// mask of the comparison results. \n
1756 /// 00: No effect. \n
1757 /// 01: Negate the bit mask. \n
1758 /// 10: No effect. \n
1759 /// 11: Negate the bit mask only for bits with an index less than or equal
1760 /// to the size of \a A or \a B. \n
1761 /// Bit [6]: Determines whether the index of the lowest set bit or the
1762 /// highest set bit is returned. \n
1763 /// 0: The index of the least significant set bit. \n
1764 /// 1: The index of the most significant set bit. \n
1765 /// \returns Returns an integer representing the result index of the comparison.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. */
#define _mm_cmpistri(A, B, M) \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M)))
1770 /// \brief Uses the immediate operand \a M to perform a comparison of string
1771 /// data with explicitly defined lengths that is contained in source operands
1772 /// \a A and \a B. Returns a 128-bit integer vector representing the result
1773 /// mask of the comparison.
1775 /// \headerfile <x86intrin.h>
1778 /// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
1781 /// This intrinsic corresponds to the <c> <i> VPCMPESTRM / PCMPESTRM </i> </c>
1785 /// A 128-bit integer vector containing one of the source operands to be
1788 /// An integer that specifies the length of the string in \a A.
1790 /// A 128-bit integer vector containing one of the source operands to be
1793 /// An integer that specifies the length of the string in \a B.
1795 /// An 8-bit immediate operand specifying whether the characters are bytes or
1796 /// words, the type of comparison to perform, and the format of the return
1798 /// Bits [1:0]: Determine source data format. \n
1799 /// 00: 16 unsigned bytes \n
1800 /// 01: 8 unsigned words \n
1801 /// 10: 16 signed bytes \n
1802 /// 11: 8 signed words \n
1803 /// Bits [3:2]: Determine comparison type and aggregation method. \n
1804 /// 00: Subset: Each character in \a B is compared for equality with all
1805 /// the characters in \a A. \n
1806 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1807 /// basis is greater than or equal for even-indexed elements in \a A,
1808 /// and less than or equal for odd-indexed elements in \a A. \n
1809 /// 10: Match: Compare each pair of corresponding characters in \a A and
1810 /// \a B for equality. \n
1811 /// 11: Substring: Search \a B for substring matches of \a A. \n
1812 /// Bits [5:4]: Determine whether to perform a one's complement on the bit
1813 /// mask of the comparison results. \n
1814 /// 00: No effect. \n
1815 /// 01: Negate the bit mask. \n
1816 /// 10: No effect. \n
1817 /// 11: Negate the bit mask only for bits with an index less than or equal
1818 /// to the size of \a A or \a B. \n
/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
/// bytes. \n
1821 /// 0: The result is zero-extended to 16 bytes. \n
1822 /// 1: The result is expanded to 16 bytes (this expansion is performed by
1823 /// repeating each bit 8 or 16 times). \n
/// \returns Returns a 128-bit integer vector representing the result mask of
/// the comparison.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. M is forwarded as the
   final (immediate) operand of the builtin. */
#define _mm_cmpestrm(A, LA, B, LB, M) \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M)))
1831 /// \brief Uses the immediate operand \a M to perform a comparison of string
1832 /// data with explicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns an integer representing the result index of the
/// comparison.
1836 /// \headerfile <x86intrin.h>
1839 /// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
1842 /// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
1846 /// A 128-bit integer vector containing one of the source operands to be
1849 /// An integer that specifies the length of the string in \a A.
1851 /// A 128-bit integer vector containing one of the source operands to be
1854 /// An integer that specifies the length of the string in \a B.
1856 /// An 8-bit immediate operand specifying whether the characters are bytes or
1857 /// words, the type of comparison to perform, and the format of the return
1859 /// Bits [1:0]: Determine source data format. \n
1860 /// 00: 16 unsigned bytes \n
1861 /// 01: 8 unsigned words \n
1862 /// 10: 16 signed bytes \n
1863 /// 11: 8 signed words \n
1864 /// Bits [3:2]: Determine comparison type and aggregation method. \n
1865 /// 00: Subset: Each character in \a B is compared for equality with all
1866 /// the characters in \a A. \n
1867 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1868 /// basis is greater than or equal for even-indexed elements in \a A,
1869 /// and less than or equal for odd-indexed elements in \a A. \n
1870 /// 10: Match: Compare each pair of corresponding characters in \a A and
1871 /// \a B for equality. \n
/// 11: Substring: Search \a B for substring matches of \a A. \n
1873 /// Bits [5:4]: Determine whether to perform a one's complement on the bit
1874 /// mask of the comparison results. \n
1875 /// 00: No effect. \n
1876 /// 01: Negate the bit mask. \n
1877 /// 10: No effect. \n
1878 /// 11: Negate the bit mask only for bits with an index less than or equal
1879 /// to the size of \a A or \a B. \n
1880 /// Bit [6]: Determines whether the index of the lowest set bit or the
1881 /// highest set bit is returned. \n
1882 /// 0: The index of the least significant set bit. \n
1883 /// 1: The index of the most significant set bit. \n
1884 /// \returns Returns an integer representing the result index of the comparison.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. M is forwarded as the
   final (immediate) operand of the builtin. */
#define _mm_cmpestri(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M)))
1890 /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
1891 /// \brief Uses the immediate operand \a M to perform a comparison of string
1892 /// data with implicitly defined lengths that is contained in source operands
1893 /// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
1894 /// string in \a B is the maximum, otherwise, returns 0.
1896 /// \headerfile <x86intrin.h>
1899 /// int _mm_cmpistra(__m128i A, __m128i B, const int M);
1902 /// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
1906 /// A 128-bit integer vector containing one of the source operands to be
1909 /// A 128-bit integer vector containing one of the source operands to be
1912 /// An 8-bit immediate operand specifying whether the characters are bytes or
1913 /// words and the type of comparison to perform. \n
1914 /// Bits [1:0]: Determine source data format. \n
1915 /// 00: 16 unsigned bytes \n
1916 /// 01: 8 unsigned words \n
1917 /// 10: 16 signed bytes \n
1918 /// 11: 8 signed words \n
1919 /// Bits [3:2]: Determine comparison type and aggregation method. \n
1920 /// 00: Subset: Each character in \a B is compared for equality with all
1921 /// the characters in \a A. \n
1922 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1923 /// basis is greater than or equal for even-indexed elements in \a A,
1924 /// and less than or equal for odd-indexed elements in \a A. \n
1925 /// 10: Match: Compare each pair of corresponding characters in \a A and
1926 /// \a B for equality. \n
1927 /// 11: Substring: Search \a B for substring matches of \a A. \n
1928 /// Bits [5:4]: Determine whether to perform a one's complement on the bit
1929 /// mask of the comparison results. \n
1930 /// 00: No effect. \n
1931 /// 01: Negate the bit mask. \n
1932 /// 10: No effect. \n
1933 /// 11: Negate the bit mask only for bits with an index less than or equal
1934 /// to the size of \a A or \a B. \n
1935 /// \returns Returns 1 if the bit mask is zero and the length of the string in
1936 /// \a B is the maximum; otherwise, returns 0.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. */
#define _mm_cmpistra(A, B, M) \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
1941 /// \brief Uses the immediate operand \a M to perform a comparison of string
1942 /// data with implicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
/// 0.
1946 /// \headerfile <x86intrin.h>
1949 /// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
1952 /// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
1956 /// A 128-bit integer vector containing one of the source operands to be
1959 /// A 128-bit integer vector containing one of the source operands to be
1962 /// An 8-bit immediate operand specifying whether the characters are bytes or
1963 /// words and the type of comparison to perform. \n
1964 /// Bits [1:0]: Determine source data format. \n
1965 /// 00: 16 unsigned bytes \n
1966 /// 01: 8 unsigned words \n
1967 /// 10: 16 signed bytes \n
1968 /// 11: 8 signed words \n
1969 /// Bits [3:2]: Determine comparison type and aggregation method. \n
1970 /// 00: Subset: Each character in \a B is compared for equality with all
1971 /// the characters in \a A. \n
1972 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1973 /// basis is greater than or equal for even-indexed elements in \a A,
1974 /// and less than or equal for odd-indexed elements in \a A. \n
1975 /// 10: Match: Compare each pair of corresponding characters in \a A and
1976 /// \a B for equality. \n
/// 11: Substring: Search \a B for substring matches of \a A. \n
1978 /// Bits [5:4]: Determine whether to perform a one's complement on the bit
1979 /// mask of the comparison results. \n
1980 /// 00: No effect. \n
1981 /// 01: Negate the bit mask. \n
1982 /// 10: No effect. \n
1983 /// 11: Negate the bit mask only for bits with an index less than or equal
1984 /// to the size of \a A or \a B.
1985 /// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. */
#define _mm_cmpistrc(A, B, M) \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
1990 /// \brief Uses the immediate operand \a M to perform a comparison of string
1991 /// data with implicitly defined lengths that is contained in source operands
1992 /// \a A and \a B. Returns bit 0 of the resulting bit mask.
1994 /// \headerfile <x86intrin.h>
1997 /// int _mm_cmpistro(__m128i A, __m128i B, const int M);
2000 /// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
2004 /// A 128-bit integer vector containing one of the source operands to be
2007 /// A 128-bit integer vector containing one of the source operands to be
2010 /// An 8-bit immediate operand specifying whether the characters are bytes or
2011 /// words and the type of comparison to perform. \n
2012 /// Bits [1:0]: Determine source data format. \n
2013 /// 00: 16 unsigned bytes \n
2014 /// 01: 8 unsigned words \n
2015 /// 10: 16 signed bytes \n
2016 /// 11: 8 signed words \n
2017 /// Bits [3:2]: Determine comparison type and aggregation method. \n
2018 /// 00: Subset: Each character in \a B is compared for equality with all
2019 /// the characters in \a A. \n
2020 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2021 /// basis is greater than or equal for even-indexed elements in \a A,
2022 /// and less than or equal for odd-indexed elements in \a A. \n
2023 /// 10: Match: Compare each pair of corresponding characters in \a A and
2024 /// \a B for equality. \n
/// 11: Substring: Search \a B for substring matches of \a A. \n
2026 /// Bits [5:4]: Determine whether to perform a one's complement on the bit
2027 /// mask of the comparison results. \n
2028 /// 00: No effect. \n
2029 /// 01: Negate the bit mask. \n
2030 /// 10: No effect. \n
2031 /// 11: Negate the bit mask only for bits with an index less than or equal
2032 /// to the size of \a A or \a B. \n
2033 /// \returns Returns bit 0 of the resulting bit mask.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. */
#define _mm_cmpistro(A, B, M) \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
2038 /// \brief Uses the immediate operand \a M to perform a comparison of string
2039 /// data with implicitly defined lengths that is contained in source operands
2040 /// \a A and \a B. Returns 1 if the length of the string in \a A is less than
2041 /// the maximum, otherwise, returns 0.
2043 /// \headerfile <x86intrin.h>
2046 /// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
2049 /// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
2053 /// A 128-bit integer vector containing one of the source operands to be
2056 /// A 128-bit integer vector containing one of the source operands to be
2059 /// An 8-bit immediate operand specifying whether the characters are bytes or
2060 /// words and the type of comparison to perform. \n
2061 /// Bits [1:0]: Determine source data format. \n
2062 /// 00: 16 unsigned bytes \n
2063 /// 01: 8 unsigned words \n
2064 /// 10: 16 signed bytes \n
2065 /// 11: 8 signed words \n
2066 /// Bits [3:2]: Determine comparison type and aggregation method. \n
2067 /// 00: Subset: Each character in \a B is compared for equality with all
2068 /// the characters in \a A. \n
2069 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2070 /// basis is greater than or equal for even-indexed elements in \a A,
2071 /// and less than or equal for odd-indexed elements in \a A. \n
2072 /// 10: Match: Compare each pair of corresponding characters in \a A and
2073 /// \a B for equality. \n
2074 /// 11: Substring: Search \a B for substring matches of \a A. \n
2075 /// Bits [5:4]: Determine whether to perform a one's complement on the bit
2076 /// mask of the comparison results. \n
2077 /// 00: No effect. \n
2078 /// 01: Negate the bit mask. \n
2079 /// 10: No effect. \n
2080 /// 11: Negate the bit mask only for bits with an index less than or equal
2081 /// to the size of \a A or \a B. \n
2082 /// \returns Returns 1 if the length of the string in \a A is less than the
2083 /// maximum, otherwise, returns 0.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. */
#define _mm_cmpistrs(A, B, M) \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
2088 /// \brief Uses the immediate operand \a M to perform a comparison of string
2089 /// data with implicitly defined lengths that is contained in source operands
2090 /// \a A and \a B. Returns 1 if the length of the string in \a B is less than
2091 /// the maximum, otherwise, returns 0.
2093 /// \headerfile <x86intrin.h>
2096 /// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
2099 /// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
2103 /// A 128-bit integer vector containing one of the source operands to be
2106 /// A 128-bit integer vector containing one of the source operands to be
2109 /// An 8-bit immediate operand specifying whether the characters are bytes or
2110 /// words and the type of comparison to perform. \n
2111 /// Bits [1:0]: Determine source data format. \n
2112 /// 00: 16 unsigned bytes \n
2113 /// 01: 8 unsigned words \n
2114 /// 10: 16 signed bytes \n
2115 /// 11: 8 signed words \n
2116 /// Bits [3:2]: Determine comparison type and aggregation method. \n
2117 /// 00: Subset: Each character in \a B is compared for equality with all
2118 /// the characters in \a A. \n
2119 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2120 /// basis is greater than or equal for even-indexed elements in \a A,
2121 /// and less than or equal for odd-indexed elements in \a A. \n
2122 /// 10: Match: Compare each pair of corresponding characters in \a A and
2123 /// \a B for equality. \n
2124 /// 11: Substring: Search \a B for substring matches of \a A. \n
2125 /// Bits [5:4]: Determine whether to perform a one's complement on the bit
2126 /// mask of the comparison results. \n
2127 /// 00: No effect. \n
2128 /// 01: Negate the bit mask. \n
2129 /// 10: No effect. \n
2130 /// 11: Negate the bit mask only for bits with an index less than or equal
2131 /// to the size of \a A or \a B.
2132 /// \returns Returns 1 if the length of the string in \a B is less than the
2133 /// maximum, otherwise, returns 0.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. */
#define _mm_cmpistrz(A, B, M) \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
2138 /// \brief Uses the immediate operand \a M to perform a comparison of string
2139 /// data with explicitly defined lengths that is contained in source operands
2140 /// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
2141 /// string in \a B is the maximum, otherwise, returns 0.
2143 /// \headerfile <x86intrin.h>
2146 /// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
2149 /// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
2153 /// A 128-bit integer vector containing one of the source operands to be
2156 /// An integer that specifies the length of the string in \a A.
2158 /// A 128-bit integer vector containing one of the source operands to be
2161 /// An integer that specifies the length of the string in \a B.
2163 /// An 8-bit immediate operand specifying whether the characters are bytes or
2164 /// words and the type of comparison to perform. \n
2165 /// Bits [1:0]: Determine source data format. \n
2166 /// 00: 16 unsigned bytes \n
2167 /// 01: 8 unsigned words \n
2168 /// 10: 16 signed bytes \n
2169 /// 11: 8 signed words \n
2170 /// Bits [3:2]: Determine comparison type and aggregation method. \n
2171 /// 00: Subset: Each character in \a B is compared for equality with all
2172 /// the characters in \a A. \n
2173 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2174 /// basis is greater than or equal for even-indexed elements in \a A,
2175 /// and less than or equal for odd-indexed elements in \a A. \n
2176 /// 10: Match: Compare each pair of corresponding characters in \a A and
2177 /// \a B for equality. \n
2178 /// 11: Substring: Search \a B for substring matches of \a A. \n
2179 /// Bits [5:4]: Determine whether to perform a one's complement on the bit
2180 /// mask of the comparison results. \n
2181 /// 00: No effect. \n
2182 /// 01: Negate the bit mask. \n
2183 /// 10: No effect. \n
2184 /// 11: Negate the bit mask only for bits with an index less than or equal
2185 /// to the size of \a A or \a B.
2186 /// \returns Returns 1 if the bit mask is zero and the length of the string in
2187 /// \a B is the maximum, otherwise, returns 0.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. M is forwarded as the
   final (immediate) operand of the builtin. */
#define _mm_cmpestra(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2193 /// \brief Uses the immediate operand \a M to perform a comparison of string
2194 /// data with explicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
/// returns 0.
2198 /// \headerfile <x86intrin.h>
2201 /// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
2204 /// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
2208 /// A 128-bit integer vector containing one of the source operands to be
2211 /// An integer that specifies the length of the string in \a A.
2213 /// A 128-bit integer vector containing one of the source operands to be
2216 /// An integer that specifies the length of the string in \a B.
2218 /// An 8-bit immediate operand specifying whether the characters are bytes or
2219 /// words and the type of comparison to perform. \n
2220 /// Bits [1:0]: Determine source data format. \n
2221 /// 00: 16 unsigned bytes \n
2222 /// 01: 8 unsigned words \n
2223 /// 10: 16 signed bytes \n
2224 /// 11: 8 signed words \n
2225 /// Bits [3:2]: Determine comparison type and aggregation method. \n
2226 /// 00: Subset: Each character in \a B is compared for equality with all
2227 /// the characters in \a A. \n
2228 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2229 /// basis is greater than or equal for even-indexed elements in \a A,
2230 /// and less than or equal for odd-indexed elements in \a A. \n
2231 /// 10: Match: Compare each pair of corresponding characters in \a A and
2232 /// \a B for equality. \n
2233 /// 11: Substring: Search \a B for substring matches of \a A. \n
2234 /// Bits [5:4]: Determine whether to perform a one's complement on the bit
2235 /// mask of the comparison results. \n
2236 /// 00: No effect. \n
2237 /// 01: Negate the bit mask. \n
2238 /// 10: No effect. \n
2239 /// 11: Negate the bit mask only for bits with an index less than or equal
2240 /// to the size of \a A or \a B. \n
2241 /// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. M is forwarded as the
   final (immediate) operand of the builtin. */
#define _mm_cmpestrc(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2246 /// \brief Uses the immediate operand \a M to perform a comparison of string
2247 /// data with explicitly defined lengths that is contained in source operands
2248 /// \a A and \a B. Returns bit 0 of the resulting bit mask.
2250 /// \headerfile <x86intrin.h>
2253 /// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
2256 /// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
2260 /// A 128-bit integer vector containing one of the source operands to be
2263 /// An integer that specifies the length of the string in \a A.
2265 /// A 128-bit integer vector containing one of the source operands to be
2268 /// An integer that specifies the length of the string in \a B.
2270 /// An 8-bit immediate operand specifying whether the characters are bytes or
2271 /// words and the type of comparison to perform. \n
2272 /// Bits [1:0]: Determine source data format. \n
2273 /// 00: 16 unsigned bytes \n
2274 /// 01: 8 unsigned words \n
2275 /// 10: 16 signed bytes \n
2276 /// 11: 8 signed words \n
2277 /// Bits [3:2]: Determine comparison type and aggregation method. \n
2278 /// 00: Subset: Each character in \a B is compared for equality with all
2279 /// the characters in \a A. \n
2280 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2281 /// basis is greater than or equal for even-indexed elements in \a A,
2282 /// and less than or equal for odd-indexed elements in \a A. \n
2283 /// 10: Match: Compare each pair of corresponding characters in \a A and
2284 /// \a B for equality. \n
2285 /// 11: Substring: Search \a B for substring matches of \a A. \n
2286 /// Bits [5:4]: Determine whether to perform a one's complement on the bit
2287 /// mask of the comparison results. \n
2288 /// 00: No effect. \n
2289 /// 01: Negate the bit mask. \n
2290 /// 10: No effect. \n
2291 /// 11: Negate the bit mask only for bits with an index less than or equal
2292 /// to the size of \a A or \a B.
2293 /// \returns Returns bit 0 of the resulting bit mask.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. M is forwarded as the
   final (immediate) operand of the builtin. */
#define _mm_cmpestro(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2299 /// \brief Uses the immediate operand \a M to perform a comparison of string
2300 /// data with explicitly defined lengths that is contained in source operands
2301 /// \a A and \a B. Returns 1 if the length of the string in \a A is less than
2302 /// the maximum, otherwise, returns 0.
2304 /// \headerfile <x86intrin.h>
2307 /// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
2310 /// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
2314 /// A 128-bit integer vector containing one of the source operands to be
2317 /// An integer that specifies the length of the string in \a A.
2319 /// A 128-bit integer vector containing one of the source operands to be
2322 /// An integer that specifies the length of the string in \a B.
2324 /// An 8-bit immediate operand specifying whether the characters are bytes or
2325 /// words and the type of comparison to perform. \n
2326 /// Bits [1:0]: Determine source data format. \n
2327 /// 00: 16 unsigned bytes \n
2328 /// 01: 8 unsigned words \n
2329 /// 10: 16 signed bytes \n
2330 /// 11: 8 signed words \n
2331 /// Bits [3:2]: Determine comparison type and aggregation method. \n
2332 /// 00: Subset: Each character in \a B is compared for equality with all
2333 /// the characters in \a A. \n
2334 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2335 /// basis is greater than or equal for even-indexed elements in \a A,
2336 /// and less than or equal for odd-indexed elements in \a A. \n
2337 /// 10: Match: Compare each pair of corresponding characters in \a A and
2338 /// \a B for equality. \n
2339 /// 11: Substring: Search \a B for substring matches of \a A. \n
/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2341 /// mask of the comparison results. \n
2342 /// 00: No effect. \n
2343 /// 01: Negate the bit mask. \n
2344 /// 10: No effect. \n
2345 /// 11: Negate the bit mask only for bits with an index less than or equal
2346 /// to the size of \a A or \a B. \n
2347 /// \returns Returns 1 if the length of the string in \a A is less than the
2348 /// maximum, otherwise, returns 0.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. M is forwarded as the
   final (immediate) operand of the builtin. */
#define _mm_cmpestrs(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2354 /// \brief Uses the immediate operand \a M to perform a comparison of string
2355 /// data with explicitly defined lengths that is contained in source operands
2356 /// \a A and \a B. Returns 1 if the length of the string in \a B is less than
2357 /// the maximum, otherwise, returns 0.
2359 /// \headerfile <x86intrin.h>
2362 /// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
/// instruction.
2368 /// A 128-bit integer vector containing one of the source operands to be
2371 /// An integer that specifies the length of the string in \a A.
2373 /// A 128-bit integer vector containing one of the source operands to be
2376 /// An integer that specifies the length of the string in \a B.
2378 /// An 8-bit immediate operand specifying whether the characters are bytes or
2379 /// words and the type of comparison to perform. \n
2380 /// Bits [1:0]: Determine source data format. \n
2381 /// 00: 16 unsigned bytes \n
2382 /// 01: 8 unsigned words \n
2383 /// 10: 16 signed bytes \n
2384 /// 11: 8 signed words \n
2385 /// Bits [3:2]: Determine comparison type and aggregation method. \n
2386 /// 00: Subset: Each character in \a B is compared for equality with all
2387 /// the characters in \a A. \n
2388 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2389 /// basis is greater than or equal for even-indexed elements in \a A,
2390 /// and less than or equal for odd-indexed elements in \a A. \n
2391 /// 10: Match: Compare each pair of corresponding characters in \a A and
2392 /// \a B for equality. \n
2393 /// 11: Substring: Search \a B for substring matches of \a A. \n
2394 /// Bits [5:4]: Determine whether to perform a one's complement on the bit
2395 /// mask of the comparison results. \n
2396 /// 00: No effect. \n
2397 /// 01: Negate the bit mask. \n
2398 /// 10: No effect. \n
2399 /// 11: Negate the bit mask only for bits with an index less than or equal
2400 /// to the size of \a A or \a B.
2401 /// \returns Returns 1 if the length of the string in \a B is less than the
2402 /// maximum, otherwise, returns 0.
/* Fully parenthesized so the leading cast always applies to the builtin's
   result, whatever expression the macro is used in. M is forwarded as the
   final (immediate) operand of the builtin. */
#define _mm_cmpestrz(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2408 /* SSE4.2 Compare Packed Data -- Greater Than. */
2409 /// \brief Compares each of the corresponding 64-bit values of the 128-bit
2410 /// integer vectors to determine if the values in the first operand are
2411 /// greater than those in the second operand.
2413 /// \headerfile <x86intrin.h>
2415 /// This intrinsic corresponds to the <c> <i> VPCMPGTQ / PCMPGTQ </i> </c>
2419 /// A 128-bit integer vector.
2421 /// A 128-bit integer vector.
2422 /// \returns A 128-bit integer vector containing the comparison results.
2423 static __inline__ __m128i __DEFAULT_FN_ATTRS
2424 _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
2426 return (__m128i)((__v2di)__V1 > (__v2di)__V2);
2429 /* SSE4.2 Accumulate CRC32. */
2430 /// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
2431 /// unsigned char operand.
2433 /// \headerfile <x86intrin.h>
2435 /// This intrinsic corresponds to the <c> <i> CRC32B </i> </c> instruction.
2438 /// An unsigned integer operand to add to the CRC-32C checksum of operand
2441 /// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
2442 /// \returns The result of adding operand \a __C to the CRC-32C checksum of
2444 static __inline__ unsigned int __DEFAULT_FN_ATTRS
2445 _mm_crc32_u8(unsigned int __C, unsigned char __D)
2447 return __builtin_ia32_crc32qi(__C, __D);
2450 /// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
2451 /// unsigned short operand.
2453 /// \headerfile <x86intrin.h>
2455 /// This intrinsic corresponds to the <c> <i> CRC32W </i> </c> instruction.
2458 /// An unsigned integer operand to add to the CRC-32C checksum of operand
2461 /// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
2462 /// \returns The result of adding operand \a __C to the CRC-32C checksum of
2464 static __inline__ unsigned int __DEFAULT_FN_ATTRS
2465 _mm_crc32_u16(unsigned int __C, unsigned short __D)
2467 return __builtin_ia32_crc32hi(__C, __D);
2470 /// \brief Adds the first unsigned integer operand to the CRC-32C checksum of
2471 /// the second unsigned integer operand.
2473 /// \headerfile <x86intrin.h>
2475 /// This intrinsic corresponds to the <c> <i> CRC32L </i> </c> instruction.
2478 /// An unsigned integer operand to add to the CRC-32C checksum of operand
2481 /// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
2482 /// \returns The result of adding operand \a __C to the CRC-32C checksum of
2484 static __inline__ unsigned int __DEFAULT_FN_ATTRS
2485 _mm_crc32_u32(unsigned int __C, unsigned int __D)
2487 return __builtin_ia32_crc32si(__C, __D);
2491 /// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
2492 /// unsigned 64-bit integer operand.
2494 /// \headerfile <x86intrin.h>
2496 /// This intrinsic corresponds to the <c> <i> CRC32Q </i> </c> instruction.
2499 /// An unsigned integer operand to add to the CRC-32C checksum of operand
2502 /// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
2503 /// \returns The result of adding operand \a __C to the CRC-32C checksum of
2505 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
2506 _mm_crc32_u64(unsigned long long __C, unsigned long long __D)
2508 return __builtin_ia32_crc32di(__C, __D);
2510 #endif /* __x86_64__ */
2512 #undef __DEFAULT_FN_ATTRS
2515 #include <popcntintrin.h>
2518 #endif /* _SMMINTRIN_H */