contrib/llvm/tools/clang/lib/Headers/xmmintrin.h

   1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
   2  *
   3  * Permission is hereby granted, free of charge, to any person obtaining a copy
   4  * of this software and associated documentation files (the "Software"), to deal
   5  * in the Software without restriction, including without limitation the rights
   6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   7  * copies of the Software, and to permit persons to whom the Software is
   8  * furnished to do so, subject to the following conditions:
   9  *
  10  * The above copyright notice and this permission notice shall be included in
  11  * all copies or substantial portions of the Software.
  12  *
  13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19  * THE SOFTWARE.
  20  *
  21  *===-----------------------------------------------------------------------===
  22  */
  23
  24 #ifndef __XMMINTRIN_H
  25 #define __XMMINTRIN_H
  26
  27 #include <mmintrin.h>
  28
  29 typedef int __v4si __attribute__((__vector_size__(16)));
  30 typedef float __v4sf __attribute__((__vector_size__(16)));
  31 typedef float __m128 __attribute__((__vector_size__(16)));
  32
  33 /* Unsigned types */
  34 typedef unsigned int __v4su __attribute__((__vector_size__(16)));
  35
   /* <mm_malloc.h> is only included in a hosted environment because it depends
    * on a standard library to provide allocation routines. */
  38 #if __STDC_HOSTED__
  39 #include <mm_malloc.h>
  40 #endif
  41
  42 /* Define the default attributes for the functions in this file. */
  43 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
  44
  45 /// \brief Adds the 32-bit float values in the low-order bits of the operands.
  46 ///
  47 /// \headerfile <x86intrin.h>
  48 ///
  49 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
  50 ///
  51 /// \param __a
  52 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  53 ///    The lower 32 bits of this operand are used in the calculation.
  54 /// \param __b
  55 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  56 ///    The lower 32 bits of this operand are used in the calculation.
  57 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
  58 ///    of the lower 32 bits of both operands. The upper 96 bits are copied from
  59 ///    the upper 96 bits of the first source operand.
  60 static __inline__ __m128 __DEFAULT_FN_ATTRS
  61 _mm_add_ss(__m128 __a, __m128 __b)
  62 {
  63   __a[0] += __b[0];
  64   return __a;
  65 }
  66
  67 /// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
  68 ///    the addition.
  69 ///
  70 /// \headerfile <x86intrin.h>
  71 ///
  72 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
  73 ///
  74 /// \param __a
  75 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  76 /// \param __b
  77 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  78 /// \returns A 128-bit vector of [4 x float] containing the sums of both
  79 ///    operands.
  80 static __inline__ __m128 __DEFAULT_FN_ATTRS
  81 _mm_add_ps(__m128 __a, __m128 __b)
  82 {
  83   return (__m128)((__v4sf)__a + (__v4sf)__b);
  84 }
  85
  86 /// \brief Subtracts the 32-bit float value in the low-order bits of the second
  87 ///    operand from the corresponding value in the first operand.
  88 ///
  89 /// \headerfile <x86intrin.h>
  90 ///
  91 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
  92 ///
  93 /// \param __a
  94 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
  95 ///    of this operand are used in the calculation.
  96 /// \param __b
  97 ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
  98 ///    bits of this operand are used in the calculation.
  99 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
 100 ///    difference of the lower 32 bits of both operands. The upper 96 bits are
 101 ///    copied from the upper 96 bits of the first source operand.
 102 static __inline__ __m128 __DEFAULT_FN_ATTRS
 103 _mm_sub_ss(__m128 __a, __m128 __b)
 104 {
 105   __a[0] -= __b[0];
 106   return __a;
 107 }
 108
 109 /// \brief Subtracts each of the values of the second operand from the first
 110 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
 111 ///    the results of the subtraction.
 112 ///
 113 /// \headerfile <x86intrin.h>
 114 ///
 115 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
 116 ///
 117 /// \param __a
 118 ///    A 128-bit vector of [4 x float] containing the minuend.
 119 /// \param __b
 120 ///    A 128-bit vector of [4 x float] containing the subtrahend.
 121 /// \returns A 128-bit vector of [4 x float] containing the differences between
 122 ///    both operands.
 123 static __inline__ __m128 __DEFAULT_FN_ATTRS
 124 _mm_sub_ps(__m128 __a, __m128 __b)
 125 {
 126   return (__m128)((__v4sf)__a - (__v4sf)__b);
 127 }
 128
 129 /// \brief Multiplies two 32-bit float values in the low-order bits of the
 130 ///    operands.
 131 ///
 132 /// \headerfile <x86intrin.h>
 133 ///
 134 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
 135 ///
 136 /// \param __a
 137 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 138 ///    The lower 32 bits of this operand are used in the calculation.
 139 /// \param __b
 140 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 141 ///    The lower 32 bits of this operand are used in the calculation.
 142 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
 143 ///    32 bits of both operands. The upper 96 bits are copied from the upper 96
 144 ///    bits of the first source operand.
 145 static __inline__ __m128 __DEFAULT_FN_ATTRS
 146 _mm_mul_ss(__m128 __a, __m128 __b)
 147 {
 148   __a[0] *= __b[0];
 149   return __a;
 150 }
 151
 152 /// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
 153 ///    results of the multiplication.
 154 ///
 155 /// \headerfile <x86intrin.h>
 156 ///
 157 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
 158 ///
 159 /// \param __a
 160 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 161 /// \param __b
 162 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 163 /// \returns A 128-bit vector of [4 x float] containing the products of both
 164 ///    operands.
 165 static __inline__ __m128 __DEFAULT_FN_ATTRS
 166 _mm_mul_ps(__m128 __a, __m128 __b)
 167 {
 168   return (__m128)((__v4sf)__a * (__v4sf)__b);
 169 }
 170
 171 /// \brief Divides the value in the low-order 32 bits of the first operand by
 172 ///    the corresponding value in the second operand.
 173 ///
 174 /// \headerfile <x86intrin.h>
 175 ///
 176 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
 177 ///
 178 /// \param __a
 179 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
 180 ///    bits of this operand are used in the calculation.
 181 /// \param __b
 182 ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
 183 ///    of this operand are used in the calculation.
 184 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
 185 ///    lower 32 bits of both operands. The upper 96 bits are copied from the
 186 ///    upper 96 bits of the first source operand.
 187 static __inline__ __m128 __DEFAULT_FN_ATTRS
 188 _mm_div_ss(__m128 __a, __m128 __b)
 189 {
 190   __a[0] /= __b[0];
 191   return __a;
 192 }
 193
 194 /// \brief Divides two 128-bit vectors of [4 x float].
 195 ///
 196 /// \headerfile <x86intrin.h>
 197 ///
 198 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
 199 ///
 200 /// \param __a
 201 ///    A 128-bit vector of [4 x float] containing the dividend.
 202 /// \param __b
 203 ///    A 128-bit vector of [4 x float] containing the divisor.
 204 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
 205 ///    operands.
 206 static __inline__ __m128 __DEFAULT_FN_ATTRS
 207 _mm_div_ps(__m128 __a, __m128 __b)
 208 {
 209   return (__m128)((__v4sf)__a / (__v4sf)__b);
 210 }
 211
 212 /// \brief Calculates the square root of the value stored in the low-order bits
 213 ///    of a 128-bit vector of [4 x float].
 214 ///
 215 /// \headerfile <x86intrin.h>
 216 ///
 217 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
 218 ///
 219 /// \param __a
 220 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 221 ///    used in the calculation.
 222 /// \returns A 128-bit vector of [4 x float] containing the square root of the
 223 ///    value in the low-order bits of the operand.
 224 static __inline__ __m128 __DEFAULT_FN_ATTRS
 225 _mm_sqrt_ss(__m128 __a)
 226 {
 227   __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a);
 228   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 229 }
 230
 231 /// \brief Calculates the square roots of the values stored in a 128-bit vector
 232 ///    of [4 x float].
 233 ///
 234 /// \headerfile <x86intrin.h>
 235 ///
 236 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
 237 ///
 238 /// \param __a
 239 ///    A 128-bit vector of [4 x float].
 240 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
 241 ///    values in the operand.
 242 static __inline__ __m128 __DEFAULT_FN_ATTRS
 243 _mm_sqrt_ps(__m128 __a)
 244 {
 245   return __builtin_ia32_sqrtps((__v4sf)__a);
 246 }
 247
 248 /// \brief Calculates the approximate reciprocal of the value stored in the
 249 ///    low-order bits of a 128-bit vector of [4 x float].
 250 ///
 251 /// \headerfile <x86intrin.h>
 252 ///
 253 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
 254 ///
 255 /// \param __a
 256 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 257 ///    used in the calculation.
 258 /// \returns A 128-bit vector of [4 x float] containing the approximate
 259 ///    reciprocal of the value in the low-order bits of the operand.
 260 static __inline__ __m128 __DEFAULT_FN_ATTRS
 261 _mm_rcp_ss(__m128 __a)
 262 {
 263   __m128 __c = __builtin_ia32_rcpss((__v4sf)__a);
 264   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 265 }
 266
 267 /// \brief Calculates the approximate reciprocals of the values stored in a
 268 ///    128-bit vector of [4 x float].
 269 ///
 270 /// \headerfile <x86intrin.h>
 271 ///
 272 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
 273 ///
 274 /// \param __a
 275 ///    A 128-bit vector of [4 x float].
 276 /// \returns A 128-bit vector of [4 x float] containing the approximate
 277 ///    reciprocals of the values in the operand.
 278 static __inline__ __m128 __DEFAULT_FN_ATTRS
 279 _mm_rcp_ps(__m128 __a)
 280 {
 281   return __builtin_ia32_rcpps((__v4sf)__a);
 282 }
 283
 284 /// \brief Calculates the approximate reciprocal of the square root of the value
 285 ///    stored in the low-order bits of a 128-bit vector of [4 x float].
 286 ///
 287 /// \headerfile <x86intrin.h>
 288 ///
 289 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
 290 ///
 291 /// \param __a
 292 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 293 ///    used in the calculation.
 294 /// \returns A 128-bit vector of [4 x float] containing the approximate
 295 ///    reciprocal of the square root of the value in the low-order bits of the
 296 ///    operand.
 297 static __inline__ __m128 __DEFAULT_FN_ATTRS
 298 _mm_rsqrt_ss(__m128 __a)
 299 {
 300   __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a);
 301   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 302 }
 303
 304 /// \brief Calculates the approximate reciprocals of the square roots of the
 305 ///    values stored in a 128-bit vector of [4 x float].
 306 ///
 307 /// \headerfile <x86intrin.h>
 308 ///
 309 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
 310 ///
 311 /// \param __a
 312 ///    A 128-bit vector of [4 x float].
 313 /// \returns A 128-bit vector of [4 x float] containing the approximate
 314 ///    reciprocals of the square roots of the values in the operand.
 315 static __inline__ __m128 __DEFAULT_FN_ATTRS
 316 _mm_rsqrt_ps(__m128 __a)
 317 {
 318   return __builtin_ia32_rsqrtps((__v4sf)__a);
 319 }
 320
 321 /// \brief Compares two 32-bit float values in the low-order bits of both
 322 ///    operands and returns the lesser value in the low-order bits of the
 323 ///    vector of [4 x float].
 324 ///
 325 /// \headerfile <x86intrin.h>
 326 ///
 327 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
 328 ///
 329 /// \param __a
 330 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 331 ///    32 bits of this operand are used in the comparison.
 332 /// \param __b
 333 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 334 ///    32 bits of this operand are used in the comparison.
 335 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
 336 ///    minimum value between both operands. The upper 96 bits are copied from
 337 ///    the upper 96 bits of the first source operand.
 338 static __inline__ __m128 __DEFAULT_FN_ATTRS
 339 _mm_min_ss(__m128 __a, __m128 __b)
 340 {
 341   return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
 342 }
 343
 344 /// \brief Compares two 128-bit vectors of [4 x float] and returns the lesser
 345 ///    of each pair of values.
 346 ///
 347 /// \headerfile <x86intrin.h>
 348 ///
 349 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
 350 ///
 351 /// \param __a
 352 ///    A 128-bit vector of [4 x float] containing one of the operands.
 353 /// \param __b
 354 ///    A 128-bit vector of [4 x float] containing one of the operands.
 355 /// \returns A 128-bit vector of [4 x float] containing the minimum values
 356 ///    between both operands.
 357 static __inline__ __m128 __DEFAULT_FN_ATTRS
 358 _mm_min_ps(__m128 __a, __m128 __b)
 359 {
 360   return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
 361 }
 362
 363 /// \brief Compares two 32-bit float values in the low-order bits of both
 364 ///    operands and returns the greater value in the low-order bits of a 128-bit
 365 ///    vector of [4 x float].
 366 ///
 367 /// \headerfile <x86intrin.h>
 368 ///
 369 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
 370 ///
 371 /// \param __a
 372 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 373 ///    32 bits of this operand are used in the comparison.
 374 /// \param __b
 375 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 376 ///    32 bits of this operand are used in the comparison.
 377 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
 378 ///    maximum value between both operands. The upper 96 bits are copied from
 379 ///    the upper 96 bits of the first source operand.
 380 static __inline__ __m128 __DEFAULT_FN_ATTRS
 381 _mm_max_ss(__m128 __a, __m128 __b)
 382 {
 383   return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
 384 }
 385
 386 /// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
 387 ///    of each pair of values.
 388 ///
 389 /// \headerfile <x86intrin.h>
 390 ///
 391 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
 392 ///
 393 /// \param __a
 394 ///    A 128-bit vector of [4 x float] containing one of the operands.
 395 /// \param __b
 396 ///    A 128-bit vector of [4 x float] containing one of the operands.
 397 /// \returns A 128-bit vector of [4 x float] containing the maximum values
 398 ///    between both operands.
 399 static __inline__ __m128 __DEFAULT_FN_ATTRS
 400 _mm_max_ps(__m128 __a, __m128 __b)
 401 {
 402   return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
 403 }
 404
 405 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
 406 ///
 407 /// \headerfile <x86intrin.h>
 408 ///
 409 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
 410 ///
 411 /// \param __a
 412 ///    A 128-bit vector containing one of the source operands.
 413 /// \param __b
 414 ///    A 128-bit vector containing one of the source operands.
 415 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
 416 ///    values between both operands.
 417 static __inline__ __m128 __DEFAULT_FN_ATTRS
 418 _mm_and_ps(__m128 __a, __m128 __b)
 419 {
 420   return (__m128)((__v4su)__a & (__v4su)__b);
 421 }
 422
 423 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
 424 ///    the one's complement of the values contained in the first source
 425 ///    operand.
 426 ///
 427 /// \headerfile <x86intrin.h>
 428 ///
 429 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
 430 ///
 431 /// \param __a
 432 ///    A 128-bit vector of [4 x float] containing the first source operand. The
 433 ///    one's complement of this value is used in the bitwise AND.
 434 /// \param __b
 435 ///    A 128-bit vector of [4 x float] containing the second source operand.
 436 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
 437 ///    one's complement of the first operand and the values in the second
 438 ///    operand.
 439 static __inline__ __m128 __DEFAULT_FN_ATTRS
 440 _mm_andnot_ps(__m128 __a, __m128 __b)
 441 {
 442   return (__m128)(~(__v4su)__a & (__v4su)__b);
 443 }
 444
 445 /// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
 446 ///
 447 /// \headerfile <x86intrin.h>
 448 ///
 449 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
 450 ///
 451 /// \param __a
 452 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 453 /// \param __b
 454 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 455 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
 456 ///    values between both operands.
 457 static __inline__ __m128 __DEFAULT_FN_ATTRS
 458 _mm_or_ps(__m128 __a, __m128 __b)
 459 {
 460   return (__m128)((__v4su)__a | (__v4su)__b);
 461 }
 462
 463 /// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
 464 ///    [4 x float].
 465 ///
 466 /// \headerfile <x86intrin.h>
 467 ///
 468 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
 469 ///
 470 /// \param __a
 471 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 472 /// \param __b
 473 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 474 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
 475 ///    of the values between both operands.
 476 static __inline__ __m128 __DEFAULT_FN_ATTRS
 477 _mm_xor_ps(__m128 __a, __m128 __b)
 478 {
 479   return (__m128)((__v4su)__a ^ (__v4su)__b);
 480 }
 481
 482 /// \brief Compares two 32-bit float values in the low-order bits of both
 483 ///    operands for equality and returns the result of the comparison in the
 484 ///    low-order bits of a vector [4 x float].
 485 ///
 486 /// \headerfile <x86intrin.h>
 487 ///
 488 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
 489 ///
 490 /// \param __a
 491 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 492 ///    32 bits of this operand are used in the comparison.
 493 /// \param __b
 494 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 495 ///    32 bits of this operand are used in the comparison.
 496 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 497 ///    in the low-order bits.
 498 static __inline__ __m128 __DEFAULT_FN_ATTRS
 499 _mm_cmpeq_ss(__m128 __a, __m128 __b)
 500 {
 501   return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
 502 }
 503
 504 /// \brief Compares each of the corresponding 32-bit float values of the
 505 ///    128-bit vectors of [4 x float] for equality.
 506 ///
 507 /// \headerfile <x86intrin.h>
 508 ///
 509 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
 510 ///
 511 /// \param __a
 512 ///    A 128-bit vector of [4 x float].
 513 /// \param __b
 514 ///    A 128-bit vector of [4 x float].
 515 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 516 static __inline__ __m128 __DEFAULT_FN_ATTRS
 517 _mm_cmpeq_ps(__m128 __a, __m128 __b)
 518 {
 519   return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
 520 }
 521
 522 /// \brief Compares two 32-bit float values in the low-order bits of both
 523 ///    operands to determine if the value in the first operand is less than the
 524 ///    corresponding value in the second operand and returns the result of the
 525 ///    comparison in the low-order bits of a vector of [4 x float].
 526 ///
 527 /// \headerfile <x86intrin.h>
 528 ///
 529 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
 530 ///
 531 /// \param __a
 532 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 533 ///    32 bits of this operand are used in the comparison.
 534 /// \param __b
 535 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 536 ///    32 bits of this operand are used in the comparison.
 537 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 538 ///    in the low-order bits.
 539 static __inline__ __m128 __DEFAULT_FN_ATTRS
 540 _mm_cmplt_ss(__m128 __a, __m128 __b)
 541 {
 542   return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
 543 }
 544
 545 /// \brief Compares each of the corresponding 32-bit float values of the
 546 ///    128-bit vectors of [4 x float] to determine if the values in the first
 547 ///    operand are less than those in the second operand.
 548 ///
 549 /// \headerfile <x86intrin.h>
 550 ///
 551 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
 552 ///
 553 /// \param __a
 554 ///    A 128-bit vector of [4 x float].
 555 /// \param __b
 556 ///    A 128-bit vector of [4 x float].
 557 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 558 static __inline__ __m128 __DEFAULT_FN_ATTRS
 559 _mm_cmplt_ps(__m128 __a, __m128 __b)
 560 {
 561   return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
 562 }
 563
 564 /// \brief Compares two 32-bit float values in the low-order bits of both
 565 ///    operands to determine if the value in the first operand is less than or
 566 ///    equal to the corresponding value in the second operand and returns the
 567 ///    result of the comparison in the low-order bits of a vector of
 568 ///    [4 x float].
 569 ///
 570 /// \headerfile <x86intrin.h>
 571 ///
 572 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
 573 ///
 574 /// \param __a
 575 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 576 ///    32 bits of this operand are used in the comparison.
 577 /// \param __b
 578 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 579 ///    32 bits of this operand are used in the comparison.
 580 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 581 ///    in the low-order bits.
 582 static __inline__ __m128 __DEFAULT_FN_ATTRS
 583 _mm_cmple_ss(__m128 __a, __m128 __b)
 584 {
 585   return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
 586 }
 587
 588 /// \brief Compares each of the corresponding 32-bit float values of the
 589 ///    128-bit vectors of [4 x float] to determine if the values in the first
 590 ///    operand are less than or equal to those in the second operand.
 591 ///
 592 /// \headerfile <x86intrin.h>
 593 ///
 594 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
 595 ///
 596 /// \param __a
 597 ///    A 128-bit vector of [4 x float].
 598 /// \param __b
 599 ///    A 128-bit vector of [4 x float].
 600 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 601 static __inline__ __m128 __DEFAULT_FN_ATTRS
 602 _mm_cmple_ps(__m128 __a, __m128 __b)
 603 {
 604   return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
 605 }
 606
 607 /// \brief Compares two 32-bit float values in the low-order bits of both
 608 ///    operands to determine if the value in the first operand is greater than
 609 ///    the corresponding value in the second operand and returns the result of
 610 ///    the comparison in the low-order bits of a vector of [4 x float].
 611 ///
 612 /// \headerfile <x86intrin.h>
 613 ///
 614 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
 615 ///
 616 /// \param __a
 617 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 618 ///    32 bits of this operand are used in the comparison.
 619 /// \param __b
 620 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 621 ///    32 bits of this operand are used in the comparison.
 622 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 623 ///    in the low-order bits.
 624 static __inline__ __m128 __DEFAULT_FN_ATTRS
 625 _mm_cmpgt_ss(__m128 __a, __m128 __b)
 626 {
 627   return (__m128)__builtin_shufflevector((__v4sf)__a,
 628                                          (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
 629                                          4, 1, 2, 3);
 630 }
 631
 632 /// \brief Compares each of the corresponding 32-bit float values of the
 633 ///    128-bit vectors of [4 x float] to determine if the values in the first
 634 ///    operand are greater than those in the second operand.
 635 ///
 636 /// \headerfile <x86intrin.h>
 637 ///
 638 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
 639 ///
 640 /// \param __a
 641 ///    A 128-bit vector of [4 x float].
 642 /// \param __b
 643 ///    A 128-bit vector of [4 x float].
 644 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 645 static __inline__ __m128 __DEFAULT_FN_ATTRS
 646 _mm_cmpgt_ps(__m128 __a, __m128 __b)
 647 {
 648   return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
 649 }
 650
 651 /// \brief Compares two 32-bit float values in the low-order bits of both
 652 ///    operands to determine if the value in the first operand is greater than
 653 ///    or equal to the corresponding value in the second operand and returns
 654 ///    the result of the comparison in the low-order bits of a vector of
 655 ///    [4 x float].
 656 ///
 657 /// \headerfile <x86intrin.h>
 658 ///
 659 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
 660 ///
 661 /// \param __a
 662 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 663 ///    32 bits of this operand are used in the comparison.
 664 /// \param __b
 665 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 666 ///    32 bits of this operand are used in the comparison.
 667 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 668 ///    in the low-order bits.
 669 static __inline__ __m128 __DEFAULT_FN_ATTRS
 670 _mm_cmpge_ss(__m128 __a, __m128 __b)
 671 {
 672   return (__m128)__builtin_shufflevector((__v4sf)__a,
 673                                          (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
 674                                          4, 1, 2, 3);
 675 }
 676
 677 /// \brief Compares each of the corresponding 32-bit float values of the
 678 ///    128-bit vectors of [4 x float] to determine if the values in the first
 679 ///    operand are greater than or equal to those in the second operand.
 680 ///
 681 /// \headerfile <x86intrin.h>
 682 ///
 683 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
 684 ///
 685 /// \param __a
 686 ///    A 128-bit vector of [4 x float].
 687 /// \param __b
 688 ///    A 128-bit vector of [4 x float].
 689 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 690 static __inline__ __m128 __DEFAULT_FN_ATTRS
 691 _mm_cmpge_ps(__m128 __a, __m128 __b)
 692 {
 693   return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
 694 }
 695
 696 /// \brief Compares two 32-bit float values in the low-order bits of both
 697 ///    operands for inequality and returns the result of the comparison in the
 698 ///    low-order bits of a vector of [4 x float].
 699 ///
 700 /// \headerfile <x86intrin.h>
 701 ///
 702 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
 703 ///   instructions.
 704 ///
 705 /// \param __a
 706 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 707 ///    32 bits of this operand are used in the comparison.
 708 /// \param __b
 709 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 710 ///    32 bits of this operand are used in the comparison.
 711 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 712 ///    in the low-order bits.
 713 static __inline__ __m128 __DEFAULT_FN_ATTRS
 714 _mm_cmpneq_ss(__m128 __a, __m128 __b)
 715 {
 716   return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
 717 }
 718
 719 /// \brief Compares each of the corresponding 32-bit float values of the
 720 ///    128-bit vectors of [4 x float] for inequality.
 721 ///
 722 /// \headerfile <x86intrin.h>
 723 ///
 724 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
 725 ///   instructions.
 726 ///
 727 /// \param __a
 728 ///    A 128-bit vector of [4 x float].
 729 /// \param __b
 730 ///    A 128-bit vector of [4 x float].
 731 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 732 static __inline__ __m128 __DEFAULT_FN_ATTRS
 733 _mm_cmpneq_ps(__m128 __a, __m128 __b)
 734 {
 735   return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
 736 }
 737
 738 /// \brief Compares two 32-bit float values in the low-order bits of both
 739 ///    operands to determine if the value in the first operand is not less than
 740 ///    the corresponding value in the second operand and returns the result of
 741 ///    the comparison in the low-order bits of a vector of [4 x float].
 742 ///
 743 /// \headerfile <x86intrin.h>
 744 ///
 745 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
 746 ///   instructions.
 747 ///
 748 /// \param __a
 749 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 750 ///    32 bits of this operand are used in the comparison.
 751 /// \param __b
 752 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 753 ///    32 bits of this operand are used in the comparison.
 754 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 755 ///    in the low-order bits.
 756 static __inline__ __m128 __DEFAULT_FN_ATTRS
 757 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
 758 {
 759   return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
 760 }
 761
 762 /// \brief Compares each of the corresponding 32-bit float values of the
 763 ///    128-bit vectors of [4 x float] to determine if the values in the first
 764 ///    operand are not less than those in the second operand.
 765 ///
 766 /// \headerfile <x86intrin.h>
 767 ///
 768 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
 769 ///   instructions.
 770 ///
 771 /// \param __a
 772 ///    A 128-bit vector of [4 x float].
 773 /// \param __b
 774 ///    A 128-bit vector of [4 x float].
 775 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 776 static __inline__ __m128 __DEFAULT_FN_ATTRS
 777 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
 778 {
 779   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
 780 }
 781
 782 /// \brief Compares two 32-bit float values in the low-order bits of both
 783 ///    operands to determine if the value in the first operand is not less than
 784 ///    or equal to the corresponding value in the second operand and returns
 785 ///    the result of the comparison in the low-order bits of a vector of
 786 ///    [4 x float].
 787 ///
 788 /// \headerfile <x86intrin.h>
 789 ///
 790 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
 791 ///   instructions.
 792 ///
 793 /// \param __a
 794 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 795 ///    32 bits of this operand are used in the comparison.
 796 /// \param __b
 797 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 798 ///    32 bits of this operand are used in the comparison.
 799 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 800 ///    in the low-order bits.
 801 static __inline__ __m128 __DEFAULT_FN_ATTRS
 802 _mm_cmpnle_ss(__m128 __a, __m128 __b)
 803 {
 804   return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
 805 }
 806
 807 /// \brief Compares each of the corresponding 32-bit float values of the
 808 ///    128-bit vectors of [4 x float] to determine if the values in the first
 809 ///    operand are not less than or equal to those in the second operand.
 810 ///
 811 /// \headerfile <x86intrin.h>
 812 ///
 813 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
 814 ///   instructions.
 815 ///
 816 /// \param __a
 817 ///    A 128-bit vector of [4 x float].
 818 /// \param __b
 819 ///    A 128-bit vector of [4 x float].
 820 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 821 static __inline__ __m128 __DEFAULT_FN_ATTRS
 822 _mm_cmpnle_ps(__m128 __a, __m128 __b)
 823 {
 824   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
 825 }
 826
 827 /// \brief Compares two 32-bit float values in the low-order bits of both
 828 ///    operands to determine if the value in the first operand is not greater
 829 ///    than the corresponding value in the second operand and returns the
 830 ///    result of the comparison in the low-order bits of a vector of
 831 ///    [4 x float].
 832 ///
 833 /// \headerfile <x86intrin.h>
 834 ///
 835 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
 836 ///   instructions.
 837 ///
 838 /// \param __a
 839 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 840 ///    32 bits of this operand are used in the comparison.
 841 /// \param __b
 842 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 843 ///    32 bits of this operand are used in the comparison.
 844 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 845 ///    in the low-order bits.
 846 static __inline__ __m128 __DEFAULT_FN_ATTRS
 847 _mm_cmpngt_ss(__m128 __a, __m128 __b)
 848 {
 849   return (__m128)__builtin_shufflevector((__v4sf)__a,
 850                                          (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
 851                                          4, 1, 2, 3);
 852 }
 853
 854 /// \brief Compares each of the corresponding 32-bit float values of the
 855 ///    128-bit vectors of [4 x float] to determine if the values in the first
 856 ///    operand are not greater than those in the second operand.
 857 ///
 858 /// \headerfile <x86intrin.h>
 859 ///
 860 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
 861 ///   instructions.
 862 ///
 863 /// \param __a
 864 ///    A 128-bit vector of [4 x float].
 865 /// \param __b
 866 ///    A 128-bit vector of [4 x float].
 867 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 868 static __inline__ __m128 __DEFAULT_FN_ATTRS
 869 _mm_cmpngt_ps(__m128 __a, __m128 __b)
 870 {
 871   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
 872 }
 873
 874 /// \brief Compares two 32-bit float values in the low-order bits of both
 875 ///    operands to determine if the value in the first operand is not greater
 876 ///    than or equal to the corresponding value in the second operand and
 877 ///    returns the result of the comparison in the low-order bits of a vector
 878 ///    of [4 x float].
 879 ///
 880 /// \headerfile <x86intrin.h>
 881 ///
 882 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
 883 ///   instructions.
 884 ///
 885 /// \param __a
 886 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 887 ///    32 bits of this operand are used in the comparison.
 888 /// \param __b
 889 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 890 ///    32 bits of this operand are used in the comparison.
 891 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 892 ///    in the low-order bits.
 893 static __inline__ __m128 __DEFAULT_FN_ATTRS
 894 _mm_cmpnge_ss(__m128 __a, __m128 __b)
 895 {
 896   return (__m128)__builtin_shufflevector((__v4sf)__a,
 897                                          (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
 898                                          4, 1, 2, 3);
 899 }
 900
 901 /// \brief Compares each of the corresponding 32-bit float values of the
 902 ///    128-bit vectors of [4 x float] to determine if the values in the first
 903 ///    operand are not greater than or equal to those in the second operand.
 904 ///
 905 /// \headerfile <x86intrin.h>
 906 ///
 907 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
 908 ///   instructions.
 909 ///
 910 /// \param __a
 911 ///    A 128-bit vector of [4 x float].
 912 /// \param __b
 913 ///    A 128-bit vector of [4 x float].
 914 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 915 static __inline__ __m128 __DEFAULT_FN_ATTRS
 916 _mm_cmpnge_ps(__m128 __a, __m128 __b)
 917 {
 918   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
 919 }
 920
 921 /// \brief Compares two 32-bit float values in the low-order bits of both
 922 ///    operands to determine if the value in the first operand is ordered with
 923 ///    respect to the corresponding value in the second operand and returns the
 924 ///    result of the comparison in the low-order bits of a vector of
 925 ///    [4 x float].
 926 ///
 927 /// \headerfile <x86intrin.h>
 928 ///
 929 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
 930 ///   instructions.
 931 ///
 932 /// \param __a
 933 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 934 ///    32 bits of this operand are used in the comparison.
 935 /// \param __b
 936 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 937 ///    32 bits of this operand are used in the comparison.
 938 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 939 ///    in the low-order bits.
 940 static __inline__ __m128 __DEFAULT_FN_ATTRS
 941 _mm_cmpord_ss(__m128 __a, __m128 __b)
 942 {
 943   return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
 944 }
 945
 946 /// \brief Compares each of the corresponding 32-bit float values of the
 947 ///    128-bit vectors of [4 x float] to determine if the values in the first
 948 ///    operand are ordered with respect to those in the second operand.
 949 ///
 950 /// \headerfile <x86intrin.h>
 951 ///
 952 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
 953 ///   instructions.
 954 ///
 955 /// \param __a
 956 ///    A 128-bit vector of [4 x float].
 957 /// \param __b
 958 ///    A 128-bit vector of [4 x float].
 959 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 960 static __inline__ __m128 __DEFAULT_FN_ATTRS
 961 _mm_cmpord_ps(__m128 __a, __m128 __b)
 962 {
 963   return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
 964 }
 965
 966 /// \brief Compares two 32-bit float values in the low-order bits of both
 967 ///    operands to determine if the value in the first operand is unordered
 968 ///    with respect to the corresponding value in the second operand and
 969 ///    returns the result of the comparison in the low-order bits of a vector
 970 ///    of [4 x float].
 971 ///
 972 /// \headerfile <x86intrin.h>
 973 ///
 974 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
 975 ///   instructions.
 976 ///
 977 /// \param __a
 978 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 979 ///    32 bits of this operand are used in the comparison.
 980 /// \param __b
 981 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 982 ///    32 bits of this operand are used in the comparison.
 983 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 984 ///    in the low-order bits.
 985 static __inline__ __m128 __DEFAULT_FN_ATTRS
 986 _mm_cmpunord_ss(__m128 __a, __m128 __b)
 987 {
 988   return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
 989 }
 990
 991 /// \brief Compares each of the corresponding 32-bit float values of the
 992 ///    128-bit vectors of [4 x float] to determine if the values in the first
 993 ///    operand are unordered with respect to those in the second operand.
 994 ///
 995 /// \headerfile <x86intrin.h>
 996 ///
 997 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
 998 ///   instructions.
 999 ///
1000 /// \param __a
1001 ///    A 128-bit vector of [4 x float].
1002 /// \param __b
1003 ///    A 128-bit vector of [4 x float].
1004 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1005 static __inline__ __m128 __DEFAULT_FN_ATTRS
1006 _mm_cmpunord_ps(__m128 __a, __m128 __b)
1007 {
1008   return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1009 }
1010
1011 /// \brief Compares two 32-bit float values in the low-order bits of both
1012 ///    operands for equality and returns the result of the comparison.
1013 ///
1014 /// \headerfile <x86intrin.h>
1015 ///
1016 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1017 ///   instructions.
1018 ///
1019 /// \param __a
1020 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1021 ///    used in the comparison.
1022 /// \param __b
1023 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1024 ///    used in the comparison.
1025 /// \returns An integer containing the comparison results.
1026 static __inline__ int __DEFAULT_FN_ATTRS
1027 _mm_comieq_ss(__m128 __a, __m128 __b)
1028 {
1029   return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1030 }
1031
1032 /// \brief Compares two 32-bit float values in the low-order bits of both
1033 ///    operands to determine if the first operand is less than the second
1034 ///    operand and returns the result of the comparison.
1035 ///
1036 /// \headerfile <x86intrin.h>
1037 ///
1038 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1039 ///   instructions.
1040 ///
1041 /// \param __a
1042 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1043 ///    used in the comparison.
1044 /// \param __b
1045 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1046 ///    used in the comparison.
1047 /// \returns An integer containing the comparison results.
1048 static __inline__ int __DEFAULT_FN_ATTRS
1049 _mm_comilt_ss(__m128 __a, __m128 __b)
1050 {
1051   return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1052 }
1053
1054 /// \brief Compares two 32-bit float values in the low-order bits of both
1055 ///    operands to determine if the first operand is less than or equal to the
1056 ///    second operand and returns the result of the comparison.
1057 ///
1058 /// \headerfile <x86intrin.h>
1059 ///
1060 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1061 ///
1062 /// \param __a
1063 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1064 ///    used in the comparison.
1065 /// \param __b
1066 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1067 ///    used in the comparison.
1068 /// \returns An integer containing the comparison results.
1069 static __inline__ int __DEFAULT_FN_ATTRS
1070 _mm_comile_ss(__m128 __a, __m128 __b)
1071 {
1072   return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1073 }
1074
1075 /// \brief Compares two 32-bit float values in the low-order bits of both
1076 ///    operands to determine if the first operand is greater than the second
1077 ///    operand and returns the result of the comparison.
1078 ///
1079 /// \headerfile <x86intrin.h>
1080 ///
1081 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1082 ///
1083 /// \param __a
1084 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1085 ///    used in the comparison.
1086 /// \param __b
1087 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1088 ///    used in the comparison.
1089 /// \returns An integer containing the comparison results.
1090 static __inline__ int __DEFAULT_FN_ATTRS
1091 _mm_comigt_ss(__m128 __a, __m128 __b)
1092 {
1093   return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1094 }
1095
1096 /// \brief Compares two 32-bit float values in the low-order bits of both
1097 ///    operands to determine if the first operand is greater than or equal to
1098 ///    the second operand and returns the result of the comparison.
1099 ///
1100 /// \headerfile <x86intrin.h>
1101 ///
1102 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1103 ///
1104 /// \param __a
1105 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1106 ///    used in the comparison.
1107 /// \param __b
1108 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1109 ///    used in the comparison.
1110 /// \returns An integer containing the comparison results.
1111 static __inline__ int __DEFAULT_FN_ATTRS
1112 _mm_comige_ss(__m128 __a, __m128 __b)
1113 {
1114   return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1115 }
1116
1117 /// \brief Compares two 32-bit float values in the low-order bits of both
1118 ///    operands to determine if the first operand is not equal to the second
1119 ///    operand and returns the result of the comparison.
1120 ///
1121 /// \headerfile <x86intrin.h>
1122 ///
1123 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1124 ///
1125 /// \param __a
1126 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1127 ///    used in the comparison.
1128 /// \param __b
1129 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1130 ///    used in the comparison.
1131 /// \returns An integer containing the comparison results.
1132 static __inline__ int __DEFAULT_FN_ATTRS
1133 _mm_comineq_ss(__m128 __a, __m128 __b)
1134 {
1135   return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1136 }
1137
1138 /// \brief Performs an unordered comparison of two 32-bit float values using
1139 ///    the low-order bits of both operands to determine equality and returns
1140 ///    the result of the comparison.
1141 ///
1142 /// \headerfile <x86intrin.h>
1143 ///
1144 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1145 ///
1146 /// \param __a
1147 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1148 ///    used in the comparison.
1149 /// \param __b
1150 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1151 ///    used in the comparison.
1152 /// \returns An integer containing the comparison results.
1153 static __inline__ int __DEFAULT_FN_ATTRS
1154 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1155 {
1156   return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1157 }
1158
1159 /// \brief Performs an unordered comparison of two 32-bit float values using
1160 ///    the low-order bits of both operands to determine if the first operand is
1161 ///    less than the second operand and returns the result of the comparison.
1162 ///
1163 /// \headerfile <x86intrin.h>
1164 ///
1165 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1166 ///
1167 /// \param __a
1168 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1169 ///    used in the comparison.
1170 /// \param __b
1171 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1172 ///    used in the comparison.
1173 /// \returns An integer containing the comparison results.
1174 static __inline__ int __DEFAULT_FN_ATTRS
1175 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1176 {
1177   return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1178 }
1179
1180 /// \brief Performs an unordered comparison of two 32-bit float values using
1181 ///    the low-order bits of both operands to determine if the first operand is
1182 ///    less than or equal to the second operand and returns the result of the
1183 ///    comparison.
1184 ///
1185 /// \headerfile <x86intrin.h>
1186 ///
1187 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1188 ///
1189 /// \param __a
1190 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1191 ///    used in the comparison.
1192 /// \param __b
1193 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1194 ///    used in the comparison.
1195 /// \returns An integer containing the comparison results.
1196 static __inline__ int __DEFAULT_FN_ATTRS
1197 _mm_ucomile_ss(__m128 __a, __m128 __b)
1198 {
1199   return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1200 }
1201
1202 /// \brief Performs an unordered comparison of two 32-bit float values using
1203 ///    the low-order bits of both operands to determine if the first operand is
1204 ///    greater than the second operand and returns the result of the
1205 ///    comparison.
1206 ///
1207 /// \headerfile <x86intrin.h>
1208 ///
1209 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1210 ///
1211 /// \param __a
1212 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1213 ///    used in the comparison.
1214 /// \param __b
1215 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1216 ///    used in the comparison.
1217 /// \returns An integer containing the comparison results.
1218 static __inline__ int __DEFAULT_FN_ATTRS
1219 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1220 {
1221   return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1222 }
1223
1224 /// \brief Performs an unordered comparison of two 32-bit float values using
1225 ///    the low-order bits of both operands to determine if the first operand is
1226 ///    greater than or equal to the second operand and returns the result of
1227 ///    the comparison.
1228 ///
1229 /// \headerfile <x86intrin.h>
1230 ///
1231 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1232 ///
1233 /// \param __a
1234 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1235 ///    used in the comparison.
1236 /// \param __b
1237 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1238 ///    used in the comparison.
1239 /// \returns An integer containing the comparison results.
1240 static __inline__ int __DEFAULT_FN_ATTRS
1241 _mm_ucomige_ss(__m128 __a, __m128 __b)
1242 {
1243   return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1244 }
1245
1246 /// \brief Performs an unordered comparison of two 32-bit float values using
1247 ///    the low-order bits of both operands to determine inequality and returns
1248 ///    the result of the comparison.
1249 ///
1250 /// \headerfile <x86intrin.h>
1251 ///
1252 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1253 ///
1254 /// \param __a
1255 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1256 ///    used in the comparison.
1257 /// \param __b
1258 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1259 ///    used in the comparison.
1260 /// \returns An integer containing the comparison results.
1261 static __inline__ int __DEFAULT_FN_ATTRS
1262 _mm_ucomineq_ss(__m128 __a, __m128 __b)
1263 {
1264   return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1265 }
1266
1267 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1268 ///    [4 x float] into a 32-bit integer.
1269 ///
1270 /// \headerfile <x86intrin.h>
1271 ///
1272 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1273 ///   instructions.
1274 ///
1275 /// \param __a
1276 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1277 ///    used in the conversion.
1278 /// \returns A 32-bit integer containing the converted value.
1279 static __inline__ int __DEFAULT_FN_ATTRS
1280 _mm_cvtss_si32(__m128 __a)
1281 {
1282   return __builtin_ia32_cvtss2si((__v4sf)__a);
1283 }
1284
1285 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1286 ///    [4 x float] into a 32-bit integer.
1287 ///
1288 /// \headerfile <x86intrin.h>
1289 ///
1290 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1291 ///   instructions.
1292 ///
1293 /// \param __a
1294 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1295 ///    used in the conversion.
1296 /// \returns A 32-bit integer containing the converted value.
1297 static __inline__ int __DEFAULT_FN_ATTRS
1298 _mm_cvt_ss2si(__m128 __a)
1299 {
1300   return _mm_cvtss_si32(__a);
1301 }
1302
1303 #ifdef __x86_64__
1304
1305 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1306 ///    [4 x float] into a 64-bit integer.
1307 ///
1308 /// \headerfile <x86intrin.h>
1309 ///
1310 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1311 ///   instructions.
1312 ///
1313 /// \param __a
1314 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1315 ///    used in the conversion.
1316 /// \returns A 64-bit integer containing the converted value.
1317 static __inline__ long long __DEFAULT_FN_ATTRS
1318 _mm_cvtss_si64(__m128 __a)
1319 {
1320   return __builtin_ia32_cvtss2si64((__v4sf)__a);
1321 }
1322
1323 #endif
1324
1325 /// \brief Converts two low-order float values in a 128-bit vector of
1326 ///    [4 x float] into a 64-bit vector of [2 x i32].
1327 ///
1328 /// \headerfile <x86intrin.h>
1329 ///
1330 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1331 ///
1332 /// \param __a
1333 ///    A 128-bit vector of [4 x float].
1334 /// \returns A 64-bit integer vector containing the converted values.
1335 static __inline__ __m64 __DEFAULT_FN_ATTRS
1336 _mm_cvtps_pi32(__m128 __a)
1337 {
1338   return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1339 }
1340
1341 /// \brief Converts two low-order float values in a 128-bit vector of
1342 ///    [4 x float] into a 64-bit vector of [2 x i32].
1343 ///
1344 /// \headerfile <x86intrin.h>
1345 ///
1346 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1347 ///
1348 /// \param __a
1349 ///    A 128-bit vector of [4 x float].
1350 /// \returns A 64-bit integer vector containing the converted values.
1351 static __inline__ __m64 __DEFAULT_FN_ATTRS
1352 _mm_cvt_ps2pi(__m128 __a)
1353 {
1354   return _mm_cvtps_pi32(__a);
1355 }
1356
1357 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1358 ///    [4 x float] into a 32-bit integer, truncating the result when it is
1359 ///    inexact.
1360 ///
1361 /// \headerfile <x86intrin.h>
1362 ///
1363 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1364 ///   instructions.
1365 ///
1366 /// \param __a
1367 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1368 ///    used in the conversion.
1369 /// \returns A 32-bit integer containing the converted value.
1370 static __inline__ int __DEFAULT_FN_ATTRS
1371 _mm_cvttss_si32(__m128 __a)
1372 {
1373   return __builtin_ia32_cvttss2si((__v4sf)__a);
1374 }
1375
1376 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1377 ///    [4 x float] into a 32-bit integer, truncating the result when it is
1378 ///    inexact.
1379 ///
1380 /// \headerfile <x86intrin.h>
1381 ///
1382 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1383 ///   instructions.
1384 ///
1385 /// \param __a
1386 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1387 ///    used in the conversion.
1388 /// \returns A 32-bit integer containing the converted value.
1389 static __inline__ int __DEFAULT_FN_ATTRS
1390 _mm_cvtt_ss2si(__m128 __a)
1391 {
1392   return _mm_cvttss_si32(__a);
1393 }
1394
1395 #ifdef __x86_64__
1396 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1397 ///    [4 x float] into a 64-bit integer, truncating the result when it is
1398 ///    inexact.
1399 ///
1400 /// \headerfile <x86intrin.h>
1401 ///
1402 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1403 ///   instructions.
1404 ///
1405 /// \param __a
1406 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1407 ///    used in the conversion.
1408 /// \returns A 64-bit integer containing the converted value.
1409 static __inline__ long long __DEFAULT_FN_ATTRS
1410 _mm_cvttss_si64(__m128 __a)
1411 {
1412   return __builtin_ia32_cvttss2si64((__v4sf)__a);
1413 }
1414 #endif
1415
1416 /// \brief Converts two low-order float values in a 128-bit vector of
1417 ///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1418 ///    when it is inexact.
1419 ///
1420 /// \headerfile <x86intrin.h>
1421 ///
1422 /// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1423 ///   instructions.
1424 ///
1425 /// \param __a
1426 ///    A 128-bit vector of [4 x float].
1427 /// \returns A 64-bit integer vector containing the converted values.
1428 static __inline__ __m64 __DEFAULT_FN_ATTRS
1429 _mm_cvttps_pi32(__m128 __a)
1430 {
1431   return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1432 }
1433
1434 /// \brief Converts two low-order float values in a 128-bit vector of [4 x
1435 ///    float] into a 64-bit vector of [2 x i32], truncating the result when it
1436 ///    is inexact.
1437 ///
1438 /// \headerfile <x86intrin.h>
1439 ///
1440 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1441 ///
1442 /// \param __a
1443 ///    A 128-bit vector of [4 x float].
1444 /// \returns A 64-bit integer vector containing the converted values.
1445 static __inline__ __m64 __DEFAULT_FN_ATTRS
1446 _mm_cvtt_ps2pi(__m128 __a)
1447 {
1448   return _mm_cvttps_pi32(__a);
1449 }
1450
1451 /// \brief Converts a 32-bit signed integer value into a floating point value
1452 ///    and writes it to the lower 32 bits of the destination. The remaining
1453 ///    higher order elements of the destination vector are copied from the
1454 ///    corresponding elements in the first operand.
1455 ///
1456 /// \headerfile <x86intrin.h>
1457 ///
1458 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1459 ///
1460 /// \param __a
1461 ///    A 128-bit vector of [4 x float].
1462 /// \param __b
1463 ///    A 32-bit signed integer operand containing the value to be converted.
1464 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1465 ///    converted value of the second operand. The upper 96 bits are copied from
1466 ///    the upper 96 bits of the first operand.
1467 static __inline__ __m128 __DEFAULT_FN_ATTRS
1468 _mm_cvtsi32_ss(__m128 __a, int __b)
1469 {
1470   __a[0] = __b;
1471   return __a;
1472 }
1473
1474 /// \brief Converts a 32-bit signed integer value into a floating point value
1475 ///    and writes it to the lower 32 bits of the destination. The remaining
1476 ///    higher order elements of the destination are copied from the
1477 ///    corresponding elements in the first operand.
1478 ///
1479 /// \headerfile <x86intrin.h>
1480 ///
1481 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1482 ///
1483 /// \param __a
1484 ///    A 128-bit vector of [4 x float].
1485 /// \param __b
1486 ///    A 32-bit signed integer operand containing the value to be converted.
1487 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1488 ///    converted value of the second operand. The upper 96 bits are copied from
1489 ///    the upper 96 bits of the first operand.
1490 static __inline__ __m128 __DEFAULT_FN_ATTRS
1491 _mm_cvt_si2ss(__m128 __a, int __b)
1492 {
1493   return _mm_cvtsi32_ss(__a, __b);
1494 }
1495
1496 #ifdef __x86_64__
1497
1498 /// \brief Converts a 64-bit signed integer value into a floating point value
1499 ///    and writes it to the lower 32 bits of the destination. The remaining
1500 ///    higher order elements of the destination are copied from the
1501 ///    corresponding elements in the first operand.
1502 ///
1503 /// \headerfile <x86intrin.h>
1504 ///
1505 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1506 ///
1507 /// \param __a
1508 ///    A 128-bit vector of [4 x float].
1509 /// \param __b
1510 ///    A 64-bit signed integer operand containing the value to be converted.
1511 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1512 ///    converted value of the second operand. The upper 96 bits are copied from
1513 ///    the upper 96 bits of the first operand.
1514 static __inline__ __m128 __DEFAULT_FN_ATTRS
1515 _mm_cvtsi64_ss(__m128 __a, long long __b)
1516 {
1517   __a[0] = __b;
1518   return __a;
1519 }
1520
1521 #endif
1522
1523 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
1524 ///    floating point values and writes them to the lower 64-bits of the
1525 ///    destination. The remaining higher order elements of the destination are
1526 ///    copied from the corresponding elements in the first operand.
1527 ///
1528 /// \headerfile <x86intrin.h>
1529 ///
1530 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1531 ///
1532 /// \param __a
1533 ///    A 128-bit vector of [4 x float].
1534 /// \param __b
1535 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1536 ///    and written to the corresponding low-order elements in the destination.
1537 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1538 ///    converted value of the second operand. The upper 64 bits are copied from
1539 ///    the upper 64 bits of the first operand.
1540 static __inline__ __m128 __DEFAULT_FN_ATTRS
1541 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
1542 {
1543   return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1544 }
1545
1546 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
1547 ///    floating point values and writes them to the lower 64-bits of the
1548 ///    destination. The remaining higher order elements of the destination are
1549 ///    copied from the corresponding elements in the first operand.
1550 ///
1551 /// \headerfile <x86intrin.h>
1552 ///
1553 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1554 ///
1555 /// \param __a
1556 ///    A 128-bit vector of [4 x float].
1557 /// \param __b
1558 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1559 ///    and written to the corresponding low-order elements in the destination.
1560 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1561 ///    converted value from the second operand. The upper 64 bits are copied
1562 ///    from the upper 64 bits of the first operand.
1563 static __inline__ __m128 __DEFAULT_FN_ATTRS
1564 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
1565 {
1566   return _mm_cvtpi32_ps(__a, __b);
1567 }
1568
1569 /// \brief Extracts a float value contained in the lower 32 bits of a vector of
1570 ///    [4 x float].
1571 ///
1572 /// \headerfile <x86intrin.h>
1573 ///
1574 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1575 ///
1576 /// \param __a
1577 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1578 ///    used in the extraction.
1579 /// \returns A 32-bit float containing the extracted value.
1580 static __inline__ float __DEFAULT_FN_ATTRS
1581 _mm_cvtss_f32(__m128 __a)
1582 {
1583   return __a[0];
1584 }
1585
1586 /// \brief Loads two packed float values from the address \a __p into the
1587 ///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1588 ///     are copied from the low-order bits of the first operand.
1589 ///
1590 /// \headerfile <x86intrin.h>
1591 ///
1592 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1593 ///
1594 /// \param __a
1595 ///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1596 ///    of the destination.
1597 /// \param __p
1598 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1599 ///    [127:64] of the destination.
1600 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1601 static __inline__ __m128 __DEFAULT_FN_ATTRS
1602 _mm_loadh_pi(__m128 __a, const __m64 *__p)
1603 {
1604   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1605   struct __mm_loadh_pi_struct {
1606     __mm_loadh_pi_v2f32 __u;
1607   } __attribute__((__packed__, __may_alias__));
1608   __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
1609   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1610   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1611 }
1612
1613 /// \brief Loads two packed float values from the address \a __p into the
1614 ///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1615 ///    are copied from the high-order bits of the first operand.
1616 ///
1617 /// \headerfile <x86intrin.h>
1618 ///
1619 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1620 ///
1621 /// \param __a
1622 ///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1623 ///    [127:64] of the destination.
1624 /// \param __p
1625 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1626 ///    [63:0] of the destination.
1627 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1628 static __inline__ __m128 __DEFAULT_FN_ATTRS
1629 _mm_loadl_pi(__m128 __a, const __m64 *__p)
1630 {
1631   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1632   struct __mm_loadl_pi_struct {
1633     __mm_loadl_pi_v2f32 __u;
1634   } __attribute__((__packed__, __may_alias__));
1635   __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
1636   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1637   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1638 }
1639
1640 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
1641 ///    32 bits of the vector are initialized with the single-precision
1642 ///    floating-point value loaded from a specified memory location. The upper
1643 ///    96 bits are set to zero.
1644 ///
1645 /// \headerfile <x86intrin.h>
1646 ///
1647 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1648 ///
1649 /// \param __p
1650 ///    A pointer to a 32-bit memory location containing a single-precision
1651 ///    floating-point value.
1652 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1653 ///    lower 32 bits contain the value loaded from the memory location. The
1654 ///    upper 96 bits are set to zero.
1655 static __inline__ __m128 __DEFAULT_FN_ATTRS
1656 _mm_load_ss(const float *__p)
1657 {
1658   struct __mm_load_ss_struct {
1659     float __u;
1660   } __attribute__((__packed__, __may_alias__));
1661   float __u = ((struct __mm_load_ss_struct*)__p)->__u;
1662   return (__m128){ __u, 0, 0, 0 };
1663 }
1664
1665 /// \brief Loads a 32-bit float value and duplicates it to all four vector
1666 ///    elements of a 128-bit vector of [4 x float].
1667 ///
1668 /// \headerfile <x86intrin.h>
1669 ///
1670 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS + shuffling </c>
1671 ///    instruction.
1672 ///
1673 /// \param __p
1674 ///    A pointer to a float value to be loaded and duplicated.
1675 /// \returns A 128-bit vector of [4 x float] containing the loaded and
1676 ///    duplicated values.
1677 static __inline__ __m128 __DEFAULT_FN_ATTRS
1678 _mm_load1_ps(const float *__p)
1679 {
1680   struct __mm_load1_ps_struct {
1681     float __u;
1682   } __attribute__((__packed__, __may_alias__));
1683   float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
1684   return (__m128){ __u, __u, __u, __u };
1685 }
1686
/* Alternate spelling of _mm_load1_ps; expands to a direct call with the same
   argument. */
#define        _mm_load_ps1(p) _mm_load1_ps(p)
1688
1689 /// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
1690 ///    memory location.
1691 ///
1692 /// \headerfile <x86intrin.h>
1693 ///
1694 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1695 ///
1696 /// \param __p
1697 ///    A pointer to a 128-bit memory location. The address of the memory
1698 ///    location has to be 128-bit aligned.
1699 /// \returns A 128-bit vector of [4 x float] containing the loaded valus.
1700 static __inline__ __m128 __DEFAULT_FN_ATTRS
1701 _mm_load_ps(const float *__p)
1702 {
1703   return *(__m128*)__p;
1704 }
1705
1706 /// \brief Loads a 128-bit floating-point vector of [4 x float] from an
1707 ///    unaligned memory location.
1708 ///
1709 /// \headerfile <x86intrin.h>
1710 ///
1711 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1712 ///
1713 /// \param __p
1714 ///    A pointer to a 128-bit memory location. The address of the memory
1715 ///    location does not have to be aligned.
1716 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1717 static __inline__ __m128 __DEFAULT_FN_ATTRS
1718 _mm_loadu_ps(const float *__p)
1719 {
1720   struct __loadu_ps {
1721     __m128 __v;
1722   } __attribute__((__packed__, __may_alias__));
1723   return ((struct __loadu_ps*)__p)->__v;
1724 }
1725
1726 /// \brief Loads four packed float values, in reverse order, from an aligned
1727 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1728 ///
1729 /// \headerfile <x86intrin.h>
1730 ///
1731 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1732 ///    instruction.
1733 ///
1734 /// \param __p
1735 ///    A pointer to a 128-bit memory location. The address of the memory
1736 ///    location has to be 128-bit aligned.
1737 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1738 ///    in reverse order.
1739 static __inline__ __m128 __DEFAULT_FN_ATTRS
1740 _mm_loadr_ps(const float *__p)
1741 {
1742   __m128 __a = _mm_load_ps(__p);
1743   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1744 }
1745
1746 /// \brief Create a 128-bit vector of [4 x float] with undefined values.
1747 ///
1748 /// \headerfile <x86intrin.h>
1749 ///
1750 /// This intrinsic has no corresponding instruction.
1751 ///
1752 /// \returns A 128-bit vector of [4 x float] containing undefined values.
1753 static __inline__ __m128 __DEFAULT_FN_ATTRS
1754 _mm_undefined_ps(void)
1755 {
1756   return (__m128)__builtin_ia32_undef128();
1757 }
1758
1759 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
1760 ///    32 bits of the vector are initialized with the specified single-precision
1761 ///    floating-point value. The upper 96 bits are set to zero.
1762 ///
1763 /// \headerfile <x86intrin.h>
1764 ///
1765 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1766 ///
1767 /// \param __w
1768 ///    A single-precision floating-point value used to initialize the lower 32
1769 ///    bits of the result.
1770 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1771 ///    lower 32 bits contain the value provided in the source operand. The
1772 ///    upper 96 bits are set to zero.
1773 static __inline__ __m128 __DEFAULT_FN_ATTRS
1774 _mm_set_ss(float __w)
1775 {
1776   return (__m128){ __w, 0, 0, 0 };
1777 }
1778
1779 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
1780 ///    of the four single-precision floating-point vector elements set to the
1781 ///    specified single-precision floating-point value.
1782 ///
1783 /// \headerfile <x86intrin.h>
1784 ///
1785 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1786 ///
1787 /// \param __w
1788 ///    A single-precision floating-point value used to initialize each vector
1789 ///    element of the result.
1790 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1791 static __inline__ __m128 __DEFAULT_FN_ATTRS
1792 _mm_set1_ps(float __w)
1793 {
1794   return (__m128){ __w, __w, __w, __w };
1795 }
1796
1797 /* Microsoft specific. */
1798 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
1799 ///    of the four single-precision floating-point vector elements set to the
1800 ///    specified single-precision floating-point value.
1801 ///
1802 /// \headerfile <x86intrin.h>
1803 ///
1804 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1805 ///
1806 /// \param __w
1807 ///    A single-precision floating-point value used to initialize each vector
1808 ///    element of the result.
1809 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1810 static __inline__ __m128 __DEFAULT_FN_ATTRS
1811 _mm_set_ps1(float __w)
1812 {
1813     return _mm_set1_ps(__w);
1814 }
1815
1816 /// \brief Constructs a 128-bit floating-point vector of [4 x float]
1817 ///    initialized with the specified single-precision floating-point values.
1818 ///
1819 /// \headerfile <x86intrin.h>
1820 ///
1821 /// This intrinsic is a utility function and does not correspond to a specific
1822 ///    instruction.
1823 ///
1824 /// \param __z
1825 ///    A single-precision floating-point value used to initialize bits [127:96]
1826 ///    of the result.
1827 /// \param __y
1828 ///    A single-precision floating-point value used to initialize bits [95:64]
1829 ///    of the result.
1830 /// \param __x
1831 ///    A single-precision floating-point value used to initialize bits [63:32]
1832 ///    of the result.
1833 /// \param __w
1834 ///    A single-precision floating-point value used to initialize bits [31:0]
1835 ///    of the result.
1836 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1837 static __inline__ __m128 __DEFAULT_FN_ATTRS
1838 _mm_set_ps(float __z, float __y, float __x, float __w)
1839 {
1840   return (__m128){ __w, __x, __y, __z };
1841 }
1842
1843 /// \brief Constructs a 128-bit floating-point vector of [4 x float],
1844 ///    initialized in reverse order with the specified 32-bit single-precision
1845 ///    float-point values.
1846 ///
1847 /// \headerfile <x86intrin.h>
1848 ///
1849 /// This intrinsic is a utility function and does not correspond to a specific
1850 ///    instruction.
1851 ///
1852 /// \param __z
1853 ///    A single-precision floating-point value used to initialize bits [31:0]
1854 ///    of the result.
1855 /// \param __y
1856 ///    A single-precision floating-point value used to initialize bits [63:32]
1857 ///    of the result.
1858 /// \param __x
1859 ///    A single-precision floating-point value used to initialize bits [95:64]
1860 ///    of the result.
1861 /// \param __w
1862 ///    A single-precision floating-point value used to initialize bits [127:96]
1863 ///    of the result.
1864 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1865 static __inline__ __m128 __DEFAULT_FN_ATTRS
1866 _mm_setr_ps(float __z, float __y, float __x, float __w)
1867 {
1868   return (__m128){ __z, __y, __x, __w };
1869 }
1870
1871 /// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized
1872 ///    to zero.
1873 ///
1874 /// \headerfile <x86intrin.h>
1875 ///
1876 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1877 ///
1878 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
1879 ///    all elements set to zero.
1880 static __inline__ __m128 __DEFAULT_FN_ATTRS
1881 _mm_setzero_ps(void)
1882 {
1883   return (__m128){ 0, 0, 0, 0 };
1884 }
1885
1886 /// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1887 ///    memory location.
1888 ///
1889 /// \headerfile <x86intrin.h>
1890 ///
1891 /// This intrinsic corresponds to the <c> VPEXTRQ / MOVQ </c> instruction.
1892 ///
1893 /// \param __p
1894 ///    A pointer to a 64-bit memory location.
1895 /// \param __a
1896 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1897 static __inline__ void __DEFAULT_FN_ATTRS
1898 _mm_storeh_pi(__m64 *__p, __m128 __a)
1899 {
1900   __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
1901 }
1902
1903 /// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1904 ///     memory location.
1905 ///
1906 /// \headerfile <x86intrin.h>
1907 ///
1908 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1909 ///
1910 /// \param __p
1911 ///    A pointer to a memory location that will receive the float values.
1912 /// \param __a
1913 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1914 static __inline__ void __DEFAULT_FN_ATTRS
1915 _mm_storel_pi(__m64 *__p, __m128 __a)
1916 {
1917   __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
1918 }
1919
1920 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1921 ///     memory location.
1922 ///
1923 /// \headerfile <x86intrin.h>
1924 ///
1925 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1926 ///
1927 /// \param __p
1928 ///    A pointer to a 32-bit memory location.
1929 /// \param __a
1930 ///    A 128-bit vector of [4 x float] containing the value to be stored.
1931 static __inline__ void __DEFAULT_FN_ATTRS
1932 _mm_store_ss(float *__p, __m128 __a)
1933 {
1934   struct __mm_store_ss_struct {
1935     float __u;
1936   } __attribute__((__packed__, __may_alias__));
1937   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1938 }
1939
1940 /// \brief Stores a 128-bit vector of [4 x float] to an unaligned memory
1941 ///    location.
1942 ///
1943 /// \headerfile <x86intrin.h>
1944 ///
1945 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1946 ///
1947 /// \param __p
1948 ///    A pointer to a 128-bit memory location. The address of the memory
1949 ///    location does not have to be aligned.
1950 /// \param __a
1951 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1952 static __inline__ void __DEFAULT_FN_ATTRS
1953 _mm_storeu_ps(float *__p, __m128 __a)
1954 {
1955   struct __storeu_ps {
1956     __m128 __v;
1957   } __attribute__((__packed__, __may_alias__));
1958   ((struct __storeu_ps*)__p)->__v = __a;
1959 }
1960
1961 /// \brief Stores a 128-bit vector of [4 x float] into an aligned memory
1962 ///    location.
1963 ///
1964 /// \headerfile <x86intrin.h>
1965 ///
1966 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1967 ///
1968 /// \param __p
1969 ///    A pointer to a 128-bit memory location. The address of the memory
1970 ///    location has to be 16-byte aligned.
1971 /// \param __a
1972 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1973 static __inline__ void __DEFAULT_FN_ATTRS
1974 _mm_store_ps(float *__p, __m128 __a)
1975 {
1976   *(__m128*)__p = __a;
1977 }
1978
1979 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
1980 ///    four contiguous elements in an aligned memory location.
1981 ///
1982 /// \headerfile <x86intrin.h>
1983 ///
1984 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
1985 ///    instruction.
1986 ///
1987 /// \param __p
1988 ///    A pointer to a 128-bit memory location.
1989 /// \param __a
1990 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
1991 ///    of the four contiguous elements pointed by \a __p.
1992 static __inline__ void __DEFAULT_FN_ATTRS
1993 _mm_store1_ps(float *__p, __m128 __a)
1994 {
1995   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
1996   _mm_store_ps(__p, __a);
1997 }
1998
1999 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2000 ///    four contiguous elements in an aligned memory location.
2001 ///
2002 /// \headerfile <x86intrin.h>
2003 ///
2004 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2005 ///    instruction.
2006 ///
2007 /// \param __p
2008 ///    A pointer to a 128-bit memory location.
2009 /// \param __a
2010 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2011 ///    of the four contiguous elements pointed by \a __p.
2012 static __inline__ void __DEFAULT_FN_ATTRS
2013 _mm_store_ps1(float *__p, __m128 __a)
2014 {
2015   return _mm_store1_ps(__p, __a);
2016 }
2017
2018 /// \brief Stores float values from a 128-bit vector of [4 x float] to an
2019 ///    aligned memory location in reverse order.
2020 ///
2021 /// \headerfile <x86intrin.h>
2022 ///
2023 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2024 ///    instruction.
2025 ///
2026 /// \param __p
2027 ///    A pointer to a 128-bit memory location. The address of the memory
2028 ///    location has to be 128-bit aligned.
2029 /// \param __a
2030 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2031 static __inline__ void __DEFAULT_FN_ATTRS
2032 _mm_storer_ps(float *__p, __m128 __a)
2033 {
2034   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2035   _mm_store_ps(__p, __a);
2036 }
2037
/* Hint selectors for _mm_prefetch(). As decoded by the _mm_prefetch macro in
   this header, bit 2 of a selector is forwarded as the read/write argument
   of __builtin_prefetch and bits [1:0] as its locality argument. */
#define _MM_HINT_ET0 7 /* rw = 1, locality = 3 */
#define _MM_HINT_ET1 6 /* rw = 1, locality = 2 */
#define _MM_HINT_T0  3 /* rw = 0, locality = 3 */
#define _MM_HINT_T1  2 /* rw = 0, locality = 2 */
#define _MM_HINT_T2  1 /* rw = 0, locality = 1 */
#define _MM_HINT_NTA 0 /* rw = 0, locality = 0 */
2044
#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// \brief Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data. The
///    pointer is only passed on as an address; const-qualified pointers are
///    accepted without discarding the qualifier.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated. \n
///    Bit 2 of \a sel is forwarded as the read/write argument and bits [1:0]
///    as the locality argument of __builtin_prefetch.
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
                                                 ((sel) >> 2) & 1, (sel) & 0x3))
#endif
2076
2077 /// \brief Stores a 64-bit integer in the specified aligned memory location. To
2078 ///    minimize caching, the data is flagged as non-temporal (unlikely to be
2079 ///    used again soon).
2080 ///
2081 /// \headerfile <x86intrin.h>
2082 ///
2083 /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2084 ///
2085 /// \param __p
2086 ///    A pointer to an aligned memory location used to store the register value.
2087 /// \param __a
2088 ///    A 64-bit integer containing the value to be stored.
2089 static __inline__ void __DEFAULT_FN_ATTRS
2090 _mm_stream_pi(__m64 *__p, __m64 __a)
2091 {
2092   __builtin_ia32_movntq(__p, __a);
2093 }
2094
2095 /// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
2096 ///    128-bit aligned memory location. To minimize caching, the data is flagged
2097 ///    as non-temporal (unlikely to be used again soon).
2098 ///
2099 /// \headerfile <x86intrin.h>
2100 ///
2101 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2102 ///
2103 /// \param __p
2104 ///    A pointer to a 128-bit aligned memory location that will receive the
2105 ///    single-precision floating-point values.
2106 /// \param __a
2107 ///    A 128-bit vector of [4 x float] containing the values to be moved.
2108 static __inline__ void __DEFAULT_FN_ATTRS
2109 _mm_stream_ps(float *__p, __m128 __a)
2110 {
2111   __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2112 }
2113
/* _mm_sfence is only declared here, not defined; the extern "C" wrapper gives
   the declaration C linkage when the header is compiled as C++. */
#if defined(__cplusplus)
extern "C" {
#endif

/// \brief Forces strong memory ordering (serialization) between store
///    instructions preceding this instruction and store instructions following
///    this instruction, ensuring the system completes all previous stores
///    before executing subsequent stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
2132
/// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
///    The result of the macro has type int.
/* Both arguments are parenthesized so that arbitrary expressions can be passed
   safely, and the vector is explicitly reinterpreted as [4 x i16], as is done
   in _mm_shuffle_pi16. */
#define _mm_extract_pi16(a, n) __extension__ ({ \
  (int)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(a), (int)(n)); })
2155
/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the inserted value: \n
///    0: The value is written to bits [15:0] of the destination. \n
///    1: The value is written to bits [31:16] of the destination. \n
///    2: The value is written to bits [47:32] of the destination. \n
///    3: The value is written to bits [63:48] of the destination. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* All arguments are parenthesized so that arbitrary expressions can be passed
   safely, and the vector is explicitly reinterpreted as [4 x i16], as is done
   in _mm_shuffle_pi16. */
#define _mm_insert_pi16(a, d, n) __extension__ ({ \
  (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(a), (int)(d), \
                                     (int)(n)); })
2186
2187 /// \brief Compares each of the corresponding packed 16-bit integer values of
2188 ///    the 64-bit integer vectors, and writes the greater value to the
2189 ///    corresponding bits in the destination.
2190 ///
2191 /// \headerfile <x86intrin.h>
2192 ///
2193 /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2194 ///
2195 /// \param __a
2196 ///    A 64-bit integer vector containing one of the source operands.
2197 /// \param __b
2198 ///    A 64-bit integer vector containing one of the source operands.
2199 /// \returns A 64-bit integer vector containing the comparison results.
2200 static __inline__ __m64 __DEFAULT_FN_ATTRS
2201 _mm_max_pi16(__m64 __a, __m64 __b)
2202 {
2203   return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2204 }
2205
2206 /// \brief Compares each of the corresponding packed 8-bit unsigned integer
2207 ///    values of the 64-bit integer vectors, and writes the greater value to the
2208 ///    corresponding bits in the destination.
2209 ///
2210 /// \headerfile <x86intrin.h>
2211 ///
2212 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2213 ///
2214 /// \param __a
2215 ///    A 64-bit integer vector containing one of the source operands.
2216 /// \param __b
2217 ///    A 64-bit integer vector containing one of the source operands.
2218 /// \returns A 64-bit integer vector containing the comparison results.
2219 static __inline__ __m64 __DEFAULT_FN_ATTRS
2220 _mm_max_pu8(__m64 __a, __m64 __b)
2221 {
2222   return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2223 }
2224
2225 /// \brief Compares each of the corresponding packed 16-bit integer values of
2226 ///    the 64-bit integer vectors, and writes the lesser value to the
2227 ///    corresponding bits in the destination.
2228 ///
2229 /// \headerfile <x86intrin.h>
2230 ///
2231 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2232 ///
2233 /// \param __a
2234 ///    A 64-bit integer vector containing one of the source operands.
2235 /// \param __b
2236 ///    A 64-bit integer vector containing one of the source operands.
2237 /// \returns A 64-bit integer vector containing the comparison results.
2238 static __inline__ __m64 __DEFAULT_FN_ATTRS
2239 _mm_min_pi16(__m64 __a, __m64 __b)
2240 {
2241   return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2242 }
2243
2244 /// \brief Compares each of the corresponding packed 8-bit unsigned integer
2245 ///    values of the 64-bit integer vectors, and writes the lesser value to the
2246 ///    corresponding bits in the destination.
2247 ///
2248 /// \headerfile <x86intrin.h>
2249 ///
2250 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2251 ///
2252 /// \param __a
2253 ///    A 64-bit integer vector containing one of the source operands.
2254 /// \param __b
2255 ///    A 64-bit integer vector containing one of the source operands.
2256 /// \returns A 64-bit integer vector containing the comparison results.
2257 static __inline__ __m64 __DEFAULT_FN_ATTRS
2258 _mm_min_pu8(__m64 __a, __m64 __b)
2259 {
2260   return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2261 }
2262
/// \brief Takes the most significant bit from each 8-bit element in a 64-bit
///    integer vector to create an 8-bit mask value. Zero-extends the value to
///    32-bit integer and writes it to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
///
/// \param __a
///    A 64-bit integer vector containing the values with bits to be extracted.
///    It is treated as eight 8-bit elements.
/// \returns The most significant bit from each 8-bit element in the operand,
///    written to bits [7:0].
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_pi8(__m64 __a)
{
  return __builtin_ia32_pmovmskb((__v8qi)__a);
}
2280
2281 /// \brief Multiplies packed 16-bit unsigned integer values and writes the
2282 ///    high-order 16 bits of each 32-bit product to the corresponding bits in
2283 ///    the destination.
2284 ///
2285 /// \headerfile <x86intrin.h>
2286 ///
2287 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2288 ///
2289 /// \param __a
2290 ///    A 64-bit integer vector containing one of the source operands.
2291 /// \param __b
2292 ///    A 64-bit integer vector containing one of the source operands.
2293 /// \returns A 64-bit integer vector containing the products of both operands.
2294 static __inline__ __m64 __DEFAULT_FN_ATTRS
2295 _mm_mulhi_pu16(__m64 __a, __m64 __b)
2296 {
2297   return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2298 }
2299
/// \brief Shuffles the four 16-bit integers from a 64-bit integer vector to
///    the destination, as specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 64-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [15:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [31:16] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [47:32] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [63:48] in the
///    destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
/* The operand is first converted to __m64 and then reinterpreted as
   [4 x i16] before being handed to the builtin. */
#define _mm_shuffle_pi16(a, n) __extension__ ({ \
  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
2333
2334 /// \brief Conditionally copies the values from each 8-bit element in the first
2335 ///    64-bit integer vector operand to the specified memory location, as
2336 ///    specified by the most significant bit in the corresponding element in the
2337 ///    second 64-bit integer vector operand.
2338 ///
2339 ///    To minimize caching, the data is flagged as non-temporal
2340 ///    (unlikely to be used again soon).
2341 ///
2342 /// \headerfile <x86intrin.h>
2343 ///
2344 /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2345 ///
2346 /// \param __d
2347 ///    A 64-bit integer vector containing the values with elements to be copied.
2348 /// \param __n
2349 ///    A 64-bit integer vector operand. The most significant bit from each 8-bit
2350 ///    element determines whether the corresponding element in operand \a __d
2351 ///    is copied. If the most significant bit of a given element is 1, the
2352 ///    corresponding element in operand \a __d is copied.
2353 /// \param __p
2354 ///    A pointer to a 64-bit memory location that will receive the conditionally
2355 ///    copied integer values. The address of the memory location does not have
2356 ///    to be aligned.
2357 static __inline__ void __DEFAULT_FN_ATTRS
2358 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2359 {
2360   __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2361 }
2362
2363 /// \brief Computes the rounded averages of the packed unsigned 8-bit integer
2364 ///    values and writes the averages to the corresponding bits in the
2365 ///    destination.
2366 ///
2367 /// \headerfile <x86intrin.h>
2368 ///
2369 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2370 ///
2371 /// \param __a
2372 ///    A 64-bit integer vector containing one of the source operands.
2373 /// \param __b
2374 ///    A 64-bit integer vector containing one of the source operands.
2375 /// \returns A 64-bit integer vector containing the averages of both operands.
2376 static __inline__ __m64 __DEFAULT_FN_ATTRS
2377 _mm_avg_pu8(__m64 __a, __m64 __b)
2378 {
2379   return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2380 }
2381
2382 /// \brief Computes the rounded averages of the packed unsigned 16-bit integer
2383 ///    values and writes the averages to the corresponding bits in the
2384 ///    destination.
2385 ///
2386 /// \headerfile <x86intrin.h>
2387 ///
2388 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2389 ///
2390 /// \param __a
2391 ///    A 64-bit integer vector containing one of the source operands.
2392 /// \param __b
2393 ///    A 64-bit integer vector containing one of the source operands.
2394 /// \returns A 64-bit integer vector containing the averages of both operands.
2395 static __inline__ __m64 __DEFAULT_FN_ATTRS
2396 _mm_avg_pu16(__m64 __a, __m64 __b)
2397 {
2398   return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2399 }
2400
2401 /// \brief Subtracts the corresponding 8-bit unsigned integer values of the two
2402 ///    64-bit vector operands and computes the absolute value for each of the
2403 ///    difference. Then sum of the 8 absolute differences is written to the
2404 ///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2405 ///
2406 /// \headerfile <x86intrin.h>
2407 ///
2408 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2409 ///
2410 /// \param __a
2411 ///    A 64-bit integer vector containing one of the source operands.
2412 /// \param __b
2413 ///    A 64-bit integer vector containing one of the source operands.
2414 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2415 ///    sets of absolute differences between both operands. The upper bits are
2416 ///    cleared.
2417 static __inline__ __m64 __DEFAULT_FN_ATTRS
2418 _mm_sad_pu8(__m64 __a, __m64 __b)
2419 {
2420   return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2421 }
2422
#if defined(__cplusplus)
extern "C" {
#endif

/// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
///    integer value.
///
///    There are several groups of macros associated with this
///    intrinsic, including:
///    <ul>
///    <li>
///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, the expression below checks if an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following example gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);

/// \brief Sets the MXCSR register with the 32-bit unsigned integer value.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up:
///    \code
///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
///    \endcode
///    Because this only ORs bits in, it selects round-up only while the
///    current rounding mode is _MM_ROUND_NEAREST or _MM_ROUND_UP. Use
///    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP) to replace the rounding mode
///    regardless of its current value.
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///      void setFlags() {
///        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///      }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);

#if defined(__cplusplus)
} // extern "C"
#endif
2530
/// \brief Picks 4 float values out of two 128-bit operands of [4 x float], as
///    directed by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value holding an 8-bit selector that names which elements
///    are copied from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    Each 2-bit field controls one 32-bit element of the 128-bit
///    destination: \n
///    Bits [1:0] select the value written to bits [31:0] of the
///    destination. \n
///    Bits [3:2] select the value written to bits [63:32] of the
///    destination. \n
///    Bits [5:4] select the value written to bits [95:64] of the
///    destination. \n
///    Bits [7:6] select the value written to bits [127:96] of the
///    destination. \n
///    Meaning of each 2-bit field: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
      (__v4sf)(__m128)(a), \
      (__v4sf)(__m128)(b), \
      ((mask) & 0x3), \
      (((mask) >> 2) & 0x3), \
      4 + (((mask) >> 4) & 0x3), \
      4 + (((mask) >> 6) & 0x3)); })
2573
2574 /// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2575 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2576 ///
2577 /// \headerfile <x86intrin.h>
2578 ///
2579 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2580 ///
2581 /// \param __a
2582 ///    A 128-bit vector of [4 x float]. \n
2583 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
2584 ///    Bits [127:96] are written to bits [95:64] of the destination.
2585 /// \param __b
2586 ///    A 128-bit vector of [4 x float].
2587 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
2588 ///    Bits [127:96] are written to bits [127:96] of the destination.
2589 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2590 static __inline__ __m128 __DEFAULT_FN_ATTRS
2591 _mm_unpackhi_ps(__m128 __a, __m128 __b)
2592 {
2593   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2594 }
2595
2596 /// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2597 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2598 ///
2599 /// \headerfile <x86intrin.h>
2600 ///
2601 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2602 ///
2603 /// \param __a
2604 ///    A 128-bit vector of [4 x float]. \n
2605 ///    Bits [31:0] are written to bits [31:0] of the destination.  \n
2606 ///    Bits [63:32] are written to bits [95:64] of the destination.
2607 /// \param __b
2608 ///    A 128-bit vector of [4 x float]. \n
2609 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
2610 ///    Bits [63:32] are written to bits [127:96] of the destination.
2611 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2612 static __inline__ __m128 __DEFAULT_FN_ATTRS
2613 _mm_unpacklo_ps(__m128 __a, __m128 __b)
2614 {
2615   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2616 }
2617
2618 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2619 ///    32 bits are set to the lower 32 bits of the second parameter. The upper
2620 ///    96 bits are set to the upper 96 bits of the first parameter.
2621 ///
2622 /// \headerfile <x86intrin.h>
2623 ///
2624 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
2625 ///
2626 /// \param __a
2627 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2628 ///    written to the upper 96 bits of the result.
2629 /// \param __b
2630 ///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2631 ///    written to the lower 32 bits of the result.
2632 /// \returns A 128-bit floating-point vector of [4 x float].
2633 static __inline__ __m128 __DEFAULT_FN_ATTRS
2634 _mm_move_ss(__m128 __a, __m128 __b)
2635 {
2636   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3);
2637 }
2638
2639 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2640 ///    64 bits are set to the upper 64 bits of the second parameter. The upper
2641 ///    64 bits are set to the upper 64 bits of the first parameter.
2642 ///
2643 /// \headerfile <x86intrin.h>
2644 ///
2645 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2646 ///
2647 /// \param __a
2648 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2649 ///    written to the upper 64 bits of the result.
2650 /// \param __b
2651 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2652 ///    written to the lower 64 bits of the result.
2653 /// \returns A 128-bit floating-point vector of [4 x float].
2654 static __inline__ __m128 __DEFAULT_FN_ATTRS
2655 _mm_movehl_ps(__m128 __a, __m128 __b)
2656 {
2657   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2658 }
2659
2660 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2661 ///    64 bits are set to the lower 64 bits of the first parameter. The upper
2662 ///    64 bits are set to the lower 64 bits of the second parameter.
2663 ///
2664 /// \headerfile <x86intrin.h>
2665 ///
2666 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2667 ///
2668 /// \param __a
2669 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2670 ///    written to the lower 64 bits of the result.
2671 /// \param __b
2672 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2673 ///    written to the upper 64 bits of the result.
2674 /// \returns A 128-bit floating-point vector of [4 x float].
2675 static __inline__ __m128 __DEFAULT_FN_ATTRS
2676 _mm_movelh_ps(__m128 __a, __m128 __b)
2677 {
2678   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2679 }
2680
2681 /// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2682 ///    float].
2683 ///
2684 /// \headerfile <x86intrin.h>
2685 ///
2686 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2687 ///
2688 /// \param __a
2689 ///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
2690 ///    from the corresponding elements in this operand.
2691 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2692 ///    values from the operand.
2693 static __inline__ __m128 __DEFAULT_FN_ATTRS
2694 _mm_cvtpi16_ps(__m64 __a)
2695 {
2696   __m64 __b, __c;
2697   __m128 __r;
2698
2699   __b = _mm_setzero_si64();
2700   __b = _mm_cmpgt_pi16(__b, __a);
2701   __c = _mm_unpackhi_pi16(__a, __b);
2702   __r = _mm_setzero_ps();
2703   __r = _mm_cvtpi32_ps(__r, __c);
2704   __r = _mm_movelh_ps(__r, __r);
2705   __c = _mm_unpacklo_pi16(__a, __b);
2706   __r = _mm_cvtpi32_ps(__r, __c);
2707
2708   return __r;
2709 }
2710
2711 /// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a
2712 ///    128-bit vector of [4 x float].
2713 ///
2714 /// \headerfile <x86intrin.h>
2715 ///
2716 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2717 ///
2718 /// \param __a
2719 ///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
2720 ///    destination are copied from the corresponding elements in this operand.
2721 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2722 ///    values from the operand.
2723 static __inline__ __m128 __DEFAULT_FN_ATTRS
2724 _mm_cvtpu16_ps(__m64 __a)
2725 {
2726   __m64 __b, __c;
2727   __m128 __r;
2728
2729   __b = _mm_setzero_si64();
2730   __c = _mm_unpackhi_pi16(__a, __b);
2731   __r = _mm_setzero_ps();
2732   __r = _mm_cvtpi32_ps(__r, __c);
2733   __r = _mm_movelh_ps(__r, __r);
2734   __c = _mm_unpacklo_pi16(__a, __b);
2735   __r = _mm_cvtpi32_ps(__r, __c);
2736
2737   return __r;
2738 }
2739
2740 /// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2741 ///    into a 128-bit vector of [4 x float].
2742 ///
2743 /// \headerfile <x86intrin.h>
2744 ///
2745 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2746 ///
2747 /// \param __a
2748 ///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
2749 ///    from the corresponding lower 4 elements in this operand.
2750 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2751 ///    values from the operand.
2752 static __inline__ __m128 __DEFAULT_FN_ATTRS
2753 _mm_cvtpi8_ps(__m64 __a)
2754 {
2755   __m64 __b;
2756
2757   __b = _mm_setzero_si64();
2758   __b = _mm_cmpgt_pi8(__b, __a);
2759   __b = _mm_unpacklo_pi8(__a, __b);
2760
2761   return _mm_cvtpi16_ps(__b);
2762 }
2763
2764 /// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
2765 ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
2766 ///
2767 /// \headerfile <x86intrin.h>
2768 ///
2769 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2770 ///
2771 /// \param __a
2772 ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
2773 ///    destination are copied from the corresponding lower 4 elements in this
2774 ///    operand.
2775 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2776 ///    values from the source operand.
2777 static __inline__ __m128 __DEFAULT_FN_ATTRS
2778 _mm_cvtpu8_ps(__m64 __a)
2779 {
2780   __m64 __b;
2781
2782   __b = _mm_setzero_si64();
2783   __b = _mm_unpacklo_pi8(__a, __b);
2784
2785   return _mm_cvtpi16_ps(__b);
2786 }
2787
2788 /// \brief Converts the two 32-bit signed integer values from each 64-bit vector
2789 ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
2790 ///
2791 /// \headerfile <x86intrin.h>
2792 ///
2793 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2794 ///
2795 /// \param __a
2796 ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
2797 ///    copied from the elements in this operand.
2798 /// \param __b
2799 ///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
2800 ///    copied from the elements in this operand.
2801 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2802 ///    copied and converted values from the first operand. The upper 64 bits
2803 ///    contain the copied and converted values from the second operand.
2804 static __inline__ __m128 __DEFAULT_FN_ATTRS
2805 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2806 {
2807   __m128 __c;
2808
2809   __c = _mm_setzero_ps();
2810   __c = _mm_cvtpi32_ps(__c, __b);
2811   __c = _mm_movelh_ps(__c, __c);
2812
2813   return _mm_cvtpi32_ps(__c, __a);
2814 }
2815
2816 /// \brief Converts each single-precision floating-point element of a 128-bit
2817 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
2818 ///    packs the results into a 64-bit integer vector of [4 x i16].
2819 ///
2820 ///    If the floating-point element is NaN or infinity, or if the
2821 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2822 ///    it is converted to 0x8000. Otherwise if the floating-point element is
2823 ///    greater than 0x7FFF, it is converted to 0x7FFF.
2824 ///
2825 /// \headerfile <x86intrin.h>
2826 ///
2827 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2828 ///
2829 /// \param __a
2830 ///    A 128-bit floating-point vector of [4 x float].
2831 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2832 ///    values.
2833 static __inline__ __m64 __DEFAULT_FN_ATTRS
2834 _mm_cvtps_pi16(__m128 __a)
2835 {
2836   __m64 __b, __c;
2837
2838   __b = _mm_cvtps_pi32(__a);
2839   __a = _mm_movehl_ps(__a, __a);
2840   __c = _mm_cvtps_pi32(__a);
2841
2842   return _mm_packs_pi32(__b, __c);
2843 }
2844
2845 /// \brief Converts each single-precision floating-point element of a 128-bit
2846 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
2847 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
2848 ///    [8 x i8]. The upper 32 bits of the vector are set to 0.
2849 ///
2850 ///    If the floating-point element is NaN or infinity, or if the
2851 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2852 ///    is converted to 0x80. Otherwise if the floating-point element is greater
2853 ///    than 0x7F, it is converted to 0x7F.
2854 ///
2855 /// \headerfile <x86intrin.h>
2856 ///
2857 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2858 ///
2859 /// \param __a
2860 ///    128-bit floating-point vector of [4 x float].
2861 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2862 ///    converted values and the uppper 32 bits are set to zero.
2863 static __inline__ __m64 __DEFAULT_FN_ATTRS
2864 _mm_cvtps_pi8(__m128 __a)
2865 {
2866   __m64 __b, __c;
2867
2868   __b = _mm_cvtps_pi16(__a);
2869   __c = _mm_setzero_si64();
2870
2871   return _mm_packs_pi16(__b, __c);
2872 }
2873
2874 /// \brief Extracts the sign bits from each single-precision floating-point
2875 ///    element of a 128-bit floating-point vector of [4 x float] and returns the
2876 ///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2877 ///    to zero.
2878 ///
2879 /// \headerfile <x86intrin.h>
2880 ///
2881 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
2882 ///
2883 /// \param __a
2884 ///    A 128-bit floating-point vector of [4 x float].
2885 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2886 ///    single-precision floating-point element of the parameter. Bits [31:4] are
2887 ///    set to zero.
2888 static __inline__ int __DEFAULT_FN_ATTRS
2889 _mm_movemask_ps(__m128 __a)
2890 {
2891   return __builtin_ia32_movmskps((__v4sf)__a);
2892 }
2893
2894
/* Attribute that requests 16-byte alignment for the declared object. */
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Packs four 2-bit selectors into one 8-bit immediate: z lands in bits [7:6],
   y in bits [5:4], x in bits [3:2] and w in bits [1:0]. */
#define _MM_SHUFFLE(z, y, x, w) \
  (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
2898
/* Exception state flags; _MM_EXCEPT_MASK (0x003f) covers all six. */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK \
  (_MM_EXCEPT_INVALID | _MM_EXCEPT_DENORM | _MM_EXCEPT_DIV_ZERO | \
   _MM_EXCEPT_OVERFLOW | _MM_EXCEPT_UNDERFLOW | _MM_EXCEPT_INEXACT)

/* Exception mask flags; _MM_MASK_MASK (0x1f80) covers all six. */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK \
  (_MM_MASK_INVALID | _MM_MASK_DENORM | _MM_MASK_DIV_ZERO | \
   _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_INEXACT)

/* Rounding modes; _MM_ROUND_MASK (0x6000) covers the whole field. */
#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (_MM_ROUND_DOWN | _MM_ROUND_UP)

/* Flush-to-zero mode. */
#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)

/* Getters: read the control/status word and keep only the named field. */
#define _MM_GET_EXCEPTION_MASK() \
  (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() \
  (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() \
  (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() \
  (_mm_getcsr() & _MM_ROUND_MASK)

/* Setters: clear the named field in the current value, OR in x, and write
   the result back. Bits of x outside the field are not filtered out. */
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
2934
/* Transposes, in place, the 4x4 matrix of floats held in four __m128 rows.
   row0..row3 must be modifiable lvalues of type __m128; each is read once
   and then overwritten with the corresponding row of the transpose.

   The temporaries use reserved (double-underscore) names so that they cannot
   capture a caller's variable: with names such as tmp0, an invocation like
   _MM_TRANSPOSE4_PS(tmp0, tmp1, tmp2, tmp3) would read the macro's own
   uninitialized locals instead of the caller's rows. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __r0 = (row0); \
  __m128 __r1 = (row1); \
  __m128 __r2 = (row2); \
  __m128 __r3 = (row3); \
  __m128 __t0 = _mm_unpacklo_ps(__r0, __r1); \
  __m128 __t1 = _mm_unpackhi_ps(__r0, __r1); \
  __m128 __t2 = _mm_unpacklo_ps(__r2, __r3); \
  __m128 __t3 = _mm_unpackhi_ps(__r2, __r3); \
  (row0) = _mm_movelh_ps(__t0, __t2); \
  (row1) = _mm_movehl_ps(__t2, __t0); \
  (row2) = _mm_movelh_ps(__t1, __t3); \
  (row3) = _mm_movehl_ps(__t3, __t1); \
} while (0)
2947
/* Aliases for compatibility: each _m_* name expands to the _mm_* intrinsic
   listed next to it. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* This alias used to be defined twice in a row; one definition is enough. */
#define _m_ _mm_
2964
2965 #undef __DEFAULT_FN_ATTRS
2966
2967 /* Ugly hack for backwards-compatibility (compatible with gcc) */
2968 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
2969 #include <emmintrin.h>
2970 #endif
2971
2972 #endif /* __XMMINTRIN_H */