contrib/llvm/tools/clang/lib/Headers/xmmintrin.h

   1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
   2  *
   3  * Permission is hereby granted, free of charge, to any person obtaining a copy
   4  * of this software and associated documentation files (the "Software"), to deal
   5  * in the Software without restriction, including without limitation the rights
   6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   7  * copies of the Software, and to permit persons to whom the Software is
   8  * furnished to do so, subject to the following conditions:
   9  *
  10  * The above copyright notice and this permission notice shall be included in
  11  * all copies or substantial portions of the Software.
  12  *
  13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19  * THE SOFTWARE.
  20  *
  21  *===-----------------------------------------------------------------------===
  22  */
  23
  24 #ifndef __XMMINTRIN_H
  25 #define __XMMINTRIN_H
  26
  27 #include <mmintrin.h>
  28
  29 typedef int __v4si __attribute__((__vector_size__(16)));
  30 typedef float __v4sf __attribute__((__vector_size__(16)));
  31 typedef float __m128 __attribute__((__vector_size__(16)));
  32
  33 /* Unsigned types */
  34 typedef unsigned int __v4su __attribute__((__vector_size__(16)));
  35
  36 /* This header should only be included in a hosted environment as it depends on
  37  * a standard library to provide allocation routines. */
  38 #if __STDC_HOSTED__
  39 #include <mm_malloc.h>
  40 #endif
  41
  42 /* Define the default attributes for the functions in this file. */
  43 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
  44
  45 /// \brief Adds the 32-bit float values in the low-order bits of the operands.
  46 ///
  47 /// \headerfile <x86intrin.h>
  48 ///
  49 /// This intrinsic corresponds to the \c VADDSS / ADDSS instructions.
  50 ///
  51 /// \param __a
  52 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  53 ///    The lower 32 bits of this operand are used in the calculation.
  54 /// \param __b
  55 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  56 ///    The lower 32 bits of this operand are used in the calculation.
  57 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
  58 ///    of the lower 32 bits of both operands. The upper 96 bits are copied from
  59 ///    the upper 96 bits of the first source operand.
  60 static __inline__ __m128 __DEFAULT_FN_ATTRS
  61 _mm_add_ss(__m128 __a, __m128 __b)
  62 {
  63   __a[0] += __b[0];
  64   return __a;
  65 }
  66
  67 /// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
  68 ///    the addition.
  69 ///
  70 /// \headerfile <x86intrin.h>
  71 ///
  72 /// This intrinsic corresponds to the \c VADDPS / ADDPS instructions.
  73 ///
  74 /// \param __a
  75 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  76 /// \param __b
  77 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  78 /// \returns A 128-bit vector of [4 x float] containing the sums of both
  79 ///    operands.
  80 static __inline__ __m128 __DEFAULT_FN_ATTRS
  81 _mm_add_ps(__m128 __a, __m128 __b)
  82 {
  83   return (__m128)((__v4sf)__a + (__v4sf)__b);
  84 }
  85
  86 /// \brief Subtracts the 32-bit float value in the low-order bits of the second
  87 ///    operand from the corresponding value in the first operand.
  88 ///
  89 /// \headerfile <x86intrin.h>
  90 ///
  91 /// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions.
  92 ///
  93 /// \param __a
  94 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
  95 ///    of this operand are used in the calculation.
  96 /// \param __b
  97 ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
  98 ///    bits of this operand are used in the calculation.
  99 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
 100 ///    difference of the lower 32 bits of both operands. The upper 96 bits are
 101 ///    copied from the upper 96 bits of the first source operand.
 102 static __inline__ __m128 __DEFAULT_FN_ATTRS
 103 _mm_sub_ss(__m128 __a, __m128 __b)
 104 {
 105   __a[0] -= __b[0];
 106   return __a;
 107 }
 108
 109 /// \brief Subtracts each of the values of the second operand from the first
 110 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
 111 ///    the results of the subtraction.
 112 ///
 113 /// \headerfile <x86intrin.h>
 114 ///
 115 /// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions.
 116 ///
 117 /// \param __a
 118 ///    A 128-bit vector of [4 x float] containing the minuend.
 119 /// \param __b
 120 ///    A 128-bit vector of [4 x float] containing the subtrahend.
 121 /// \returns A 128-bit vector of [4 x float] containing the differences between
 122 ///    both operands.
 123 static __inline__ __m128 __DEFAULT_FN_ATTRS
 124 _mm_sub_ps(__m128 __a, __m128 __b)
 125 {
 126   return (__m128)((__v4sf)__a - (__v4sf)__b);
 127 }
 128
 129 /// \brief Multiplies two 32-bit float values in the low-order bits of the
 130 ///    operands.
 131 ///
 132 /// \headerfile <x86intrin.h>
 133 ///
 134 /// This intrinsic corresponds to the \c VMULSS / MULSS instructions.
 135 ///
 136 /// \param __a
 137 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 138 ///    The lower 32 bits of this operand are used in the calculation.
 139 /// \param __b
 140 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 141 ///    The lower 32 bits of this operand are used in the calculation.
 142 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
 143 ///    32 bits of both operands. The upper 96 bits are copied from the upper 96
 144 ///    bits of the first source operand.
 145 static __inline__ __m128 __DEFAULT_FN_ATTRS
 146 _mm_mul_ss(__m128 __a, __m128 __b)
 147 {
 148   __a[0] *= __b[0];
 149   return __a;
 150 }
 151
 152 /// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
 153 ///    results of the multiplication.
 154 ///
 155 /// \headerfile <x86intrin.h>
 156 ///
 157 /// This intrinsic corresponds to the \c VMULPS / MULPS instructions.
 158 ///
 159 /// \param __a
 160 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 161 /// \param __b
 162 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 163 /// \returns A 128-bit vector of [4 x float] containing the products of both
 164 ///    operands.
 165 static __inline__ __m128 __DEFAULT_FN_ATTRS
 166 _mm_mul_ps(__m128 __a, __m128 __b)
 167 {
 168   return (__m128)((__v4sf)__a * (__v4sf)__b);
 169 }
 170
 171 /// \brief Divides the value in the low-order 32 bits of the first operand by
 172 ///    the corresponding value in the second operand.
 173 ///
 174 /// \headerfile <x86intrin.h>
 175 ///
 176 /// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions.
 177 ///
 178 /// \param __a
 179 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
 180 ///    bits of this operand are used in the calculation.
 181 /// \param __b
 182 ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
 183 ///    of this operand are used in the calculation.
 184 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
 185 ///    lower 32 bits of both operands. The upper 96 bits are copied from the
 186 ///    upper 96 bits of the first source operand.
 187 static __inline__ __m128 __DEFAULT_FN_ATTRS
 188 _mm_div_ss(__m128 __a, __m128 __b)
 189 {
 190   __a[0] /= __b[0];
 191   return __a;
 192 }
 193
 194 /// \brief Divides two 128-bit vectors of [4 x float].
 195 ///
 196 /// \headerfile <x86intrin.h>
 197 ///
 198 /// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions.
 199 ///
 200 /// \param __a
 201 ///    A 128-bit vector of [4 x float] containing the dividend.
 202 /// \param __b
 203 ///    A 128-bit vector of [4 x float] containing the divisor.
 204 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
 205 ///    operands.
 206 static __inline__ __m128 __DEFAULT_FN_ATTRS
 207 _mm_div_ps(__m128 __a, __m128 __b)
 208 {
 209   return (__m128)((__v4sf)__a / (__v4sf)__b);
 210 }
 211
 212 /// \brief Calculates the square root of the value stored in the low-order bits
 213 ///    of a 128-bit vector of [4 x float].
 214 ///
 215 /// \headerfile <x86intrin.h>
 216 ///
 217 /// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions.
 218 ///
 219 /// \param __a
 220 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 221 ///    used in the calculation.
 222 /// \returns A 128-bit vector of [4 x float] containing the square root of the
 223 ///    value in the low-order bits of the operand.
 224 static __inline__ __m128 __DEFAULT_FN_ATTRS
 225 _mm_sqrt_ss(__m128 __a)
 226 {
 227   __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a);
 228   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 229 }
 230
 231 /// \brief Calculates the square roots of the values stored in a 128-bit vector
 232 ///    of [4 x float].
 233 ///
 234 /// \headerfile <x86intrin.h>
 235 ///
 236 /// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions.
 237 ///
 238 /// \param __a
 239 ///    A 128-bit vector of [4 x float].
 240 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
 241 ///    values in the operand.
 242 static __inline__ __m128 __DEFAULT_FN_ATTRS
 243 _mm_sqrt_ps(__m128 __a)
 244 {
 245   return __builtin_ia32_sqrtps((__v4sf)__a);
 246 }
 247
 248 /// \brief Calculates the approximate reciprocal of the value stored in the
 249 ///    low-order bits of a 128-bit vector of [4 x float].
 250 ///
 251 /// \headerfile <x86intrin.h>
 252 ///
 253 /// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions.
 254 ///
 255 /// \param __a
 256 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 257 ///    used in the calculation.
 258 /// \returns A 128-bit vector of [4 x float] containing the approximate
 259 ///    reciprocal of the value in the low-order bits of the operand.
 260 static __inline__ __m128 __DEFAULT_FN_ATTRS
 261 _mm_rcp_ss(__m128 __a)
 262 {
 263   __m128 __c = __builtin_ia32_rcpss((__v4sf)__a);
 264   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 265 }
 266
 267 /// \brief Calculates the approximate reciprocals of the values stored in a
 268 ///    128-bit vector of [4 x float].
 269 ///
 270 /// \headerfile <x86intrin.h>
 271 ///
 272 /// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions.
 273 ///
 274 /// \param __a
 275 ///    A 128-bit vector of [4 x float].
 276 /// \returns A 128-bit vector of [4 x float] containing the approximate
 277 ///    reciprocals of the values in the operand.
 278 static __inline__ __m128 __DEFAULT_FN_ATTRS
 279 _mm_rcp_ps(__m128 __a)
 280 {
 281   return __builtin_ia32_rcpps((__v4sf)__a);
 282 }
 283
 284 /// \brief Calculates the approximate reciprocal of the square root of the value
 285 ///    stored in the low-order bits of a 128-bit vector of [4 x float].
 286 ///
 287 /// \headerfile <x86intrin.h>
 288 ///
 289 /// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions.
 290 ///
 291 /// \param __a
 292 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 293 ///    used in the calculation.
 294 /// \returns A 128-bit vector of [4 x float] containing the approximate
 295 ///    reciprocal of the square root of the value in the low-order bits of the
 296 ///    operand.
 297 static __inline__ __m128 __DEFAULT_FN_ATTRS
 298 _mm_rsqrt_ss(__m128 __a)
 299 {
 300   __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a);
 301   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 302 }
 303
 304 /// \brief Calculates the approximate reciprocals of the square roots of the
 305 ///    values stored in a 128-bit vector of [4 x float].
 306 ///
 307 /// \headerfile <x86intrin.h>
 308 ///
 309 /// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions.
 310 ///
 311 /// \param __a
 312 ///    A 128-bit vector of [4 x float].
 313 /// \returns A 128-bit vector of [4 x float] containing the approximate
 314 ///    reciprocals of the square roots of the values in the operand.
 315 static __inline__ __m128 __DEFAULT_FN_ATTRS
 316 _mm_rsqrt_ps(__m128 __a)
 317 {
 318   return __builtin_ia32_rsqrtps((__v4sf)__a);
 319 }
 320
 321 /// \brief Compares two 32-bit float values in the low-order bits of both
 322 ///    operands and returns the lesser value in the low-order bits of the
 323 ///    vector of [4 x float].
 324 ///
 325 /// \headerfile <x86intrin.h>
 326 ///
 327 /// This intrinsic corresponds to the \c VMINSS / MINSS instructions.
 328 ///
 329 /// \param __a
 330 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 331 ///    32 bits of this operand are used in the comparison.
 332 /// \param __b
 333 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 334 ///    32 bits of this operand are used in the comparison.
 335 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
 336 ///    minimum value between both operands. The upper 96 bits are copied from
 337 ///    the upper 96 bits of the first source operand.
 338 static __inline__ __m128 __DEFAULT_FN_ATTRS
 339 _mm_min_ss(__m128 __a, __m128 __b)
 340 {
 341   return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
 342 }
 343
 344 /// \brief Compares two 128-bit vectors of [4 x float] and returns the
 345 ///    lesser of each pair of values.
 346 ///
 347 /// \headerfile <x86intrin.h>
 348 ///
 349 /// This intrinsic corresponds to the \c VMINPS / MINPS instructions.
 350 ///
 351 /// \param __a
 352 ///    A 128-bit vector of [4 x float] containing one of the operands.
 353 /// \param __b
 354 ///    A 128-bit vector of [4 x float] containing one of the operands.
 355 /// \returns A 128-bit vector of [4 x float] containing the minimum values
 356 ///    between both operands.
 357 static __inline__ __m128 __DEFAULT_FN_ATTRS
 358 _mm_min_ps(__m128 __a, __m128 __b)
 359 {
 360   return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
 361 }
 362
 363 /// \brief Compares two 32-bit float values in the low-order bits of both
 364 ///    operands and returns the greater value in the low-order bits of
 365 ///    a vector [4 x float].
 366 ///
 367 /// \headerfile <x86intrin.h>
 368 ///
 369 /// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions.
 370 ///
 371 /// \param __a
 372 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 373 ///    32 bits of this operand are used in the comparison.
 374 /// \param __b
 375 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 376 ///    32 bits of this operand are used in the comparison.
 377 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
 378 ///    maximum value between both operands. The upper 96 bits are copied from
 379 ///    the upper 96 bits of the first source operand.
 380 static __inline__ __m128 __DEFAULT_FN_ATTRS
 381 _mm_max_ss(__m128 __a, __m128 __b)
 382 {
 383   return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
 384 }
 385
 386 /// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
 387 ///    of each pair of values.
 388 ///
 389 /// \headerfile <x86intrin.h>
 390 ///
 391 /// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions.
 392 ///
 393 /// \param __a
 394 ///    A 128-bit vector of [4 x float] containing one of the operands.
 395 /// \param __b
 396 ///    A 128-bit vector of [4 x float] containing one of the operands.
 397 /// \returns A 128-bit vector of [4 x float] containing the maximum values
 398 ///    between both operands.
 399 static __inline__ __m128 __DEFAULT_FN_ATTRS
 400 _mm_max_ps(__m128 __a, __m128 __b)
 401 {
 402   return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
 403 }
 404
 405 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
 406 ///
 407 /// \headerfile <x86intrin.h>
 408 ///
 409 /// This intrinsic corresponds to the \c VANDPS / ANDPS instructions.
 410 ///
 411 /// \param __a
 412 ///    A 128-bit vector containing one of the source operands.
 413 /// \param __b
 414 ///    A 128-bit vector containing one of the source operands.
 415 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
 416 ///    values between both operands.
 417 static __inline__ __m128 __DEFAULT_FN_ATTRS
 418 _mm_and_ps(__m128 __a, __m128 __b)
 419 {
 420   return (__m128)((__v4su)__a & (__v4su)__b);
 421 }
 422
 423 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
 424 ///    the one's complement of the values contained in the first source
 425 ///    operand.
 426 ///
 427 /// \headerfile <x86intrin.h>
 428 ///
 429 /// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions.
 430 ///
 431 /// \param __a
 432 ///    A 128-bit vector of [4 x float] containing the first source operand. The
 433 ///    one's complement of this value is used in the bitwise AND.
 434 /// \param __b
 435 ///    A 128-bit vector of [4 x float] containing the second source operand.
 436 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
 437 ///    one's complement of the first operand and the values in the second
 438 ///    operand.
 439 static __inline__ __m128 __DEFAULT_FN_ATTRS
 440 _mm_andnot_ps(__m128 __a, __m128 __b)
 441 {
 442   return (__m128)(~(__v4su)__a & (__v4su)__b);
 443 }
 444
 445 /// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
 446 ///
 447 /// \headerfile <x86intrin.h>
 448 ///
 449 /// This intrinsic corresponds to the \c VORPS / ORPS instructions.
 450 ///
 451 /// \param __a
 452 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 453 /// \param __b
 454 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 455 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
 456 ///    values between both operands.
 457 static __inline__ __m128 __DEFAULT_FN_ATTRS
 458 _mm_or_ps(__m128 __a, __m128 __b)
 459 {
 460   return (__m128)((__v4su)__a | (__v4su)__b);
 461 }
 462
 463 /// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
 464 ///    [4 x float].
 465 ///
 466 /// \headerfile <x86intrin.h>
 467 ///
 468 /// This intrinsic corresponds to the \c VXORPS / XORPS instructions.
 469 ///
 470 /// \param __a
 471 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 472 /// \param __b
 473 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 474 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
 475 ///    of the values between both operands.
 476 static __inline__ __m128 __DEFAULT_FN_ATTRS
 477 _mm_xor_ps(__m128 __a, __m128 __b)
 478 {
 479   return (__m128)((__v4su)__a ^ (__v4su)__b);
 480 }
 481
 482 /// \brief Compares two 32-bit float values in the low-order bits of both
 483 ///    operands for equality and returns the result of the comparison in the
 484 ///    low-order bits of a vector [4 x float].
 485 ///
 486 /// \headerfile <x86intrin.h>
 487 ///
 488 /// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions.
 489 ///
 490 /// \param __a
 491 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 492 ///    32 bits of this operand are used in the comparison.
 493 /// \param __b
 494 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 495 ///    32 bits of this operand are used in the comparison.
 496 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 497 ///    in the low-order bits.
 498 static __inline__ __m128 __DEFAULT_FN_ATTRS
 499 _mm_cmpeq_ss(__m128 __a, __m128 __b)
 500 {
 501   return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
 502 }
 503
 504 /// \brief Compares each of the corresponding 32-bit float values of the
 505 ///    128-bit vectors of [4 x float] for equality.
 506 ///
 507 /// \headerfile <x86intrin.h>
 508 ///
 509 /// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions.
 510 ///
 511 /// \param __a
 512 ///    A 128-bit vector of [4 x float].
 513 /// \param __b
 514 ///    A 128-bit vector of [4 x float].
 515 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 516 static __inline__ __m128 __DEFAULT_FN_ATTRS
 517 _mm_cmpeq_ps(__m128 __a, __m128 __b)
 518 {
 519   return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
 520 }
 521
 522 /// \brief Compares two 32-bit float values in the low-order bits of both
 523 ///    operands to determine if the value in the first operand is less than the
 524 ///    corresponding value in the second operand and returns the result of the
 525 ///    comparison in the low-order bits of a vector of [4 x float].
 526 ///
 527 /// \headerfile <x86intrin.h>
 528 ///
 529 /// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
 530 ///
 531 /// \param __a
 532 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 533 ///    32 bits of this operand are used in the comparison.
 534 /// \param __b
 535 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 536 ///    32 bits of this operand are used in the comparison.
 537 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 538 ///    in the low-order bits.
 539 static __inline__ __m128 __DEFAULT_FN_ATTRS
 540 _mm_cmplt_ss(__m128 __a, __m128 __b)
 541 {
 542   return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
 543 }
 544
 545 /// \brief Compares each of the corresponding 32-bit float values of the
 546 ///    128-bit vectors of [4 x float] to determine if the values in the first
 547 ///    operand are less than those in the second operand.
 548 ///
 549 /// \headerfile <x86intrin.h>
 550 ///
 551 /// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
 552 ///
 553 /// \param __a
 554 ///    A 128-bit vector of [4 x float].
 555 /// \param __b
 556 ///    A 128-bit vector of [4 x float].
 557 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 558 static __inline__ __m128 __DEFAULT_FN_ATTRS
 559 _mm_cmplt_ps(__m128 __a, __m128 __b)
 560 {
 561   return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
 562 }
 563
 564 /// \brief Compares two 32-bit float values in the low-order bits of both
 565 ///    operands to determine if the value in the first operand is less than or
 566 ///    equal to the corresponding value in the second operand and returns the
 567 ///    result of the comparison in the low-order bits of a vector of
 568 ///    [4 x float].
 569 ///
 570 /// \headerfile <x86intrin.h>
 571 ///
 572 /// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
 573 ///
 574 /// \param __a
 575 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 576 ///    32 bits of this operand are used in the comparison.
 577 /// \param __b
 578 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 579 ///    32 bits of this operand are used in the comparison.
 580 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 581 ///    in the low-order bits.
 582 static __inline__ __m128 __DEFAULT_FN_ATTRS
 583 _mm_cmple_ss(__m128 __a, __m128 __b)
 584 {
 585   return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
 586 }
 587
 588 /// \brief Compares each of the corresponding 32-bit float values of the
 589 ///    128-bit vectors of [4 x float] to determine if the values in the first
 590 ///    operand are less than or equal to those in the second operand.
 591 ///
 592 /// \headerfile <x86intrin.h>
 593 ///
 594 /// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
 595 ///
 596 /// \param __a
 597 ///    A 128-bit vector of [4 x float].
 598 /// \param __b
 599 ///    A 128-bit vector of [4 x float].
 600 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 601 static __inline__ __m128 __DEFAULT_FN_ATTRS
 602 _mm_cmple_ps(__m128 __a, __m128 __b)
 603 {
 604   return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
 605 }
 606
 607 /// \brief Compares two 32-bit float values in the low-order bits of both
 608 ///    operands to determine if the value in the first operand is greater than
 609 ///    the corresponding value in the second operand and returns the result of
 610 ///    the comparison in the low-order bits of a vector of [4 x float].
 611 ///
 612 /// \headerfile <x86intrin.h>
 613 ///
 614 /// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions.
 615 ///
 616 /// \param __a
 617 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 618 ///    32 bits of this operand are used in the comparison.
 619 /// \param __b
 620 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 621 ///    32 bits of this operand are used in the comparison.
 622 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 623 ///    in the low-order bits.
 624 static __inline__ __m128 __DEFAULT_FN_ATTRS
 625 _mm_cmpgt_ss(__m128 __a, __m128 __b)
 626 {
 627   return (__m128)__builtin_shufflevector((__v4sf)__a,
 628                                          (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
 629                                          4, 1, 2, 3);
 630 }
 631
 632 /// \brief Compares each of the corresponding 32-bit float values of the
 633 ///    128-bit vectors of [4 x float] to determine if the values in the first
 634 ///    operand are greater than those in the second operand.
 635 ///
 636 /// \headerfile <x86intrin.h>
 637 ///
 638 /// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions.
 639 ///
 640 /// \param __a
 641 ///    A 128-bit vector of [4 x float].
 642 /// \param __b
 643 ///    A 128-bit vector of [4 x float].
 644 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 645 static __inline__ __m128 __DEFAULT_FN_ATTRS
 646 _mm_cmpgt_ps(__m128 __a, __m128 __b)
 647 {
 648   return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
 649 }
 650
 651 /// \brief Compares two 32-bit float values in the low-order bits of both
 652 ///    operands to determine if the value in the first operand is greater than
 653 ///    or equal to the corresponding value in the second operand and returns
 654 ///    the result of the comparison in the low-order bits of a vector of
 655 ///    [4 x float].
 656 ///
 657 /// \headerfile <x86intrin.h>
 658 ///
 659 /// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions.
 660 ///
 661 /// \param __a
 662 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 663 ///    32 bits of this operand are used in the comparison.
 664 /// \param __b
 665 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 666 ///    32 bits of this operand are used in the comparison.
 667 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 668 ///    in the low-order bits.
 669 static __inline__ __m128 __DEFAULT_FN_ATTRS
 670 _mm_cmpge_ss(__m128 __a, __m128 __b)
 671 {
 672   return (__m128)__builtin_shufflevector((__v4sf)__a,
 673                                          (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
 674                                          4, 1, 2, 3);
 675 }
 676
 677 /// \brief Compares each of the corresponding 32-bit float values of the
 678 ///    128-bit vectors of [4 x float] to determine if the values in the first
 679 ///    operand are greater than or equal to those in the second operand.
 680 ///
 681 /// \headerfile <x86intrin.h>
 682 ///
 683 /// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions.
 684 ///
 685 /// \param __a
 686 ///    A 128-bit vector of [4 x float].
 687 /// \param __b
 688 ///    A 128-bit vector of [4 x float].
 689 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 690 static __inline__ __m128 __DEFAULT_FN_ATTRS
 691 _mm_cmpge_ps(__m128 __a, __m128 __b)
 692 {
 693   return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
 694 }
 695
 696 /// \brief Compares two 32-bit float values in the low-order bits of both
 697 ///    operands for inequality and returns the result of the comparison in the
 698 ///    low-order bits of a vector of [4 x float].
 699 ///
 700 /// \headerfile <x86intrin.h>
 701 ///
 702 /// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions.
 703 ///
 704 /// \param __a
 705 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 706 ///    32 bits of this operand are used in the comparison.
 707 /// \param __b
 708 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 709 ///    32 bits of this operand are used in the comparison.
 710 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 711 ///    in the low-order bits.
 712 static __inline__ __m128 __DEFAULT_FN_ATTRS
 713 _mm_cmpneq_ss(__m128 __a, __m128 __b)
 714 {
 715   return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
 716 }
 717
 718 /// \brief Compares each of the corresponding 32-bit float values of the
 719 ///    128-bit vectors of [4 x float] for inequality.
 720 ///
 721 /// \headerfile <x86intrin.h>
 722 ///
 723 /// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions.
 724 ///
 725 /// \param __a
 726 ///    A 128-bit vector of [4 x float].
 727 /// \param __b
 728 ///    A 128-bit vector of [4 x float].
 729 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 730 static __inline__ __m128 __DEFAULT_FN_ATTRS
 731 _mm_cmpneq_ps(__m128 __a, __m128 __b)
 732 {
 733   return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
 734 }
 735
 736 /// \brief Compares two 32-bit float values in the low-order bits of both
 737 ///    operands to determine if the value in the first operand is not less than
 738 ///    the corresponding value in the second operand and returns the result of
 739 ///    the comparison in the low-order bits of a vector of [4 x float].
 740 ///
 741 /// \headerfile <x86intrin.h>
 742 ///
 743 /// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
 744 ///
 745 /// \param __a
 746 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 747 ///    32 bits of this operand are used in the comparison.
 748 /// \param __b
 749 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 750 ///    32 bits of this operand are used in the comparison.
 751 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 752 ///    in the low-order bits.
 753 static __inline__ __m128 __DEFAULT_FN_ATTRS
 754 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
 755 {
 756   return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
 757 }
 758
 759 /// \brief Compares each of the corresponding 32-bit float values of the
 760 ///    128-bit vectors of [4 x float] to determine if the values in the first
 761 ///    operand are not less than those in the second operand.
 762 ///
 763 /// \headerfile <x86intrin.h>
 764 ///
 765 /// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
 766 ///
 767 /// \param __a
 768 ///    A 128-bit vector of [4 x float].
 769 /// \param __b
 770 ///    A 128-bit vector of [4 x float].
 771 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 772 static __inline__ __m128 __DEFAULT_FN_ATTRS
 773 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
 774 {
 775   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
 776 }
 777
 778 /// \brief Compares two 32-bit float values in the low-order bits of both
 779 ///    operands to determine if the value in the first operand is not less than
 780 ///    or equal to the corresponding value in the second operand and returns
 781 ///    the result of the comparison in the low-order bits of a vector of
 782 ///    [4 x float].
 783 ///
 784 /// \headerfile <x86intrin.h>
 785 ///
 786 /// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
 787 ///
 788 /// \param __a
 789 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 790 ///    32 bits of this operand are used in the comparison.
 791 /// \param __b
 792 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 793 ///    32 bits of this operand are used in the comparison.
 794 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 795 ///    in the low-order bits.
 796 static __inline__ __m128 __DEFAULT_FN_ATTRS
 797 _mm_cmpnle_ss(__m128 __a, __m128 __b)
 798 {
 799   return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
 800 }
 801
 802 /// \brief Compares each of the corresponding 32-bit float values of the
 803 ///    128-bit vectors of [4 x float] to determine if the values in the first
 804 ///    operand are not less than or equal to those in the second operand.
 805 ///
 806 /// \headerfile <x86intrin.h>
 807 ///
 808 /// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
 809 ///
 810 /// \param __a
 811 ///    A 128-bit vector of [4 x float].
 812 /// \param __b
 813 ///    A 128-bit vector of [4 x float].
 814 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 815 static __inline__ __m128 __DEFAULT_FN_ATTRS
 816 _mm_cmpnle_ps(__m128 __a, __m128 __b)
 817 {
 818   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
 819 }
 820
 821 /// \brief Compares two 32-bit float values in the low-order bits of both
 822 ///    operands to determine if the value in the first operand is not greater
 823 ///    than the corresponding value in the second operand and returns the
 824 ///    result of the comparison in the low-order bits of a vector of
 825 ///    [4 x float].
 826 ///
 827 /// \headerfile <x86intrin.h>
 828 ///
 829 /// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions.
 830 ///
 831 /// \param __a
 832 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 833 ///    32 bits of this operand are used in the comparison.
 834 /// \param __b
 835 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 836 ///    32 bits of this operand are used in the comparison.
 837 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 838 ///    in the low-order bits.
 839 static __inline__ __m128 __DEFAULT_FN_ATTRS
 840 _mm_cmpngt_ss(__m128 __a, __m128 __b)
 841 {
 842   return (__m128)__builtin_shufflevector((__v4sf)__a,
 843                                          (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
 844                                          4, 1, 2, 3);
 845 }
 846
 847 /// \brief Compares each of the corresponding 32-bit float values of the
 848 ///    128-bit vectors of [4 x float] to determine if the values in the first
 849 ///    operand are not greater than those in the second operand.
 850 ///
 851 /// \headerfile <x86intrin.h>
 852 ///
 853 /// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions.
 854 ///
 855 /// \param __a
 856 ///    A 128-bit vector of [4 x float].
 857 /// \param __b
 858 ///    A 128-bit vector of [4 x float].
 859 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 860 static __inline__ __m128 __DEFAULT_FN_ATTRS
 861 _mm_cmpngt_ps(__m128 __a, __m128 __b)
 862 {
 863   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
 864 }
 865
 866 /// \brief Compares two 32-bit float values in the low-order bits of both
 867 ///    operands to determine if the value in the first operand is not greater
 868 ///    than or equal to the corresponding value in the second operand and
 869 ///    returns the result of the comparison in the low-order bits of a vector
 870 ///    of [4 x float].
 871 ///
 872 /// \headerfile <x86intrin.h>
 873 ///
 874 /// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions.
 875 ///
 876 /// \param __a
 877 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 878 ///    32 bits of this operand are used in the comparison.
 879 /// \param __b
 880 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 881 ///    32 bits of this operand are used in the comparison.
 882 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 883 ///    in the low-order bits.
 884 static __inline__ __m128 __DEFAULT_FN_ATTRS
 885 _mm_cmpnge_ss(__m128 __a, __m128 __b)
 886 {
 887   return (__m128)__builtin_shufflevector((__v4sf)__a,
 888                                          (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
 889                                          4, 1, 2, 3);
 890 }
 891
 892 /// \brief Compares each of the corresponding 32-bit float values of the
 893 ///    128-bit vectors of [4 x float] to determine if the values in the first
 894 ///    operand are not greater than or equal to those in the second operand.
 895 ///
 896 /// \headerfile <x86intrin.h>
 897 ///
 898 /// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions.
 899 ///
 900 /// \param __a
 901 ///    A 128-bit vector of [4 x float].
 902 /// \param __b
 903 ///    A 128-bit vector of [4 x float].
 904 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 905 static __inline__ __m128 __DEFAULT_FN_ATTRS
 906 _mm_cmpnge_ps(__m128 __a, __m128 __b)
 907 {
 908   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
 909 }
 910
 911 /// \brief Compares two 32-bit float values in the low-order bits of both
 912 ///    operands to determine if the value in the first operand is ordered with
 913 ///    respect to the corresponding value in the second operand and returns the
 914 ///    result of the comparison in the low-order bits of a vector of
 915 ///    [4 x float].
 916 ///
 917 /// \headerfile <x86intrin.h>
 918 ///
 919 /// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions.
 920 ///
 921 /// \param __a
 922 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 923 ///    32 bits of this operand are used in the comparison.
 924 /// \param __b
 925 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 926 ///    32 bits of this operand are used in the comparison.
 927 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 928 ///    in the low-order bits.
 929 static __inline__ __m128 __DEFAULT_FN_ATTRS
 930 _mm_cmpord_ss(__m128 __a, __m128 __b)
 931 {
 932   return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
 933 }
 934
 935 /// \brief Compares each of the corresponding 32-bit float values of the
 936 ///    128-bit vectors of [4 x float] to determine if the values in the first
 937 ///    operand are ordered with respect to those in the second operand.
 938 ///
 939 /// \headerfile <x86intrin.h>
 940 ///
 941 /// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions.
 942 ///
 943 /// \param __a
 944 ///    A 128-bit vector of [4 x float].
 945 /// \param __b
 946 ///    A 128-bit vector of [4 x float].
 947 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 948 static __inline__ __m128 __DEFAULT_FN_ATTRS
 949 _mm_cmpord_ps(__m128 __a, __m128 __b)
 950 {
 951   return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
 952 }
 953
 954 /// \brief Compares two 32-bit float values in the low-order bits of both
 955 ///    operands to determine if the value in the first operand is unordered
 956 ///    with respect to the corresponding value in the second operand and
 957 ///    returns the result of the comparison in the low-order bits of a vector
 958 ///    of [4 x float].
 959 ///
 960 /// \headerfile <x86intrin.h>
 961 ///
 962 /// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions.
 963 ///
 964 /// \param __a
 965 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 966 ///    32 bits of this operand are used in the comparison.
 967 /// \param __b
 968 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 969 ///    32 bits of this operand are used in the comparison.
 970 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 971 ///    in the low-order bits.
 972 static __inline__ __m128 __DEFAULT_FN_ATTRS
 973 _mm_cmpunord_ss(__m128 __a, __m128 __b)
 974 {
 975   return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
 976 }
 977
 978 /// \brief Compares each of the corresponding 32-bit float values of the
 979 ///    128-bit vectors of [4 x float] to determine if the values in the first
 980 ///    operand are unordered with respect to those in the second operand.
 981 ///
 982 /// \headerfile <x86intrin.h>
 983 ///
 984 /// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions.
 985 ///
 986 /// \param __a
 987 ///    A 128-bit vector of [4 x float].
 988 /// \param __b
 989 ///    A 128-bit vector of [4 x float].
 990 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 991 static __inline__ __m128 __DEFAULT_FN_ATTRS
 992 _mm_cmpunord_ps(__m128 __a, __m128 __b)
 993 {
 994   return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
 995 }
 996
 997 /// \brief Compares two 32-bit float values in the low-order bits of both
 998 ///    operands for equality and returns the result of the comparison.
 999 ///
1000 /// \headerfile <x86intrin.h>
1001 ///
1002 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1003 ///
1004 /// \param __a
1005 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1006 ///    used in the comparison.
1007 /// \param __b
1008 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1009 ///    used in the comparison.
1010 /// \returns An integer containing the comparison results.
1011 static __inline__ int __DEFAULT_FN_ATTRS
1012 _mm_comieq_ss(__m128 __a, __m128 __b)
1013 {
1014   return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1015 }
1016
1017 /// \brief Compares two 32-bit float values in the low-order bits of both
1018 ///    operands to determine if the first operand is less than the second
1019 ///    operand and returns the result of the comparison.
1020 ///
1021 /// \headerfile <x86intrin.h>
1022 ///
1023 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1024 ///
1025 /// \param __a
1026 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1027 ///    used in the comparison.
1028 /// \param __b
1029 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1030 ///    used in the comparison.
1031 /// \returns An integer containing the comparison results.
1032 static __inline__ int __DEFAULT_FN_ATTRS
1033 _mm_comilt_ss(__m128 __a, __m128 __b)
1034 {
1035   return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1036 }
1037
1038 /// \brief Compares two 32-bit float values in the low-order bits of both
1039 ///    operands to determine if the first operand is less than or equal to the
1040 ///    second operand and returns the result of the comparison.
1041 ///
1042 /// \headerfile <x86intrin.h>
1043 ///
1044 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1045 ///
1046 /// \param __a
1047 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1048 ///    used in the comparison.
1049 /// \param __b
1050 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1051 ///    used in the comparison.
1052 /// \returns An integer containing the comparison results.
1053 static __inline__ int __DEFAULT_FN_ATTRS
1054 _mm_comile_ss(__m128 __a, __m128 __b)
1055 {
1056   return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1057 }
1058
1059 /// \brief Compares two 32-bit float values in the low-order bits of both
1060 ///    operands to determine if the first operand is greater than the second
1061 ///    operand and returns the result of the comparison.
1062 ///
1063 /// \headerfile <x86intrin.h>
1064 ///
1065 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1066 ///
1067 /// \param __a
1068 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1069 ///    used in the comparison.
1070 /// \param __b
1071 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1072 ///    used in the comparison.
1073 /// \returns An integer containing the comparison results.
1074 static __inline__ int __DEFAULT_FN_ATTRS
1075 _mm_comigt_ss(__m128 __a, __m128 __b)
1076 {
1077   return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1078 }
1079
1080 /// \brief Compares two 32-bit float values in the low-order bits of both
1081 ///    operands to determine if the first operand is greater than or equal to
1082 ///    the second operand and returns the result of the comparison.
1083 ///
1084 /// \headerfile <x86intrin.h>
1085 ///
1086 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1087 ///
1088 /// \param __a
1089 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1090 ///    used in the comparison.
1091 /// \param __b
1092 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1093 ///    used in the comparison.
1094 /// \returns An integer containing the comparison results.
1095 static __inline__ int __DEFAULT_FN_ATTRS
1096 _mm_comige_ss(__m128 __a, __m128 __b)
1097 {
1098   return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1099 }
1100
1101 /// \brief Compares two 32-bit float values in the low-order bits of both
1102 ///    operands to determine if the first operand is not equal to the second
1103 ///    operand and returns the result of the comparison.
1104 ///
1105 /// \headerfile <x86intrin.h>
1106 ///
1107 /// This intrinsic corresponds to the \c VCOMISS / COMISS instructions.
1108 ///
1109 /// \param __a
1110 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1111 ///    used in the comparison.
1112 /// \param __b
1113 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1114 ///    used in the comparison.
1115 /// \returns An integer containing the comparison results.
1116 static __inline__ int __DEFAULT_FN_ATTRS
1117 _mm_comineq_ss(__m128 __a, __m128 __b)
1118 {
1119   return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1120 }
1121
1122 /// \brief Performs an unordered comparison of two 32-bit float values using
1123 ///    the low-order bits of both operands to determine equality and returns
1124 ///    the result of the comparison.
1125 ///
1126 /// \headerfile <x86intrin.h>
1127 ///
1128 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1129 ///
1130 /// \param __a
1131 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1132 ///    used in the comparison.
1133 /// \param __b
1134 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1135 ///    used in the comparison.
1136 /// \returns An integer containing the comparison results.
1137 static __inline__ int __DEFAULT_FN_ATTRS
1138 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1139 {
1140   return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1141 }
1142
1143 /// \brief Performs an unordered comparison of two 32-bit float values using
1144 ///    the low-order bits of both operands to determine if the first operand is
1145 ///    less than the second operand and returns the result of the comparison.
1146 ///
1147 /// \headerfile <x86intrin.h>
1148 ///
1149 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1150 ///
1151 /// \param __a
1152 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1153 ///    used in the comparison.
1154 /// \param __b
1155 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1156 ///    used in the comparison.
1157 /// \returns An integer containing the comparison results.
1158 static __inline__ int __DEFAULT_FN_ATTRS
1159 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1160 {
1161   return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1162 }
1163
1164 /// \brief Performs an unordered comparison of two 32-bit float values using
1165 ///    the low-order bits of both operands to determine if the first operand
1166 ///    is less than or equal to the second operand and returns the result of
1167 ///    the comparison.
1168 ///
1169 /// \headerfile <x86intrin.h>
1170 ///
1171 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1172 ///
1173 /// \param __a
1174 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1175 ///    used in the comparison.
1176 /// \param __b
1177 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1178 ///    used in the comparison.
1179 /// \returns An integer containing the comparison results.
1180 static __inline__ int __DEFAULT_FN_ATTRS
1181 _mm_ucomile_ss(__m128 __a, __m128 __b)
1182 {
1183   return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1184 }
1185
1186 /// \brief Performs an unordered comparison of two 32-bit float values using
1187 ///    the low-order bits of both operands to determine if the first operand
1188 ///    is greater than the second operand and returns the result of the
1189 ///    comparison.
1190 ///
1191 /// \headerfile <x86intrin.h>
1192 ///
1193 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1194 ///
1195 /// \param __a
1196 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1197 ///    used in the comparison.
1198 /// \param __b
1199 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1200 ///    used in the comparison.
1201 /// \returns An integer containing the comparison results.
1202 static __inline__ int __DEFAULT_FN_ATTRS
1203 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1204 {
1205   return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1206 }
1207
1208 /// \brief Performs an unordered comparison of two 32-bit float values using
1209 ///    the low-order bits of both operands to determine if the first operand is
1210 ///    greater than or equal to the second operand and returns the result of
1211 ///    the comparison.
1212 ///
1213 /// \headerfile <x86intrin.h>
1214 ///
1215 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1216 ///
1217 /// \param __a
1218 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1219 ///    used in the comparison.
1220 /// \param __b
1221 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1222 ///    used in the comparison.
1223 /// \returns An integer containing the comparison results.
1224 static __inline__ int __DEFAULT_FN_ATTRS
1225 _mm_ucomige_ss(__m128 __a, __m128 __b)
1226 {
1227   return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1228 }
1229
1230 /// \brief Performs an unordered comparison of two 32-bit float values using
1231 ///    the low-order bits of both operands to determine inequality and returns
1232 ///    the result of the comparison.
1233 ///
1234 /// \headerfile <x86intrin.h>
1235 ///
1236 /// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions.
1237 ///
1238 /// \param __a
1239 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1240 ///    used in the comparison.
1241 /// \param __b
1242 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1243 ///    used in the comparison.
1244 /// \returns An integer containing the comparison results.
1245 static __inline__ int __DEFAULT_FN_ATTRS
1246 _mm_ucomineq_ss(__m128 __a, __m128 __b)
1247 {
1248   return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1249 }
1250
1251 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1252 ///    [4 x float] into a 32-bit integer.
1253 ///
1254 /// \headerfile <x86intrin.h>
1255 ///
1256 /// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
1257 ///
1258 /// \param __a
1259 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1260 ///    used in the conversion.
1261 /// \returns A 32-bit integer containing the converted value.
1262 static __inline__ int __DEFAULT_FN_ATTRS
1263 _mm_cvtss_si32(__m128 __a)
1264 {
1265   return __builtin_ia32_cvtss2si((__v4sf)__a);
1266 }
1267
1268 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1269 ///    [4 x float] into a 32-bit integer.
1270 ///
1271 /// \headerfile <x86intrin.h>
1272 ///
1273 /// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
1274 ///
1275 /// \param __a
1276 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1277 ///    used in the conversion.
1278 /// \returns A 32-bit integer containing the converted value.
1279 static __inline__ int __DEFAULT_FN_ATTRS
1280 _mm_cvt_ss2si(__m128 __a)
1281 {
1282   return _mm_cvtss_si32(__a);
1283 }
1284
1285 #ifdef __x86_64__
1286
1287 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1288 ///    [4 x float] into a 64-bit integer.
1289 ///
1290 /// \headerfile <x86intrin.h>
1291 ///
1292 /// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions.
1293 ///
1294 /// \param __a
1295 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1296 ///    used in the conversion.
1297 /// \returns A 64-bit integer containing the converted value.
1298 static __inline__ long long __DEFAULT_FN_ATTRS
1299 _mm_cvtss_si64(__m128 __a)
1300 {
1301   return __builtin_ia32_cvtss2si64((__v4sf)__a);
1302 }
1303
1304 #endif
1305
1306 /// \brief Converts two low-order float values in a 128-bit vector of
1307 ///    [4 x float] into a 64-bit vector of [2 x i32].
1308 ///
1309 /// \headerfile <x86intrin.h>
1310 ///
1311 /// This intrinsic corresponds to the \c CVTPS2PI instruction.
1312 ///
1313 /// \param __a
1314 ///    A 128-bit vector of [4 x float].
1315 /// \returns A 64-bit integer vector containing the converted values.
1316 static __inline__ __m64 __DEFAULT_FN_ATTRS
1317 _mm_cvtps_pi32(__m128 __a)
1318 {
1319   return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1320 }
1321
1322 /// \brief Converts two low-order float values in a 128-bit vector of
1323 ///    [4 x float] into a 64-bit vector of [2 x i32].
1324 ///
1325 /// \headerfile <x86intrin.h>
1326 ///
1327 /// This intrinsic corresponds to the \c CVTPS2PI instruction.
1328 ///
1329 /// \param __a
1330 ///    A 128-bit vector of [4 x float].
1331 /// \returns A 64-bit integer vector containing the converted values.
1332 static __inline__ __m64 __DEFAULT_FN_ATTRS
1333 _mm_cvt_ps2pi(__m128 __a)
1334 {
1335   return _mm_cvtps_pi32(__a);
1336 }
1337
1338 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1339 ///    [4 x float] into a 32-bit integer, truncating the result when it is
1340 ///    inexact.
1341 ///
1342 /// \headerfile <x86intrin.h>
1343 ///
1344 /// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
1345 ///
1346 /// \param __a
1347 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1348 ///    used in the conversion.
1349 /// \returns A 32-bit integer containing the converted value.
1350 static __inline__ int __DEFAULT_FN_ATTRS
1351 _mm_cvttss_si32(__m128 __a)
1352 {
1353   return __builtin_ia32_cvttss2si((__v4sf)__a);
1354 }
1355
1356 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1357 ///    [4 x float] into a 32-bit integer, truncating the result when it is
1358 ///    inexact.
1359 ///
1360 /// \headerfile <x86intrin.h>
1361 ///
1362 /// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
1363 ///
1364 /// \param __a
1365 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1366 ///    used in the conversion.
1367 /// \returns A 32-bit integer containing the converted value.
1368 static __inline__ int __DEFAULT_FN_ATTRS
1369 _mm_cvtt_ss2si(__m128 __a)
1370 {
1371   return _mm_cvttss_si32(__a);
1372 }
1373
1374 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1375 ///    [4 x float] into a 64-bit integer, truncating the result when it is
1376 ///    inexact.
1377 ///
1378 /// \headerfile <x86intrin.h>
1379 ///
1380 /// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions.
1381 ///
1382 /// \param __a
1383 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1384 ///    used in the conversion.
1385 /// \returns A 64-bit integer containing the converted value.
1386 static __inline__ long long __DEFAULT_FN_ATTRS
1387 _mm_cvttss_si64(__m128 __a)
1388 {
1389   return __builtin_ia32_cvttss2si64((__v4sf)__a);
1390 }
1391
1392 /// \brief Converts two low-order float values in a 128-bit vector of
1393 ///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1394 ///    when it is inexact.
1395 ///
1396 /// \headerfile <x86intrin.h>
1397 ///
/// This intrinsic corresponds to the \c CVTTPS2PI instruction.
1399 ///
1400 /// \param __a
1401 ///    A 128-bit vector of [4 x float].
1402 /// \returns A 64-bit integer vector containing the converted values.
1403 static __inline__ __m64 __DEFAULT_FN_ATTRS
1404 _mm_cvttps_pi32(__m128 __a)
1405 {
1406   return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1407 }
1408
1409 /// \brief Converts two low-order float values in a 128-bit vector of [4 x
1410 ///    float] into a 64-bit vector of [2 x i32], truncating the result when it
1411 ///    is inexact.
1412 ///
1413 /// \headerfile <x86intrin.h>
1414 ///
1415 /// This intrinsic corresponds to the \c CVTTPS2PI instruction.
1416 ///
1417 /// \param __a
1418 ///    A 128-bit vector of [4 x float].
1419 /// \returns A 64-bit integer vector containing the converted values.
1420 static __inline__ __m64 __DEFAULT_FN_ATTRS
1421 _mm_cvtt_ps2pi(__m128 __a)
1422 {
1423   return _mm_cvttps_pi32(__a);
1424 }
1425
1426 /// \brief Converts a 32-bit signed integer value into a floating point value
1427 ///    and writes it to the lower 32 bits of the destination. The remaining
1428 ///    higher order elements of the destination vector are copied from the
1429 ///    corresponding elements in the first operand.
1430 ///
1431 /// \headerfile <x86intrin.h>
1432 ///
/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instructions.
1434 ///
1435 /// \param __a
1436 ///    A 128-bit vector of [4 x float].
1437 /// \param __b
1438 ///    A 32-bit signed integer operand containing the value to be converted.
1439 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1440 ///    converted value of the second operand. The upper 96 bits are copied from
1441 ///    the upper 96 bits of the first operand.
1442 static __inline__ __m128 __DEFAULT_FN_ATTRS
1443 _mm_cvtsi32_ss(__m128 __a, int __b)
1444 {
1445   __a[0] = __b;
1446   return __a;
1447 }
1448
1449 /// \brief Converts a 32-bit signed integer value into a floating point value
1450 ///    and writes it to the lower 32 bits of the destination. The remaining
1451 ///    higher order elements of the destination are copied from the
1452 ///    corresponding elements in the first operand.
1453 ///
1454 /// \headerfile <x86intrin.h>
1455 ///
/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instructions.
1457 ///
1458 /// \param __a
1459 ///    A 128-bit vector of [4 x float].
1460 /// \param __b
1461 ///    A 32-bit signed integer operand containing the value to be converted.
1462 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1463 ///    converted value of the second operand. The upper 96 bits are copied from
1464 ///    the upper 96 bits of the first operand.
1465 static __inline__ __m128 __DEFAULT_FN_ATTRS
1466 _mm_cvt_si2ss(__m128 __a, int __b)
1467 {
1468   return _mm_cvtsi32_ss(__a, __b);
1469 }
1470
1471 #ifdef __x86_64__
1472
1473 /// \brief Converts a 64-bit signed integer value into a floating point value
1474 ///    and writes it to the lower 32 bits of the destination. The remaining
1475 ///    higher order elements of the destination are copied from the
1476 ///    corresponding elements in the first operand.
1477 ///
1478 /// \headerfile <x86intrin.h>
1479 ///
/// This intrinsic corresponds to the \c VCVTSI2SS / CVTSI2SS instructions.
1481 ///
1482 /// \param __a
1483 ///    A 128-bit vector of [4 x float].
1484 /// \param __b
1485 ///    A 64-bit signed integer operand containing the value to be converted.
1486 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1487 ///    converted value of the second operand. The upper 96 bits are copied from
1488 ///    the upper 96 bits of the first operand.
1489 static __inline__ __m128 __DEFAULT_FN_ATTRS
1490 _mm_cvtsi64_ss(__m128 __a, long long __b)
1491 {
1492   __a[0] = __b;
1493   return __a;
1494 }
1495
1496 #endif
1497
1498 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
1499 ///    floating point values and writes them to the lower 64-bits of the
1500 ///    destination. The remaining higher order elements of the destination are
1501 ///    copied from the corresponding elements in the first operand.
1502 ///
1503 /// \headerfile <x86intrin.h>
1504 ///
1505 /// This intrinsic corresponds to the \c CVTPI2PS instruction.
1506 ///
1507 /// \param __a
1508 ///    A 128-bit vector of [4 x float].
1509 /// \param __b
1510 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1511 ///    and written to the corresponding low-order elements in the destination.
1512 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1513 ///    converted value of the second operand. The upper 64 bits are copied from
1514 ///    the upper 64 bits of the first operand.
1515 static __inline__ __m128 __DEFAULT_FN_ATTRS
1516 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
1517 {
1518   return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1519 }
1520
1521 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
1522 ///    floating point values and writes them to the lower 64-bits of the
1523 ///    destination. The remaining higher order elements of the destination are
1524 ///    copied from the corresponding elements in the first operand.
1525 ///
1526 /// \headerfile <x86intrin.h>
1527 ///
1528 /// This intrinsic corresponds to the \c CVTPI2PS instruction.
1529 ///
1530 /// \param __a
1531 ///    A 128-bit vector of [4 x float].
1532 /// \param __b
1533 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1534 ///    and written to the corresponding low-order elements in the destination.
1535 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1536 ///    converted value from the second operand. The upper 64 bits are copied
1537 ///    from the upper 64 bits of the first operand.
1538 static __inline__ __m128 __DEFAULT_FN_ATTRS
1539 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
1540 {
1541   return _mm_cvtpi32_ps(__a, __b);
1542 }
1543
1544 /// \brief Extracts a float value contained in the lower 32 bits of a vector of
1545 ///    [4 x float].
1546 ///
1547 /// \headerfile <x86intrin.h>
1548 ///
1549 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1550 ///
1551 /// \param __a
1552 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1553 ///    used in the extraction.
1554 /// \returns A 32-bit float containing the extracted value.
1555 static __inline__ float __DEFAULT_FN_ATTRS
1556 _mm_cvtss_f32(__m128 __a)
1557 {
1558   return __a[0];
1559 }
1560
1561 /// \brief Loads two packed float values from the address __p into the
1562 ///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1563 ///     are copied from the low-order bits of the first operand.
1564 ///
1565 /// \headerfile <x86intrin.h>
1566 ///
1567 /// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction.
1568 ///
1569 /// \param __a
1570 ///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1571 ///    of the destination.
1572 /// \param __p
1573 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1574 ///    [127:64] of the destination.
1575 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1576 static __inline__ __m128 __DEFAULT_FN_ATTRS
1577 _mm_loadh_pi(__m128 __a, const __m64 *__p)
1578 {
1579   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1580   struct __mm_loadh_pi_struct {
1581     __mm_loadh_pi_v2f32 __u;
1582   } __attribute__((__packed__, __may_alias__));
1583   __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
1584   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1585   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1586 }
1587
1588 /// \brief Loads two packed float values from the address __p into the low-order
1589 ///    bits of a 128-bit vector of [4 x float]. The high-order bits are copied
1590 ///    from the high-order bits of the first operand.
1591 ///
1592 /// \headerfile <x86intrin.h>
1593 ///
1594 /// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction.
1595 ///
1596 /// \param __a
1597 ///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1598 ///    [127:64] of the destination.
1599 /// \param __p
1600 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1601 ///    [63:0] of the destination.
1602 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1603 static __inline__ __m128 __DEFAULT_FN_ATTRS
1604 _mm_loadl_pi(__m128 __a, const __m64 *__p)
1605 {
1606   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1607   struct __mm_loadl_pi_struct {
1608     __mm_loadl_pi_v2f32 __u;
1609   } __attribute__((__packed__, __may_alias__));
1610   __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
1611   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1612   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1613 }
1614
1615 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
1616 ///    32 bits of the vector are initialized with the single-precision
1617 ///    floating-point value loaded from a specified memory location. The upper
1618 ///    96 bits are set to zero.
1619 ///
1620 /// \headerfile <x86intrin.h>
1621 ///
1622 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1623 ///
1624 /// \param __p
1625 ///    A pointer to a 32-bit memory location containing a single-precision
1626 ///    floating-point value.
1627 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1628 ///    lower 32 bits contain the value loaded from the memory location. The
1629 ///    upper 96 bits are set to zero.
1630 static __inline__ __m128 __DEFAULT_FN_ATTRS
1631 _mm_load_ss(const float *__p)
1632 {
1633   struct __mm_load_ss_struct {
1634     float __u;
1635   } __attribute__((__packed__, __may_alias__));
1636   float __u = ((struct __mm_load_ss_struct*)__p)->__u;
1637   return (__m128){ __u, 0, 0, 0 };
1638 }
1639
1640 /// \brief Loads a 32-bit float value and duplicates it to all four vector
1641 ///    elements of a 128-bit vector of [4 x float].
1642 ///
1643 /// \headerfile <x86intrin.h>
1644 ///
1645 /// This intrinsic corresponds to the \c VMOVSS / MOVSS + \c shuffling
1646 ///    instruction.
1647 ///
1648 /// \param __p
1649 ///    A pointer to a float value to be loaded and duplicated.
1650 /// \returns A 128-bit vector of [4 x float] containing the loaded
1651 ///    and duplicated values.
1652 static __inline__ __m128 __DEFAULT_FN_ATTRS
1653 _mm_load1_ps(const float *__p)
1654 {
1655   struct __mm_load1_ps_struct {
1656     float __u;
1657   } __attribute__((__packed__, __may_alias__));
1658   float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
1659   return (__m128){ __u, __u, __u, __u };
1660 }
1661
/* Alternate spelling of _mm_load1_ps(); expands to the same call. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
1663
1664 /// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
1665 ///    memory location.
1666 ///
1667 /// \headerfile <x86intrin.h>
1668 ///
1669 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
1670 ///
1671 /// \param __p
1672 ///    A pointer to a 128-bit memory location. The address of the memory
1673 ///    location has to be 128-bit aligned.
1674 /// \returns A 128-bit vector of [4 x float] containing the loaded valus.
1675 static __inline__ __m128 __DEFAULT_FN_ATTRS
1676 _mm_load_ps(const float *__p)
1677 {
1678   return *(__m128*)__p;
1679 }
1680
1681 /// \brief Loads a 128-bit floating-point vector of [4 x float] from an
1682 ///    unaligned memory location.
1683 ///
1684 /// \headerfile <x86intrin.h>
1685 ///
1686 /// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
1687 ///
1688 /// \param __p
1689 ///    A pointer to a 128-bit memory location. The address of the memory
1690 ///    location does not have to be aligned.
1691 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1692 static __inline__ __m128 __DEFAULT_FN_ATTRS
1693 _mm_loadu_ps(const float *__p)
1694 {
1695   struct __loadu_ps {
1696     __m128 __v;
1697   } __attribute__((__packed__, __may_alias__));
1698   return ((struct __loadu_ps*)__p)->__v;
1699 }
1700
1701 /// \brief Loads four packed float values, in reverse order, from an aligned
1702 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1703 ///
1704 /// \headerfile <x86intrin.h>
1705 ///
1706 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
1707 ///    instruction.
1708 ///
1709 /// \param __p
1710 ///    A pointer to a 128-bit memory location. The address of the memory
1711 ///    location has to be 128-bit aligned.
1712 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1713 ///    in reverse order.
1714 static __inline__ __m128 __DEFAULT_FN_ATTRS
1715 _mm_loadr_ps(const float *__p)
1716 {
1717   __m128 __a = _mm_load_ps(__p);
1718   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1719 }
1720
1721 /// \brief Create a 128-bit vector of [4 x float] with undefined values.
1722 ///
1723 /// \headerfile <x86intrin.h>
1724 ///
1725 /// This intrinsic has no corresponding instruction.
1726 ///
1727 /// \returns A 128-bit vector of [4 x float] containing undefined values.
1728
1729 static __inline__ __m128 __DEFAULT_FN_ATTRS
1730 _mm_undefined_ps(void)
1731 {
1732   return (__m128)__builtin_ia32_undef128();
1733 }
1734
1735 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
1736 ///    32 bits of the vector are initialized with the specified single-precision
1737 ///    floating-point value. The upper 96 bits are set to zero.
1738 ///
1739 /// \headerfile <x86intrin.h>
1740 ///
1741 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1742 ///
1743 /// \param __w
1744 ///    A single-precision floating-point value used to initialize the lower 32
1745 ///    bits of the result.
1746 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1747 ///    lower 32 bits contain the value provided in the source operand. The
1748 ///    upper 96 bits are set to zero.
1749 static __inline__ __m128 __DEFAULT_FN_ATTRS
1750 _mm_set_ss(float __w)
1751 {
1752   return (__m128){ __w, 0, 0, 0 };
1753 }
1754
1755 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
1756 ///    of the four single-precision floating-point vector elements set to the
1757 ///    specified single-precision floating-point value.
1758 ///
1759 /// \headerfile <x86intrin.h>
1760 ///
1761 /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
1762 ///
1763 /// \param __w
1764 ///    A single-precision floating-point value used to initialize each vector
1765 ///    element of the result.
1766 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1767 static __inline__ __m128 __DEFAULT_FN_ATTRS
1768 _mm_set1_ps(float __w)
1769 {
1770   return (__m128){ __w, __w, __w, __w };
1771 }
1772
1773 /* Microsoft specific. */
1774 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
1775 ///    of the four single-precision floating-point vector elements set to the
1776 ///    specified single-precision floating-point value.
1777 ///
1778 /// \headerfile <x86intrin.h>
1779 ///
1780 /// This intrinsic corresponds to the \c VPERMILPS / PERMILPS instruction.
1781 ///
1782 /// \param __w
1783 ///    A single-precision floating-point value used to initialize each vector
1784 ///    element of the result.
1785 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1786 static __inline__ __m128 __DEFAULT_FN_ATTRS
1787 _mm_set_ps1(float __w)
1788 {
1789     return _mm_set1_ps(__w);
1790 }
1791
1792 /// \brief Constructs a 128-bit floating-point vector of [4 x float]
1793 ///    initialized with the specified single-precision floating-point values.
1794 ///
1795 /// \headerfile <x86intrin.h>
1796 ///
1797 /// This intrinsic is a utility function and does not correspond to a specific
1798 ///    instruction.
1799 ///
1800 /// \param __z
1801 ///    A single-precision floating-point value used to initialize bits [127:96]
1802 ///    of the result.
1803 /// \param __y
1804 ///    A single-precision floating-point value used to initialize bits [95:64]
1805 ///    of the result.
1806 /// \param __x
1807 ///    A single-precision floating-point value used to initialize bits [63:32]
1808 ///    of the result.
1809 /// \param __w
1810 ///    A single-precision floating-point value used to initialize bits [31:0]
1811 ///    of the result.
1812 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1813 static __inline__ __m128 __DEFAULT_FN_ATTRS
1814 _mm_set_ps(float __z, float __y, float __x, float __w)
1815 {
1816   return (__m128){ __w, __x, __y, __z };
1817 }
1818
1819 /// \brief Constructs a 128-bit floating-point vector of [4 x float],
1820 ///    initialized in reverse order with the specified 32-bit single-precision
1821 ///    float-point values.
1822 ///
1823 /// \headerfile <x86intrin.h>
1824 ///
1825 /// This intrinsic is a utility function and does not correspond to a specific
1826 ///    instruction.
1827 ///
1828 /// \param __z
1829 ///    A single-precision floating-point value used to initialize bits [31:0]
1830 ///    of the result.
1831 /// \param __y
1832 ///    A single-precision floating-point value used to initialize bits [63:32]
1833 ///    of the result.
1834 /// \param __x
1835 ///    A single-precision floating-point value used to initialize bits [95:64]
1836 ///    of the result.
1837 /// \param __w
1838 ///    A single-precision floating-point value used to initialize bits [127:96]
1839 ///    of the result.
1840 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1841 static __inline__ __m128 __DEFAULT_FN_ATTRS
1842 _mm_setr_ps(float __z, float __y, float __x, float __w)
1843 {
1844   return (__m128){ __z, __y, __x, __w };
1845 }
1846
1847 /// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized
1848 ///    to zero.
1849 ///
1850 /// \headerfile <x86intrin.h>
1851 ///
1852 /// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
1853 ///
1854 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
1855 ///    all elements set to zero.
1856 static __inline__ __m128 __DEFAULT_FN_ATTRS
1857 _mm_setzero_ps(void)
1858 {
1859   return (__m128){ 0, 0, 0, 0 };
1860 }
1861
1862 /// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1863 ///    memory location.
1864 ///
1865 /// \headerfile <x86intrin.h>
1866 ///
1867 /// This intrinsic corresponds to the \c VPEXTRQ / MOVQ instruction.
1868 ///
1869 /// \param __p
1870 ///    A pointer to a 64-bit memory location.
1871 /// \param __a
1872 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1873 static __inline__ void __DEFAULT_FN_ATTRS
1874 _mm_storeh_pi(__m64 *__p, __m128 __a)
1875 {
1876   __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
1877 }
1878
1879 /// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1880 ///     memory location.
1881 ///
1882 /// \headerfile <x86intrin.h>
1883 ///
1884 /// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
1885 ///
1886 /// \param __p
1887 ///    A pointer to a memory location that will receive the float values.
1888 /// \param __a
1889 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1890 static __inline__ void __DEFAULT_FN_ATTRS
1891 _mm_storel_pi(__m64 *__p, __m128 __a)
1892 {
1893   __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
1894 }
1895
1896 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1897 ///     memory location.
1898 ///
1899 /// \headerfile <x86intrin.h>
1900 ///
1901 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
1902 ///
1903 /// \param __p
1904 ///    A pointer to a 32-bit memory location.
1905 /// \param __a
1906 ///    A 128-bit vector of [4 x float] containing the value to be stored.
1907 static __inline__ void __DEFAULT_FN_ATTRS
1908 _mm_store_ss(float *__p, __m128 __a)
1909 {
1910   struct __mm_store_ss_struct {
1911     float __u;
1912   } __attribute__((__packed__, __may_alias__));
1913   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1914 }
1915
1916 /// \brief Stores float values from a 128-bit vector of [4 x float] to an
1917 ///    unaligned memory location.
1918 ///
1919 /// \headerfile <x86intrin.h>
1920 ///
1921 /// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
1922 ///
1923 /// \param __p
1924 ///    A pointer to a 128-bit memory location. The address of the memory
1925 ///    location does not have to be aligned.
1926 /// \param __a
1927 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1928 static __inline__ void __DEFAULT_FN_ATTRS
1929 _mm_storeu_ps(float *__p, __m128 __a)
1930 {
1931   struct __storeu_ps {
1932     __m128 __v;
1933   } __attribute__((__packed__, __may_alias__));
1934   ((struct __storeu_ps*)__p)->__v = __a;
1935 }
1936
1937 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
1938 ///    four contiguous elements in an aligned memory location.
1939 ///
1940 /// \headerfile <x86intrin.h>
1941 ///
1942 /// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
1943 ///    instruction.
1944 ///
1945 /// \param __p
1946 ///    A pointer to a 128-bit memory location.
1947 /// \param __a
1948 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
1949 ///    of the four contiguous elements pointed by __p.
1950 static __inline__ void __DEFAULT_FN_ATTRS
1951 _mm_store_ps(float *__p, __m128 __a)
1952 {
1953   *(__m128*)__p = __a;
1954 }
1955
1956 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
1957 ///    four contiguous elements in an aligned memory location.
1958 ///
1959 /// \headerfile <x86intrin.h>
1960 ///
1961 /// This intrinsic corresponds to \c VMOVAPS / MOVAPS + \c shuffling
1962 ///    instruction.
1963 ///
1964 /// \param __p
1965 ///    A pointer to a 128-bit memory location.
1966 /// \param __a
1967 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
1968 ///    of the four contiguous elements pointed by __p.
1969 static __inline__ void __DEFAULT_FN_ATTRS
1970 _mm_store1_ps(float *__p, __m128 __a)
1971 {
1972   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
1973   _mm_store_ps(__p, __a);
1974 }
1975
1976 /// \brief Stores float values from a 128-bit vector of [4 x float] to an
1977 ///    aligned memory location.
1978 ///
1979 /// \headerfile <x86intrin.h>
1980 ///
1981 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
1982 ///
1983 /// \param __p
1984 ///    A pointer to a 128-bit memory location. The address of the memory
1985 ///    location has to be 128-bit aligned.
1986 /// \param __a
1987 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1988 static __inline__ void __DEFAULT_FN_ATTRS
1989 _mm_store_ps1(float *__p, __m128 __a)
1990 {
1991   return _mm_store1_ps(__p, __a);
1992 }
1993
1994 /// \brief Stores float values from a 128-bit vector of [4 x float] to an
1995 ///    aligned memory location in reverse order.
1996 ///
1997 /// \headerfile <x86intrin.h>
1998 ///
1999 /// This intrinsic corresponds to the \c VMOVAPS / MOVAPS + \c shuffling
2000 ///    instruction.
2001 ///
2002 /// \param __p
2003 ///    A pointer to a 128-bit memory location. The address of the memory
2004 ///    location has to be 128-bit aligned.
2005 /// \param __a
2006 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2007 static __inline__ void __DEFAULT_FN_ATTRS
2008 _mm_storer_ps(float *__p, __m128 __a)
2009 {
2010   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2011   _mm_store_ps(__p, __a);
2012 }
2013
/* Hint selectors accepted as the sel argument of _mm_prefetch(). */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
2018
2019 #ifndef _MSC_VER
2020 /* FIXME: We have to #define this because "sel" must be a constant integer, and
2021    Sema doesn't do any form of constant propagation yet. */
2022
/// \brief Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the \c PREFETCHNTA, \c PREFETCHT0,
///    \c PREFETCHT1 or \c PREFETCHT2 instruction, depending on \a sel.
///
/// \param a
///    A pointer to a memory location containing a cache line of data. The
///    pointer is only used as a read hint; const-qualified pointers are
///    accepted without discarding the qualifier.
/// \param sel
///    A predefined integer constant specifying the type of prefetch operation:
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint.
///    The PREFETCHNTA instruction will be generated.
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated.
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated.
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated.
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), 0, (sel)))
2047 #endif
2048
2049 /// \brief Stores a 64-bit integer in the specified aligned memory location. To
2050 ///    minimize caching, the data is flagged as non-temporal (unlikely to be
2051 ///    used again soon).
2052 ///
2053 /// \headerfile <x86intrin.h>
2054 ///
2055 /// This intrinsic corresponds to the \c MOVNTQ instruction.
2056 ///
2057 /// \param __p
2058 ///    A pointer to an aligned memory location used to store the register value.
2059 /// \param __a
2060 ///    A 64-bit integer containing the value to be stored.
2061 static __inline__ void __DEFAULT_FN_ATTRS
2062 _mm_stream_pi(__m64 *__p, __m64 __a)
2063 {
2064   __builtin_ia32_movntq(__p, __a);
2065 }
2066
2067 /// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
2068 ///    128-bit aligned memory location. To minimize caching, the data is flagged
2069 ///    as non-temporal (unlikely to be used again soon).
2070 ///
2071 /// \headerfile <x86intrin.h>
2072 ///
2073 /// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
2074 ///
2075 /// \param __p
2076 ///    A pointer to a 128-bit aligned memory location that will receive the
2077 ///    integer values.
2078 /// \param __a
2079 ///    A 128-bit vector of [4 x float] containing the values to be moved.
2080 static __inline__ void __DEFAULT_FN_ATTRS
2081 _mm_stream_ps(float *__p, __m128 __a)
2082 {
2083   __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2084 }
2085
2086 /// \brief Forces strong memory ordering (serialization) between store
2087 ///    instructions preceding this instruction and store instructions following
2088 ///    this instruction, ensuring the system completes all previous stores
2089 ///    before executing subsequent stores.
2090 ///
2091 /// \headerfile <x86intrin.h>
2092 ///
2093 /// This intrinsic corresponds to the \c SFENCE instruction.
2094 ///
2095 static __inline__ void __DEFAULT_FN_ATTRS
2096 _mm_sfence(void)
2097 {
2098   __builtin_ia32_sfence();
2099 }
2100
/// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted:
///    0: Bits [15:0] are copied to the destination.
///    1: Bits [31:16] are copied to the destination.
///    2: Bits [47:32] are copied to the destination.
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
#define _mm_extract_pi16(a, n) __extension__ ({ \
  (int)__builtin_ia32_vec_ext_v4hi((__m64)(a), (int)(n)); })
2119
/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the value from operand d:
///    0: The value is written to bits [15:0] of the destination.
///    1: The value is written to bits [31:16] of the destination.
///    2: The value is written to bits [47:32] of the destination.
///    3: The value is written to bits [63:48] of the destination.
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
#define _mm_insert_pi16(a, d, n) __extension__ ({ \
  (__m64)__builtin_ia32_vec_set_v4hi((__m64)(a), (int)(d), (int)(n)); })
2146
2147 /// \brief Compares each of the corresponding packed 16-bit integer values of
2148 ///    the 64-bit integer vectors, and writes the greater value to the
2149 ///    corresponding bits in the destination.
2150 ///
2151 /// \headerfile <x86intrin.h>
2152 ///
2153 /// This intrinsic corresponds to the \c PMAXSW instruction.
2154 ///
2155 /// \param __a
2156 ///    A 64-bit integer vector containing one of the source operands.
2157 /// \param __b
2158 ///    A 64-bit integer vector containing one of the source operands.
2159 /// \returns A 64-bit integer vector containing the comparison results.
2160 static __inline__ __m64 __DEFAULT_FN_ATTRS
2161 _mm_max_pi16(__m64 __a, __m64 __b)
2162 {
2163   return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2164 }
2165
2166 /// \brief Compares each of the corresponding packed 8-bit unsigned integer
2167 ///    values of the 64-bit integer vectors, and writes the greater value to the
2168 ///    corresponding bits in the destination.
2169 ///
2170 /// \headerfile <x86intrin.h>
2171 ///
2172 /// This intrinsic corresponds to the \c PMAXUB instruction.
2173 ///
2174 /// \param __a
2175 ///    A 64-bit integer vector containing one of the source operands.
2176 /// \param __b
2177 ///    A 64-bit integer vector containing one of the source operands.
2178 /// \returns A 64-bit integer vector containing the comparison results.
2179 static __inline__ __m64 __DEFAULT_FN_ATTRS
2180 _mm_max_pu8(__m64 __a, __m64 __b)
2181 {
2182   return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2183 }
2184
2185 /// \brief Compares each of the corresponding packed 16-bit integer values of
2186 ///    the 64-bit integer vectors, and writes the lesser value to the
2187 ///    corresponding bits in the destination.
2188 ///
2189 /// \headerfile <x86intrin.h>
2190 ///
2191 /// This intrinsic corresponds to the \c PMINSW instruction.
2192 ///
2193 /// \param __a
2194 ///    A 64-bit integer vector containing one of the source operands.
2195 /// \param __b
2196 ///    A 64-bit integer vector containing one of the source operands.
2197 /// \returns A 64-bit integer vector containing the comparison results.
2198 static __inline__ __m64 __DEFAULT_FN_ATTRS
2199 _mm_min_pi16(__m64 __a, __m64 __b)
2200 {
2201   return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2202 }
2203
2204 /// \brief Compares each of the corresponding packed 8-bit unsigned integer
2205 ///    values of the 64-bit integer vectors, and writes the lesser value to the
2206 ///    corresponding bits in the destination.
2207 ///
2208 /// \headerfile <x86intrin.h>
2209 ///
2210 /// This intrinsic corresponds to the \c PMINUB instruction.
2211 ///
2212 /// \param __a
2213 ///    A 64-bit integer vector containing one of the source operands.
2214 /// \param __b
2215 ///    A 64-bit integer vector containing one of the source operands.
2216 /// \returns A 64-bit integer vector containing the comparison results.
2217 static __inline__ __m64 __DEFAULT_FN_ATTRS
2218 _mm_min_pu8(__m64 __a, __m64 __b)
2219 {
2220   return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2221 }
2222
2223 /// \brief Takes the most significant bit from each 8-bit element in a 64-bit
2224 ///    integer vector to create a 16-bit mask value. Zero-extends the value to
2225 ///    32-bit integer and writes it to the destination.
2226 ///
2227 /// \headerfile <x86intrin.h>
2228 ///
2229 /// This intrinsic corresponds to the \c PMOVMSKB instruction.
2230 ///
2231 /// \param __a
2232 ///    A 64-bit integer vector containing the values with bits to be extracted.
2233 /// \returns The most significant bit from each 8-bit element in the operand,
2234 ///    written to bits [15:0].
2235 static __inline__ int __DEFAULT_FN_ATTRS
2236 _mm_movemask_pi8(__m64 __a)
2237 {
2238   return __builtin_ia32_pmovmskb((__v8qi)__a);
2239 }
2240
2241 /// \brief Multiplies packed 16-bit unsigned integer values and writes the
2242 ///    high-order 16 bits of each 32-bit product to the corresponding bits in
2243 ///    the destination.
2244 ///
2245 /// \headerfile <x86intrin.h>
2246 ///
2247 /// This intrinsic corresponds to the \c PMULHUW instruction.
2248 ///
2249 /// \param __a
2250 ///    A 64-bit integer vector containing one of the source operands.
2251 /// \param __b
2252 ///    A 64-bit integer vector containing one of the source operands.
2253 /// \returns A 64-bit integer vector containing the products of both operands.
2254 static __inline__ __m64 __DEFAULT_FN_ATTRS
2255 _mm_mulhi_pu16(__m64 __a, __m64 __b)
2256 {
2257   return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2258 }
2259
/// \brief Rearranges the four 16-bit integers of a 64-bit integer vector into
///    the result, as selected by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PSHUFW instruction.
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate 8-bit value made of four 2-bit selectors, one per 16-bit
///    element of the result:
///    Bits [1:0] select the source for bits [15:0] of the result.
///    Bits [3:2] select the source for bits [31:16] of the result.
///    Bits [5:4] select the source for bits [47:32] of the result.
///    Bits [7:6] select the source for bits [63:48] of the result.
///    Selector values:
///    00: taken from bits [15:0] of a.
///    01: taken from bits [31:16] of a.
///    10: taken from bits [47:32] of a.
///    11: taken from bits [63:48] of a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
  __extension__ ({ (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
2289
2290 /// \brief Conditionally copies the values from each 8-bit element in the first
2291 ///    64-bit integer vector operand to the specified memory location, as
2292 ///    specified by the most significant bit in the corresponding element in the
2293 ///    second 64-bit integer vector operand. To minimize caching, the data is
2294 ///    flagged as non-temporal (unlikely to be used again soon).
2295 ///
2296 /// \headerfile <x86intrin.h>
2297 ///
2298 /// This intrinsic corresponds to the \c MASKMOVQ instruction.
2299 ///
2300 /// \param __d
2301 ///    A 64-bit integer vector containing the values with elements to be copied.
2302 /// \param __n
2303 ///    A 64-bit integer vector operand. The most significant bit from each 8-bit
2304 ///    element determines whether the corresponding element in operand __d is
2305 ///    copied. If the most significant bit of a given element is 1, the
2306 ///    corresponding element in operand __d is copied.
2307 /// \param __p
2308 ///    A pointer to a 64-bit memory location that will receive the conditionally
2309 ///    copied integer values. The address of the memory location does not have
2310 ///    to be aligned.
2311 static __inline__ void __DEFAULT_FN_ATTRS
2312 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2313 {
2314   __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2315 }
2316
2317 /// \brief Computes the rounded averages of the packed unsigned 8-bit integer
2318 ///    values and writes the averages to the corresponding bits in the
2319 ///    destination.
2320 ///
2321 /// \headerfile <x86intrin.h>
2322 ///
2323 /// This intrinsic corresponds to the \c PAVGB instruction.
2324 ///
2325 /// \param __a
2326 ///    A 64-bit integer vector containing one of the source operands.
2327 /// \param __b
2328 ///    A 64-bit integer vector containing one of the source operands.
2329 /// \returns A 64-bit integer vector containing the averages of both operands.
2330 static __inline__ __m64 __DEFAULT_FN_ATTRS
2331 _mm_avg_pu8(__m64 __a, __m64 __b)
2332 {
2333   return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2334 }
2335
2336 /// \brief Computes the rounded averages of the packed unsigned 16-bit integer
2337 ///    values and writes the averages to the corresponding bits in the
2338 ///    destination.
2339 ///
2340 /// \headerfile <x86intrin.h>
2341 ///
2342 /// This intrinsic corresponds to the \c PAVGW instruction.
2343 ///
2344 /// \param __a
2345 ///    A 64-bit integer vector containing one of the source operands.
2346 /// \param __b
2347 ///    A 64-bit integer vector containing one of the source operands.
2348 /// \returns A 64-bit integer vector containing the averages of both operands.
2349 static __inline__ __m64 __DEFAULT_FN_ATTRS
2350 _mm_avg_pu16(__m64 __a, __m64 __b)
2351 {
2352   return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2353 }
2354
2355 /// \brief Subtracts the corresponding 8-bit unsigned integer values of the two
2356 ///    64-bit vector operands and computes the absolute value for each of the
2357 ///    difference. Then sum of the 8 absolute differences is written to the
2358 ///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2359 ///
2360 /// \headerfile <x86intrin.h>
2361 ///
2362 /// This intrinsic corresponds to the \c PSADBW instruction.
2363 ///
2364 /// \param __a
2365 ///    A 64-bit integer vector containing one of the source operands.
2366 /// \param __b
2367 ///    A 64-bit integer vector containing one of the source operands.
2368 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2369 ///    sets of absolute differences between both operands. The upper bits are
2370 ///    cleared.
2371 static __inline__ __m64 __DEFAULT_FN_ATTRS
2372 _mm_sad_pu8(__m64 __a, __m64 __b)
2373 {
2374   return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2375 }
2376
2377 /// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
2378 ///    integer value. There are several groups of macros associated with this
2379 ///    intrinsic, including:
2380 ///    * For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2381 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2382 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2383 ///      _MM_GET_EXCEPTION_STATE().
2384 ///    * For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2385 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2386 ///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2387 ///    * For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2388 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2389 ///      _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
2390 ///    * For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2391 ///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2392 ///    * For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2393 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2394 ///      _MM_GET_DENORMALS_ZERO_MODE().
2395 ///
2396 ///    For example, the expression below checks if an overflow exception has
2397 ///    occurred:
2398 ///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2399 ///
2400 ///    The following example gets the current rounding mode:
2401 ///      _MM_GET_ROUNDING_MODE()
2402 ///
2403 /// \headerfile <x86intrin.h>
2404 ///
2405 /// This intrinsic corresponds to the \c VSTMXCSR / STMXCSR instruction.
2406 ///
2407 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2408 ///    register.
2409 static __inline__ unsigned int __DEFAULT_FN_ATTRS
2410 _mm_getcsr(void)
2411 {
2412   return __builtin_ia32_stmxcsr();
2413 }
2414
2415 /// \brief Sets the MXCSR register with the 32-bit unsigned integer value. There
2416 ///    are several groups of macros associated with this intrinsic, including:
2417 ///    * For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2418 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2419 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2420 ///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2421 ///    * For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2422 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2423 ///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2424 ///      of these macros.
2425 ///    * For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2426 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2427 ///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2428 ///    * For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2429 ///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2430 ///      one of these macros.
2431 ///    * For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2432 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2433 ///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2434 ///
2435 ///    For example, the following expression causes subsequent floating-point
2436 ///    operations to round up:
2437 ///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
2438 ///
2439 ///    The following example sets the DAZ and FTZ flags:
2440 ///      void setFlags() {
2441 ///        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)
2442 ///        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)
2443 ///      }
2444 ///
2445 /// \headerfile <x86intrin.h>
2446 ///
2447 /// This intrinsic corresponds to the \c VLDMXCSR / LDMXCSR instruction.
2448 ///
2449 /// \param __i
2450 ///    A 32-bit unsigned integer value to be written to the MXCSR register.
2451 static __inline__ void __DEFAULT_FN_ATTRS
2452 _mm_setcsr(unsigned int __i)
2453 {
2454   __builtin_ia32_ldmxcsr(__i);
2455 }
2456
/// \brief Builds a 128-bit vector of [4 x float] by picking four elements from
///    the two 128-bit operands of [4 x float], as directed by the immediate
///    value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the \c VSHUFPS / SHUFPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from a and b.
///    Bits [3:0] specify the values copied from operand a.
///    Bits [7:4] specify the values copied from operand b. The destinations
///    within the 128-bit destination are assigned values as follows:
///    Bits [1:0] are used to assign values to bits [31:0] in the destination.
///    Bits [3:2] are used to assign values to bits [63:32] in the destination.
///    Bits [5:4] are used to assign values to bits [95:64] in the destination.
///    Bits [7:6] are used to assign values to bits [127:96] in the destination.
///    Bit value assignments:
///    00: Bits [31:0] copied from the specified operand.
///    01: Bits [63:32] copied from the specified operand.
///    10: Bits [95:64] copied from the specified operand.
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), \
                                  (__v4sf)(__m128)(b), \
                                  /* indices 0-3 pick from a */ \
                                  ((mask) & 0x3), \
                                  (((mask) >> 2) & 0x3), \
                                  /* indices 4-7 pick from b */ \
                                  ((((mask) >> 4) & 0x3) + 4), \
                                  ((((mask) >> 6) & 0x3) + 4)); })
2494
2495 /// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2496 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x
2497 ///    float].
2498 ///
2499 /// \headerfile <x86intrin.h>
2500 ///
2501 /// This intrinsic corresponds to the \c VUNPCKHPS / UNPCKHPS instruction.
2502 ///
2503 /// \param __a
2504 ///    A 128-bit vector of [4 x float].
2505 ///    Bits [95:64] are written to bits [31:0] of the destination.
2506 ///    Bits [127:96] are written to bits [95:64] of the destination.
2507 /// \param __b
2508 ///    A 128-bit vector of [4 x float].
2509 ///    Bits [95:64] are written to bits [63:32] of the destination.
2510 ///    Bits [127:96] are written to bits [127:96] of the destination.
2511 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2512 static __inline__ __m128 __DEFAULT_FN_ATTRS
2513 _mm_unpackhi_ps(__m128 __a, __m128 __b)
2514 {
2515   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2516 }
2517
2518 /// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2519 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x
2520 ///    float].
2521 ///
2522 /// \headerfile <x86intrin.h>
2523 ///
2524 /// This intrinsic corresponds to the \c VUNPCKLPS / UNPCKLPS instruction.
2525 ///
2526 /// \param __a
2527 ///    A 128-bit vector of [4 x float].
2528 ///    Bits [31:0] are written to bits [31:0] of the destination.
2529 ///    Bits [63:32] are written to bits [95:64] of the destination.
2530 /// \param __b
2531 ///    A 128-bit vector of [4 x float].
2532 ///    Bits [31:0] are written to bits [63:32] of the destination.
2533 ///    Bits [63:32] are written to bits [127:96] of the destination.
2534 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2535 static __inline__ __m128 __DEFAULT_FN_ATTRS
2536 _mm_unpacklo_ps(__m128 __a, __m128 __b)
2537 {
2538   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2539 }
2540
2541 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2542 ///    32 bits are set to the lower 32 bits of the second parameter. The upper
2543 ///    96 bits are set to the upper 96 bits of the first parameter.
2544 ///
2545 /// \headerfile <x86intrin.h>
2546 ///
2547 /// This intrinsic corresponds to the \c VMOVSS / MOVSS instruction.
2548 ///
2549 /// \param __a
2550 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2551 ///    written to the upper 96 bits of the result.
2552 /// \param __b
2553 ///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2554 ///    written to the lower 32 bits of the result.
2555 /// \returns A 128-bit floating-point vector of [4 x float].
2556 static __inline__ __m128 __DEFAULT_FN_ATTRS
2557 _mm_move_ss(__m128 __a, __m128 __b)
2558 {
2559   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3);
2560 }
2561
2562 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2563 ///    64 bits are set to the upper 64 bits of the second parameter. The upper
2564 ///    64 bits are set to the upper 64 bits of the first parameter.
2565 ///
2566 /// \headerfile <x86intrin.h>
2567 ///
2568 /// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction.
2569 ///
2570 /// \param __a
2571 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2572 ///    written to the upper 64 bits of the result.
2573 /// \param __b
2574 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2575 ///    written to the lower 64 bits of the result.
2576 /// \returns A 128-bit floating-point vector of [4 x float].
2577 static __inline__ __m128 __DEFAULT_FN_ATTRS
2578 _mm_movehl_ps(__m128 __a, __m128 __b)
2579 {
2580   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2581 }
2582
2583 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2584 ///    64 bits are set to the lower 64 bits of the first parameter. The upper
2585 ///    64 bits are set to the lower 64 bits of the second parameter.
2586 ///
2587 /// \headerfile <x86intrin.h>
2588 ///
2589 /// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
2590 ///
2591 /// \param __a
2592 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2593 ///    written to the lower 64 bits of the result.
2594 /// \param __b
2595 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2596 ///    written to the upper 64 bits of the result.
2597 /// \returns A 128-bit floating-point vector of [4 x float].
2598 static __inline__ __m128 __DEFAULT_FN_ATTRS
2599 _mm_movelh_ps(__m128 __a, __m128 __b)
2600 {
2601   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2602 }
2603
2604 /// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2605 ///    float].
2606 ///
2607 /// \headerfile <x86intrin.h>
2608 ///
2609 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2610 ///
2611 /// \param __a
2612 ///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
2613 ///    from the corresponding elements in this operand.
2614 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2615 ///    values from the operand.
2616 static __inline__ __m128 __DEFAULT_FN_ATTRS
2617 _mm_cvtpi16_ps(__m64 __a)
2618 {
2619   __m64 __b, __c;
2620   __m128 __r;
2621
2622   __b = _mm_setzero_si64();
2623   __b = _mm_cmpgt_pi16(__b, __a);
2624   __c = _mm_unpackhi_pi16(__a, __b);
2625   __r = _mm_setzero_ps();
2626   __r = _mm_cvtpi32_ps(__r, __c);
2627   __r = _mm_movelh_ps(__r, __r);
2628   __c = _mm_unpacklo_pi16(__a, __b);
2629   __r = _mm_cvtpi32_ps(__r, __c);
2630
2631   return __r;
2632 }
2633
2634 /// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a
2635 ///    128-bit vector of [4 x float].
2636 ///
2637 /// \headerfile <x86intrin.h>
2638 ///
2639 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2640 ///
2641 /// \param __a
2642 ///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
2643 ///    destination are copied from the corresponding elements in this operand.
2644 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2645 ///    values from the operand.
2646 static __inline__ __m128 __DEFAULT_FN_ATTRS
2647 _mm_cvtpu16_ps(__m64 __a)
2648 {
2649   __m64 __b, __c;
2650   __m128 __r;
2651
2652   __b = _mm_setzero_si64();
2653   __c = _mm_unpackhi_pi16(__a, __b);
2654   __r = _mm_setzero_ps();
2655   __r = _mm_cvtpi32_ps(__r, __c);
2656   __r = _mm_movelh_ps(__r, __r);
2657   __c = _mm_unpacklo_pi16(__a, __b);
2658   __r = _mm_cvtpi32_ps(__r, __c);
2659
2660   return __r;
2661 }
2662
2663 /// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2664 ///    into a 128-bit vector of [4 x float].
2665 ///
2666 /// \headerfile <x86intrin.h>
2667 ///
2668 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2669 ///
2670 /// \param __a
2671 ///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
2672 ///    from the corresponding lower 4 elements in this operand.
2673 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2674 ///    values from the operand.
2675 static __inline__ __m128 __DEFAULT_FN_ATTRS
2676 _mm_cvtpi8_ps(__m64 __a)
2677 {
2678   __m64 __b;
2679
2680   __b = _mm_setzero_si64();
2681   __b = _mm_cmpgt_pi8(__b, __a);
2682   __b = _mm_unpacklo_pi8(__a, __b);
2683
2684   return _mm_cvtpi16_ps(__b);
2685 }
2686
2687 /// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
2688 ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
2689 ///
2690 /// \headerfile <x86intrin.h>
2691 ///
2692 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2693 ///
2694 /// \param __a
2695 ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
2696 ///    destination are copied from the corresponding lower 4 elements in this
2697 ///    operand.
2698 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2699 ///    values from the source operand.
2700 static __inline__ __m128 __DEFAULT_FN_ATTRS
2701 _mm_cvtpu8_ps(__m64 __a)
2702 {
2703   __m64 __b;
2704
2705   __b = _mm_setzero_si64();
2706   __b = _mm_unpacklo_pi8(__a, __b);
2707
2708   return _mm_cvtpi16_ps(__b);
2709 }
2710
2711 /// \brief Converts the two 32-bit signed integer values from each 64-bit vector
2712 ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
2713 ///
2714 /// \headerfile <x86intrin.h>
2715 ///
2716 /// This intrinsic corresponds to the \c CVTPI2PS + \c COMPOSITE instruction.
2717 ///
2718 /// \param __a
2719 ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
2720 ///    copied from the elements in this operand.
2721 /// \param __b
2722 ///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
2723 ///    copied from the elements in this operand.
2724 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2725 ///    copied and converted values from the first operand. The upper 64 bits
2726 ///    contain the copied and converted values from the second operand.
2727 static __inline__ __m128 __DEFAULT_FN_ATTRS
2728 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2729 {
2730   __m128 __c;
2731
2732   __c = _mm_setzero_ps();
2733   __c = _mm_cvtpi32_ps(__c, __b);
2734   __c = _mm_movelh_ps(__c, __c);
2735
2736   return _mm_cvtpi32_ps(__c, __a);
2737 }
2738
2739 /// \brief Converts each single-precision floating-point element of a 128-bit
2740 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
2741 ///    packs the results into a 64-bit integer vector of [4 x i16]. If the
2742 ///    floating-point element is NaN or infinity, or if the floating-point
2743 ///    element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
2744 ///    to 0x8000. Otherwise if the floating-point element is greater
2745 ///    than 0x7FFF, it is converted to 0x7FFF.
2746 ///
2747 /// \headerfile <x86intrin.h>
2748 ///
2749 /// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
2750 ///
2751 /// \param __a
2752 ///    A 128-bit floating-point vector of [4 x float].
2753 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2754 ///    values.
2755 static __inline__ __m64 __DEFAULT_FN_ATTRS
2756 _mm_cvtps_pi16(__m128 __a)
2757 {
2758   __m64 __b, __c;
2759
2760   __b = _mm_cvtps_pi32(__a);
2761   __a = _mm_movehl_ps(__a, __a);
2762   __c = _mm_cvtps_pi32(__a);
2763
2764   return _mm_packs_pi32(__b, __c);
2765 }
2766
2767 /// \brief Converts each single-precision floating-point element of a 128-bit
2768 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
2769 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
2770 ///    [8 x i8]. The upper 32 bits of the vector are set to 0. If the
2771 ///    floating-point element is NaN or infinity, or if the floating-point
2772 ///    element is greater than 0x7FFFFFFF or less than -0x80, it is converted
2773 ///    to 0x80. Otherwise if the floating-point element is greater
2774 ///    than 0x7F, it is converted to 0x7F.
2775 ///
2776 /// \headerfile <x86intrin.h>
2777 ///
2778 /// This intrinsic corresponds to the \c CVTPS2PI + \c COMPOSITE instruction.
2779 ///
2780 /// \param __a
2781 ///    128-bit floating-point vector of [4 x float].
2782 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2783 ///    converted values and the uppper 32 bits are set to zero.
2784 static __inline__ __m64 __DEFAULT_FN_ATTRS
2785 _mm_cvtps_pi8(__m128 __a)
2786 {
2787   __m64 __b, __c;
2788
2789   __b = _mm_cvtps_pi16(__a);
2790   __c = _mm_setzero_si64();
2791
2792   return _mm_packs_pi16(__b, __c);
2793 }
2794
2795 /// \brief Extracts the sign bits from each single-precision floating-point
2796 ///    element of a 128-bit floating-point vector of [4 x float] and returns the
2797 ///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2798 ///    to zero.
2799 ///
2800 /// \headerfile <x86intrin.h>
2801 ///
2802 /// This intrinsic corresponds to the \c VMOVMSKPS / MOVMSKPS instruction.
2803 ///
2804 /// \param __a
2805 ///    A 128-bit floating-point vector of [4 x float].
2806 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2807 ///    single-precision floating-point element of the parameter. Bits [31:4] are
2808 ///    set to zero.
2809 static __inline__ int __DEFAULT_FN_ATTRS
2810 _mm_movemask_ps(__m128 __a)
2811 {
2812   return __builtin_ia32_movmskps((__v4sf)__a);
2813 }
2814
2815
/* Declaration attribute requesting 16-byte alignment. The reserved
   __aligned__ spelling is used so that a user macro named `aligned` cannot
   change the expansion. */
#define _MM_ALIGN16 __attribute__((__aligned__(16)))
2817
/* Packs four selectors into one immediate: w occupies bits [1:0], x bits
   [3:2], y bits [5:4] and z bits [7:6]. The arguments are not masked, so each
   one is expected to be in the range 0..3. */
#define _MM_SHUFFLE(z, y, x, w) \
  ((w) | ((x) << 2) | ((y) << 4) | ((z) << 6))
2819
/* MXCSR exception state flags: bits [5:0]. */
#define _MM_EXCEPT_INVALID    (1 << 0)     /* 0x0001 */
#define _MM_EXCEPT_DENORM     (1 << 1)     /* 0x0002 */
#define _MM_EXCEPT_DIV_ZERO   (1 << 2)     /* 0x0004 */
#define _MM_EXCEPT_OVERFLOW   (1 << 3)     /* 0x0008 */
#define _MM_EXCEPT_UNDERFLOW  (1 << 4)     /* 0x0010 */
#define _MM_EXCEPT_INEXACT    (1 << 5)     /* 0x0020 */
#define _MM_EXCEPT_MASK       (0x3f << 0)  /* 0x003f */

/* MXCSR exception mask bits: bits [12:7], in the same order as the flags. */
#define _MM_MASK_INVALID      (1 << 7)     /* 0x0080 */
#define _MM_MASK_DENORM       (1 << 8)     /* 0x0100 */
#define _MM_MASK_DIV_ZERO     (1 << 9)     /* 0x0200 */
#define _MM_MASK_OVERFLOW     (1 << 10)    /* 0x0400 */
#define _MM_MASK_UNDERFLOW    (1 << 11)    /* 0x0800 */
#define _MM_MASK_INEXACT      (1 << 12)    /* 0x1000 */
#define _MM_MASK_MASK         (0x3f << 7)  /* 0x1f80 */

/* MXCSR rounding mode: two-bit field in bits [14:13]. */
#define _MM_ROUND_NEAREST     (0 << 13)    /* 0x0000 */
#define _MM_ROUND_DOWN        (1 << 13)    /* 0x2000 */
#define _MM_ROUND_UP          (2 << 13)    /* 0x4000 */
#define _MM_ROUND_TOWARD_ZERO (3 << 13)    /* 0x6000 */
#define _MM_ROUND_MASK        (3 << 13)    /* 0x6000 */

/* MXCSR flush-to-zero mode: bit 15. */
#define _MM_FLUSH_ZERO_MASK   (1 << 15)    /* 0x8000 */
#define _MM_FLUSH_ZERO_ON     (1 << 15)    /* 0x8000 */
#define _MM_FLUSH_ZERO_OFF    (0 << 15)    /* 0x0000 */
2845
/* Read one field of the MXCSR register: the current value from _mm_getcsr()
   is reduced to the bits selected by the field's mask. */
#define _MM_GET_EXCEPTION_MASK() \
  (_MM_MASK_MASK & _mm_getcsr())
#define _MM_GET_EXCEPTION_STATE() \
  (_MM_EXCEPT_MASK & _mm_getcsr())
#define _MM_GET_FLUSH_ZERO_MODE() \
  (_MM_FLUSH_ZERO_MASK & _mm_getcsr())
#define _MM_GET_ROUNDING_MODE() \
  (_MM_ROUND_MASK & _mm_getcsr())

/* Replace one field of the MXCSR register: the field's bits are cleared in
   the current value, (x) is OR-ed in, and the result is written back with
   _mm_setcsr(). (x) is not masked, so it should only contain bits that
   belong to the field being set. */
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((x) | (_mm_getcsr() & ~_MM_MASK_MASK)))
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((x) | (_mm_getcsr() & ~_MM_EXCEPT_MASK)))
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((x) | (_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK)))
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((x) | (_mm_getcsr() & ~_MM_ROUND_MASK)))
2855
/* Transposes, in place, the 4x4 matrix whose rows are the four __m128
   arguments: after expansion, element j of row i holds what element i of
   row j held before.

   Every argument is evaluated more than once and is assigned to, so each
   must be a modifiable __m128 lvalue without side effects.

   The temporaries use reserved (double-underscore) names so that they can
   never shadow the caller's own variables; with plain names such as tmp0, a
   call like _MM_TRANSPOSE4_PS(tmp0, tmp1, tmp2, tmp3) would operate on the
   macro's uninitialized locals instead of the caller's rows. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
2868
/* Aliases for compatibility: each _m_<name> spelling expands to the
   corresponding _mm_* intrinsic name. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* NOTE(review): `_m_` maps to the bare prefix `_mm_`, which is not itself an
   intrinsic name in this part of the file. It was previously defined twice
   with identical text; one definition is kept for backward compatibility --
   confirm whether it is still needed at all. */
#define _m_ _mm_
2885
2886 #undef __DEFAULT_FN_ATTRS
2887
2888 /* Ugly hack for backwards-compatibility (compatible with gcc) */
2889 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
2890 #include <emmintrin.h>
2891 #endif
2892
2893 #endif /* __XMMINTRIN_H */