contrib/llvm-project/clang/lib/Headers/xmmintrin.h

   1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 #ifndef __XMMINTRIN_H
  11 #define __XMMINTRIN_H
  12
  13 #include <mmintrin.h>
  14
  15 typedef int __v4si __attribute__((__vector_size__(16)));
  16 typedef float __v4sf __attribute__((__vector_size__(16)));
  17 typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
  18
  19 typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
  20
  21 /* Unsigned types */
  22 typedef unsigned int __v4su __attribute__((__vector_size__(16)));
  23
  24 /* This header should only be included in a hosted environment as it depends on
  25  * a standard library to provide allocation routines. */
  26 #if __STDC_HOSTED__
  27 #include <mm_malloc.h>
  28 #endif
  29
  30 /* Define the default attributes for the functions in this file. */
  31 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
  32 #define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
  33
  34 /// Adds the 32-bit float values in the low-order bits of the operands.
  35 ///
  36 /// \headerfile <x86intrin.h>
  37 ///
  38 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
  39 ///
  40 /// \param __a
  41 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  42 ///    The lower 32 bits of this operand are used in the calculation.
  43 /// \param __b
  44 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  45 ///    The lower 32 bits of this operand are used in the calculation.
  46 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
  47 ///    of the lower 32 bits of both operands. The upper 96 bits are copied from
  48 ///    the upper 96 bits of the first source operand.
  49 static __inline__ __m128 __DEFAULT_FN_ATTRS
  50 _mm_add_ss(__m128 __a, __m128 __b)
  51 {
  52   __a[0] += __b[0];
  53   return __a;
  54 }
  55
  56 /// Adds two 128-bit vectors of [4 x float], and returns the results of
  57 ///    the addition.
  58 ///
  59 /// \headerfile <x86intrin.h>
  60 ///
  61 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
  62 ///
  63 /// \param __a
  64 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  65 /// \param __b
  66 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  67 /// \returns A 128-bit vector of [4 x float] containing the sums of both
  68 ///    operands.
  69 static __inline__ __m128 __DEFAULT_FN_ATTRS
  70 _mm_add_ps(__m128 __a, __m128 __b)
  71 {
  72   return (__m128)((__v4sf)__a + (__v4sf)__b);
  73 }
  74
  75 /// Subtracts the 32-bit float value in the low-order bits of the second
  76 ///    operand from the corresponding value in the first operand.
  77 ///
  78 /// \headerfile <x86intrin.h>
  79 ///
  80 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
  81 ///
  82 /// \param __a
  83 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
  84 ///    of this operand are used in the calculation.
  85 /// \param __b
  86 ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
  87 ///    bits of this operand are used in the calculation.
  88 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
  89 ///    difference of the lower 32 bits of both operands. The upper 96 bits are
  90 ///    copied from the upper 96 bits of the first source operand.
  91 static __inline__ __m128 __DEFAULT_FN_ATTRS
  92 _mm_sub_ss(__m128 __a, __m128 __b)
  93 {
  94   __a[0] -= __b[0];
  95   return __a;
  96 }
  97
  98 /// Subtracts each of the values of the second operand from the first
  99 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
 100 ///    the results of the subtraction.
 101 ///
 102 /// \headerfile <x86intrin.h>
 103 ///
 104 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
 105 ///
 106 /// \param __a
 107 ///    A 128-bit vector of [4 x float] containing the minuend.
 108 /// \param __b
 109 ///    A 128-bit vector of [4 x float] containing the subtrahend.
 110 /// \returns A 128-bit vector of [4 x float] containing the differences between
 111 ///    both operands.
 112 static __inline__ __m128 __DEFAULT_FN_ATTRS
 113 _mm_sub_ps(__m128 __a, __m128 __b)
 114 {
 115   return (__m128)((__v4sf)__a - (__v4sf)__b);
 116 }
 117
 118 /// Multiplies two 32-bit float values in the low-order bits of the
 119 ///    operands.
 120 ///
 121 /// \headerfile <x86intrin.h>
 122 ///
 123 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
 124 ///
 125 /// \param __a
 126 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 127 ///    The lower 32 bits of this operand are used in the calculation.
 128 /// \param __b
 129 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 130 ///    The lower 32 bits of this operand are used in the calculation.
 131 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
 132 ///    32 bits of both operands. The upper 96 bits are copied from the upper 96
 133 ///    bits of the first source operand.
 134 static __inline__ __m128 __DEFAULT_FN_ATTRS
 135 _mm_mul_ss(__m128 __a, __m128 __b)
 136 {
 137   __a[0] *= __b[0];
 138   return __a;
 139 }
 140
 141 /// Multiplies two 128-bit vectors of [4 x float] and returns the
 142 ///    results of the multiplication.
 143 ///
 144 /// \headerfile <x86intrin.h>
 145 ///
 146 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
 147 ///
 148 /// \param __a
 149 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 150 /// \param __b
 151 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 152 /// \returns A 128-bit vector of [4 x float] containing the products of both
 153 ///    operands.
 154 static __inline__ __m128 __DEFAULT_FN_ATTRS
 155 _mm_mul_ps(__m128 __a, __m128 __b)
 156 {
 157   return (__m128)((__v4sf)__a * (__v4sf)__b);
 158 }
 159
 160 /// Divides the value in the low-order 32 bits of the first operand by
 161 ///    the corresponding value in the second operand.
 162 ///
 163 /// \headerfile <x86intrin.h>
 164 ///
 165 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
 166 ///
 167 /// \param __a
 168 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
 169 ///    bits of this operand are used in the calculation.
 170 /// \param __b
 171 ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
 172 ///    of this operand are used in the calculation.
 173 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
 174 ///    lower 32 bits of both operands. The upper 96 bits are copied from the
 175 ///    upper 96 bits of the first source operand.
 176 static __inline__ __m128 __DEFAULT_FN_ATTRS
 177 _mm_div_ss(__m128 __a, __m128 __b)
 178 {
 179   __a[0] /= __b[0];
 180   return __a;
 181 }
 182
 183 /// Divides two 128-bit vectors of [4 x float].
 184 ///
 185 /// \headerfile <x86intrin.h>
 186 ///
 187 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
 188 ///
 189 /// \param __a
 190 ///    A 128-bit vector of [4 x float] containing the dividend.
 191 /// \param __b
 192 ///    A 128-bit vector of [4 x float] containing the divisor.
 193 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
 194 ///    operands.
 195 static __inline__ __m128 __DEFAULT_FN_ATTRS
 196 _mm_div_ps(__m128 __a, __m128 __b)
 197 {
 198   return (__m128)((__v4sf)__a / (__v4sf)__b);
 199 }
 200
 201 /// Calculates the square root of the value stored in the low-order bits
 202 ///    of a 128-bit vector of [4 x float].
 203 ///
 204 /// \headerfile <x86intrin.h>
 205 ///
 206 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
 207 ///
 208 /// \param __a
 209 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 210 ///    used in the calculation.
 211 /// \returns A 128-bit vector of [4 x float] containing the square root of the
 212 ///    value in the low-order bits of the operand.
 213 static __inline__ __m128 __DEFAULT_FN_ATTRS
 214 _mm_sqrt_ss(__m128 __a)
 215 {
 216   return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
 217 }
 218
 219 /// Calculates the square roots of the values stored in a 128-bit vector
 220 ///    of [4 x float].
 221 ///
 222 /// \headerfile <x86intrin.h>
 223 ///
 224 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
 225 ///
 226 /// \param __a
 227 ///    A 128-bit vector of [4 x float].
 228 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
 229 ///    values in the operand.
 230 static __inline__ __m128 __DEFAULT_FN_ATTRS
 231 _mm_sqrt_ps(__m128 __a)
 232 {
 233   return __builtin_ia32_sqrtps((__v4sf)__a);
 234 }
 235
 236 /// Calculates the approximate reciprocal of the value stored in the
 237 ///    low-order bits of a 128-bit vector of [4 x float].
 238 ///
 239 /// \headerfile <x86intrin.h>
 240 ///
 241 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
 242 ///
 243 /// \param __a
 244 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 245 ///    used in the calculation.
 246 /// \returns A 128-bit vector of [4 x float] containing the approximate
 247 ///    reciprocal of the value in the low-order bits of the operand.
 248 static __inline__ __m128 __DEFAULT_FN_ATTRS
 249 _mm_rcp_ss(__m128 __a)
 250 {
 251   return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
 252 }
 253
 254 /// Calculates the approximate reciprocals of the values stored in a
 255 ///    128-bit vector of [4 x float].
 256 ///
 257 /// \headerfile <x86intrin.h>
 258 ///
 259 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
 260 ///
 261 /// \param __a
 262 ///    A 128-bit vector of [4 x float].
 263 /// \returns A 128-bit vector of [4 x float] containing the approximate
 264 ///    reciprocals of the values in the operand.
 265 static __inline__ __m128 __DEFAULT_FN_ATTRS
 266 _mm_rcp_ps(__m128 __a)
 267 {
 268   return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
 269 }
 270
 271 /// Calculates the approximate reciprocal of the square root of the value
 272 ///    stored in the low-order bits of a 128-bit vector of [4 x float].
 273 ///
 274 /// \headerfile <x86intrin.h>
 275 ///
 276 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
 277 ///
 278 /// \param __a
 279 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 280 ///    used in the calculation.
 281 /// \returns A 128-bit vector of [4 x float] containing the approximate
 282 ///    reciprocal of the square root of the value in the low-order bits of the
 283 ///    operand.
 284 static __inline__ __m128 __DEFAULT_FN_ATTRS
 285 _mm_rsqrt_ss(__m128 __a)
 286 {
 287   return __builtin_ia32_rsqrtss((__v4sf)__a);
 288 }
 289
 290 /// Calculates the approximate reciprocals of the square roots of the
 291 ///    values stored in a 128-bit vector of [4 x float].
 292 ///
 293 /// \headerfile <x86intrin.h>
 294 ///
 295 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
 296 ///
 297 /// \param __a
 298 ///    A 128-bit vector of [4 x float].
 299 /// \returns A 128-bit vector of [4 x float] containing the approximate
 300 ///    reciprocals of the square roots of the values in the operand.
 301 static __inline__ __m128 __DEFAULT_FN_ATTRS
 302 _mm_rsqrt_ps(__m128 __a)
 303 {
 304   return __builtin_ia32_rsqrtps((__v4sf)__a);
 305 }
 306
 307 /// Compares two 32-bit float values in the low-order bits of both
 308 ///    operands and returns the lesser value in the low-order bits of the
 309 ///    vector of [4 x float].
 310 ///
 311 /// \headerfile <x86intrin.h>
 312 ///
 313 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
 314 ///
 315 /// \param __a
 316 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 317 ///    32 bits of this operand are used in the comparison.
 318 /// \param __b
 319 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 320 ///    32 bits of this operand are used in the comparison.
 321 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
 322 ///    minimum value between both operands. The upper 96 bits are copied from
 323 ///    the upper 96 bits of the first source operand.
 324 static __inline__ __m128 __DEFAULT_FN_ATTRS
 325 _mm_min_ss(__m128 __a, __m128 __b)
 326 {
 327   return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
 328 }
 329
 330 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
 331 ///    of each pair of values.
 332 ///
 333 /// \headerfile <x86intrin.h>
 334 ///
 335 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
 336 ///
 337 /// \param __a
 338 ///    A 128-bit vector of [4 x float] containing one of the operands.
 339 /// \param __b
 340 ///    A 128-bit vector of [4 x float] containing one of the operands.
 341 /// \returns A 128-bit vector of [4 x float] containing the minimum values
 342 ///    between both operands.
 343 static __inline__ __m128 __DEFAULT_FN_ATTRS
 344 _mm_min_ps(__m128 __a, __m128 __b)
 345 {
 346   return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
 347 }
 348
 349 /// Compares two 32-bit float values in the low-order bits of both
 350 ///    operands and returns the greater value in the low-order bits of a 128-bit
 351 ///    vector of [4 x float].
 352 ///
 353 /// \headerfile <x86intrin.h>
 354 ///
 355 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
 356 ///
 357 /// \param __a
 358 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 359 ///    32 bits of this operand are used in the comparison.
 360 /// \param __b
 361 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 362 ///    32 bits of this operand are used in the comparison.
 363 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
 364 ///    maximum value between both operands. The upper 96 bits are copied from
 365 ///    the upper 96 bits of the first source operand.
 366 static __inline__ __m128 __DEFAULT_FN_ATTRS
 367 _mm_max_ss(__m128 __a, __m128 __b)
 368 {
 369   return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
 370 }
 371
 372 /// Compares two 128-bit vectors of [4 x float] and returns the greater
 373 ///    of each pair of values.
 374 ///
 375 /// \headerfile <x86intrin.h>
 376 ///
 377 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
 378 ///
 379 /// \param __a
 380 ///    A 128-bit vector of [4 x float] containing one of the operands.
 381 /// \param __b
 382 ///    A 128-bit vector of [4 x float] containing one of the operands.
 383 /// \returns A 128-bit vector of [4 x float] containing the maximum values
 384 ///    between both operands.
 385 static __inline__ __m128 __DEFAULT_FN_ATTRS
 386 _mm_max_ps(__m128 __a, __m128 __b)
 387 {
 388   return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
 389 }
 390
 391 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
 392 ///
 393 /// \headerfile <x86intrin.h>
 394 ///
 395 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
 396 ///
 397 /// \param __a
 398 ///    A 128-bit vector containing one of the source operands.
 399 /// \param __b
 400 ///    A 128-bit vector containing one of the source operands.
 401 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
 402 ///    values between both operands.
 403 static __inline__ __m128 __DEFAULT_FN_ATTRS
 404 _mm_and_ps(__m128 __a, __m128 __b)
 405 {
 406   return (__m128)((__v4su)__a & (__v4su)__b);
 407 }
 408
 409 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
 410 ///    the one's complement of the values contained in the first source
 411 ///    operand.
 412 ///
 413 /// \headerfile <x86intrin.h>
 414 ///
 415 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
 416 ///
 417 /// \param __a
 418 ///    A 128-bit vector of [4 x float] containing the first source operand. The
 419 ///    one's complement of this value is used in the bitwise AND.
 420 /// \param __b
 421 ///    A 128-bit vector of [4 x float] containing the second source operand.
 422 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
 423 ///    one's complement of the first operand and the values in the second
 424 ///    operand.
 425 static __inline__ __m128 __DEFAULT_FN_ATTRS
 426 _mm_andnot_ps(__m128 __a, __m128 __b)
 427 {
 428   return (__m128)(~(__v4su)__a & (__v4su)__b);
 429 }
 430
 431 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
 432 ///
 433 /// \headerfile <x86intrin.h>
 434 ///
 435 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
 436 ///
 437 /// \param __a
 438 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 439 /// \param __b
 440 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 441 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
 442 ///    values between both operands.
 443 static __inline__ __m128 __DEFAULT_FN_ATTRS
 444 _mm_or_ps(__m128 __a, __m128 __b)
 445 {
 446   return (__m128)((__v4su)__a | (__v4su)__b);
 447 }
 448
 449 /// Performs a bitwise exclusive OR of two 128-bit vectors of
 450 ///    [4 x float].
 451 ///
 452 /// \headerfile <x86intrin.h>
 453 ///
 454 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
 455 ///
 456 /// \param __a
 457 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 458 /// \param __b
 459 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 460 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
 461 ///    of the values between both operands.
 462 static __inline__ __m128 __DEFAULT_FN_ATTRS
 463 _mm_xor_ps(__m128 __a, __m128 __b)
 464 {
 465   return (__m128)((__v4su)__a ^ (__v4su)__b);
 466 }
 467
 468 /// Compares two 32-bit float values in the low-order bits of both
 469 ///    operands for equality and returns the result of the comparison in the
 470 ///    low-order bits of a vector [4 x float].
 471 ///
 472 /// \headerfile <x86intrin.h>
 473 ///
 474 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
 475 ///
 476 /// \param __a
 477 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 478 ///    32 bits of this operand are used in the comparison.
 479 /// \param __b
 480 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 481 ///    32 bits of this operand are used in the comparison.
 482 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 483 ///    in the low-order bits.
 484 static __inline__ __m128 __DEFAULT_FN_ATTRS
 485 _mm_cmpeq_ss(__m128 __a, __m128 __b)
 486 {
 487   return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
 488 }
 489
 490 /// Compares each of the corresponding 32-bit float values of the
 491 ///    128-bit vectors of [4 x float] for equality.
 492 ///
 493 /// \headerfile <x86intrin.h>
 494 ///
 495 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
 496 ///
 497 /// \param __a
 498 ///    A 128-bit vector of [4 x float].
 499 /// \param __b
 500 ///    A 128-bit vector of [4 x float].
 501 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 502 static __inline__ __m128 __DEFAULT_FN_ATTRS
 503 _mm_cmpeq_ps(__m128 __a, __m128 __b)
 504 {
 505   return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
 506 }
 507
 508 /// Compares two 32-bit float values in the low-order bits of both
 509 ///    operands to determine if the value in the first operand is less than the
 510 ///    corresponding value in the second operand and returns the result of the
 511 ///    comparison in the low-order bits of a vector of [4 x float].
 512 ///
 513 /// \headerfile <x86intrin.h>
 514 ///
 515 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
 516 ///
 517 /// \param __a
 518 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 519 ///    32 bits of this operand are used in the comparison.
 520 /// \param __b
 521 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 522 ///    32 bits of this operand are used in the comparison.
 523 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 524 ///    in the low-order bits.
 525 static __inline__ __m128 __DEFAULT_FN_ATTRS
 526 _mm_cmplt_ss(__m128 __a, __m128 __b)
 527 {
 528   return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
 529 }
 530
 531 /// Compares each of the corresponding 32-bit float values of the
 532 ///    128-bit vectors of [4 x float] to determine if the values in the first
 533 ///    operand are less than those in the second operand.
 534 ///
 535 /// \headerfile <x86intrin.h>
 536 ///
 537 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
 538 ///
 539 /// \param __a
 540 ///    A 128-bit vector of [4 x float].
 541 /// \param __b
 542 ///    A 128-bit vector of [4 x float].
 543 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 544 static __inline__ __m128 __DEFAULT_FN_ATTRS
 545 _mm_cmplt_ps(__m128 __a, __m128 __b)
 546 {
 547   return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
 548 }
 549
 550 /// Compares two 32-bit float values in the low-order bits of both
 551 ///    operands to determine if the value in the first operand is less than or
 552 ///    equal to the corresponding value in the second operand and returns the
 553 ///    result of the comparison in the low-order bits of a vector of
 554 ///    [4 x float].
 555 ///
 556 /// \headerfile <x86intrin.h>
 557 ///
 558 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
 559 ///
 560 /// \param __a
 561 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 562 ///    32 bits of this operand are used in the comparison.
 563 /// \param __b
 564 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 565 ///    32 bits of this operand are used in the comparison.
 566 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 567 ///    in the low-order bits.
 568 static __inline__ __m128 __DEFAULT_FN_ATTRS
 569 _mm_cmple_ss(__m128 __a, __m128 __b)
 570 {
 571   return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
 572 }
 573
 574 /// Compares each of the corresponding 32-bit float values of the
 575 ///    128-bit vectors of [4 x float] to determine if the values in the first
 576 ///    operand are less than or equal to those in the second operand.
 577 ///
 578 /// \headerfile <x86intrin.h>
 579 ///
 580 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
 581 ///
 582 /// \param __a
 583 ///    A 128-bit vector of [4 x float].
 584 /// \param __b
 585 ///    A 128-bit vector of [4 x float].
 586 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 587 static __inline__ __m128 __DEFAULT_FN_ATTRS
 588 _mm_cmple_ps(__m128 __a, __m128 __b)
 589 {
 590   return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
 591 }
 592
 593 /// Compares two 32-bit float values in the low-order bits of both
 594 ///    operands to determine if the value in the first operand is greater than
 595 ///    the corresponding value in the second operand and returns the result of
 596 ///    the comparison in the low-order bits of a vector of [4 x float].
 597 ///
 598 /// \headerfile <x86intrin.h>
 599 ///
 600 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
 601 ///
 602 /// \param __a
 603 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 604 ///    32 bits of this operand are used in the comparison.
 605 /// \param __b
 606 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 607 ///    32 bits of this operand are used in the comparison.
 608 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 609 ///    in the low-order bits.
 610 static __inline__ __m128 __DEFAULT_FN_ATTRS
 611 _mm_cmpgt_ss(__m128 __a, __m128 __b)
 612 {
 613   return (__m128)__builtin_shufflevector((__v4sf)__a,
 614                                          (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
 615                                          4, 1, 2, 3);
 616 }
 617
 618 /// Compares each of the corresponding 32-bit float values of the
 619 ///    128-bit vectors of [4 x float] to determine if the values in the first
 620 ///    operand are greater than those in the second operand.
 621 ///
 622 /// \headerfile <x86intrin.h>
 623 ///
 624 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
 625 ///
 626 /// \param __a
 627 ///    A 128-bit vector of [4 x float].
 628 /// \param __b
 629 ///    A 128-bit vector of [4 x float].
 630 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 631 static __inline__ __m128 __DEFAULT_FN_ATTRS
 632 _mm_cmpgt_ps(__m128 __a, __m128 __b)
 633 {
 634   return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
 635 }
 636
 637 /// Compares two 32-bit float values in the low-order bits of both
 638 ///    operands to determine if the value in the first operand is greater than
 639 ///    or equal to the corresponding value in the second operand and returns
 640 ///    the result of the comparison in the low-order bits of a vector of
 641 ///    [4 x float].
 642 ///
 643 /// \headerfile <x86intrin.h>
 644 ///
 645 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
 646 ///
 647 /// \param __a
 648 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 649 ///    32 bits of this operand are used in the comparison.
 650 /// \param __b
 651 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 652 ///    32 bits of this operand are used in the comparison.
 653 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 654 ///    in the low-order bits.
 655 static __inline__ __m128 __DEFAULT_FN_ATTRS
 656 _mm_cmpge_ss(__m128 __a, __m128 __b)
 657 {
 658   return (__m128)__builtin_shufflevector((__v4sf)__a,
 659                                          (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
 660                                          4, 1, 2, 3);
 661 }
 662
 663 /// Compares each of the corresponding 32-bit float values of the
 664 ///    128-bit vectors of [4 x float] to determine if the values in the first
 665 ///    operand are greater than or equal to those in the second operand.
 666 ///
 667 /// \headerfile <x86intrin.h>
 668 ///
 669 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
 670 ///
 671 /// \param __a
 672 ///    A 128-bit vector of [4 x float].
 673 /// \param __b
 674 ///    A 128-bit vector of [4 x float].
 675 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 676 static __inline__ __m128 __DEFAULT_FN_ATTRS
 677 _mm_cmpge_ps(__m128 __a, __m128 __b)
 678 {
 679   return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
 680 }
 681
 682 /// Compares two 32-bit float values in the low-order bits of both
 683 ///    operands for inequality and returns the result of the comparison in the
 684 ///    low-order bits of a vector of [4 x float].
 685 ///
 686 /// \headerfile <x86intrin.h>
 687 ///
 688 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
 689 ///   instructions.
 690 ///
 691 /// \param __a
 692 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 693 ///    32 bits of this operand are used in the comparison.
 694 /// \param __b
 695 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 696 ///    32 bits of this operand are used in the comparison.
 697 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 698 ///    in the low-order bits.
 699 static __inline__ __m128 __DEFAULT_FN_ATTRS
 700 _mm_cmpneq_ss(__m128 __a, __m128 __b)
 701 {
 702   return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
 703 }
 704
 705 /// Compares each of the corresponding 32-bit float values of the
 706 ///    128-bit vectors of [4 x float] for inequality.
 707 ///
 708 /// \headerfile <x86intrin.h>
 709 ///
 710 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
 711 ///   instructions.
 712 ///
 713 /// \param __a
 714 ///    A 128-bit vector of [4 x float].
 715 /// \param __b
 716 ///    A 128-bit vector of [4 x float].
 717 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 718 static __inline__ __m128 __DEFAULT_FN_ATTRS
 719 _mm_cmpneq_ps(__m128 __a, __m128 __b)
 720 {
 721   return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
 722 }
 723
 724 /// Compares two 32-bit float values in the low-order bits of both
 725 ///    operands to determine if the value in the first operand is not less than
 726 ///    the corresponding value in the second operand and returns the result of
 727 ///    the comparison in the low-order bits of a vector of [4 x float].
 728 ///
 729 /// \headerfile <x86intrin.h>
 730 ///
 731 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
 732 ///   instructions.
 733 ///
 734 /// \param __a
 735 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 736 ///    32 bits of this operand are used in the comparison.
 737 /// \param __b
 738 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 739 ///    32 bits of this operand are used in the comparison.
 740 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 741 ///    in the low-order bits.
 742 static __inline__ __m128 __DEFAULT_FN_ATTRS
 743 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
 744 {
 745   return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
 746 }
 747
 748 /// Compares each of the corresponding 32-bit float values of the
 749 ///    128-bit vectors of [4 x float] to determine if the values in the first
 750 ///    operand are not less than those in the second operand.
 751 ///
 752 /// \headerfile <x86intrin.h>
 753 ///
 754 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
 755 ///   instructions.
 756 ///
 757 /// \param __a
 758 ///    A 128-bit vector of [4 x float].
 759 /// \param __b
 760 ///    A 128-bit vector of [4 x float].
 761 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 762 static __inline__ __m128 __DEFAULT_FN_ATTRS
 763 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
 764 { /* Source-order operands; _mm_cmpngt_ps calls this builtin with them swapped. */
 765   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
 766 }
 767
 768 /// Compares two 32-bit float values in the low-order bits of both
 769 ///    operands to determine if the value in the first operand is not less than
 770 ///    or equal to the corresponding value in the second operand and returns
 771 ///    the result of the comparison in the low-order bits of a vector of
 772 ///    [4 x float].
 773 ///
 774 /// \headerfile <x86intrin.h>
 775 ///
 776 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
 777 ///   instructions.
 778 ///
 779 /// \param __a
 780 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 781 ///    32 bits of this operand are used in the comparison.
 782 /// \param __b
 783 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 784 ///    32 bits of this operand are used in the comparison.
 785 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 786 ///    in the low-order bits.
 787 static __inline__ __m128 __DEFAULT_FN_ATTRS
 788 _mm_cmpnle_ss(__m128 __a, __m128 __b)
 789 { /* Source-order operands; _mm_cmpnge_ss calls this builtin with them swapped. */
 790   return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
 791 }
 792
 793 /// Compares each of the corresponding 32-bit float values of the
 794 ///    128-bit vectors of [4 x float] to determine if the values in the first
 795 ///    operand are not less than or equal to those in the second operand.
 796 ///
 797 /// \headerfile <x86intrin.h>
 798 ///
 799 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
 800 ///   instructions.
 801 ///
 802 /// \param __a
 803 ///    A 128-bit vector of [4 x float].
 804 /// \param __b
 805 ///    A 128-bit vector of [4 x float].
 806 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 807 static __inline__ __m128 __DEFAULT_FN_ATTRS
 808 _mm_cmpnle_ps(__m128 __a, __m128 __b)
 809 { /* Source-order operands; _mm_cmpnge_ps calls this builtin with them swapped. */
 810   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
 811 }
 812
 813 /// Compares two 32-bit float values in the low-order bits of both
 814 ///    operands to determine if the value in the first operand is not greater
 815 ///    than the corresponding value in the second operand and returns the
 816 ///    result of the comparison in the low-order bits of a vector of
 817 ///    [4 x float].
 818 ///
 819 /// \headerfile <x86intrin.h>
 820 ///
 821 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
 822 ///   instructions, applied with the two operands exchanged.
 823 ///
 824 /// \param __a
 825 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 826 ///    32 bits of this operand are used in the comparison.
 827 /// \param __b
 828 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 829 ///    32 bits of this operand are used in the comparison.
 830 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 831 ///    in the low-order bits; the upper 96 bits come from the first operand.
 832 static __inline__ __m128 __DEFAULT_FN_ATTRS
 833 _mm_cmpngt_ss(__m128 __a, __m128 __b)
 834 { /* a-not-greater-than-b is evaluated as b-not-less-than-a, hence (__b, __a). */
 835   return (__m128)__builtin_shufflevector((__v4sf)__a,
 836                                          (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
 837                                          4, 1, 2, 3); /* 4: element 0 of the compare; 1-3: elements of __a */
 838 }
 839
 840 /// Compares each of the corresponding 32-bit float values of the
 841 ///    128-bit vectors of [4 x float] to determine if the values in the first
 842 ///    operand are not greater than those in the second operand.
 843 ///
 844 /// \headerfile <x86intrin.h>
 845 ///
 846 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
 847 ///   instructions, applied with the two operands exchanged.
 848 ///
 849 /// \param __a
 850 ///    A 128-bit vector of [4 x float].
 851 /// \param __b
 852 ///    A 128-bit vector of [4 x float].
 853 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 854 static __inline__ __m128 __DEFAULT_FN_ATTRS
 855 _mm_cmpngt_ps(__m128 __a, __m128 __b)
 856 { /* a-not-greater-than-b is evaluated as b-not-less-than-a, hence (__b, __a). */
 857   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
 858 }
 859
 860 /// Compares two 32-bit float values in the low-order bits of both
 861 ///    operands to determine if the value in the first operand is not greater
 862 ///    than or equal to the corresponding value in the second operand and
 863 ///    returns the result of the comparison in the low-order bits of a vector
 864 ///    of [4 x float].
 865 ///
 866 /// \headerfile <x86intrin.h>
 867 ///
 868 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
 869 ///   instructions, applied with the two operands exchanged.
 870 ///
 871 /// \param __a
 872 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 873 ///    32 bits of this operand are used in the comparison.
 874 /// \param __b
 875 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 876 ///    32 bits of this operand are used in the comparison.
 877 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 878 ///    in the low-order bits; the upper 96 bits come from the first operand.
 879 static __inline__ __m128 __DEFAULT_FN_ATTRS
 880 _mm_cmpnge_ss(__m128 __a, __m128 __b)
 881 { /* a-not-greater-or-equal-b is evaluated as b-not-less-or-equal-a, hence (__b, __a). */
 882   return (__m128)__builtin_shufflevector((__v4sf)__a,
 883                                          (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
 884                                          4, 1, 2, 3); /* 4: element 0 of the compare; 1-3: elements of __a */
 885 }
 886
 887 /// Compares each of the corresponding 32-bit float values of the
 888 ///    128-bit vectors of [4 x float] to determine if the values in the first
 889 ///    operand are not greater than or equal to those in the second operand.
 890 ///
 891 /// \headerfile <x86intrin.h>
 892 ///
 893 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
 894 ///   instructions, applied with the two operands exchanged.
 895 ///
 896 /// \param __a
 897 ///    A 128-bit vector of [4 x float].
 898 /// \param __b
 899 ///    A 128-bit vector of [4 x float].
 900 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 901 static __inline__ __m128 __DEFAULT_FN_ATTRS
 902 _mm_cmpnge_ps(__m128 __a, __m128 __b)
 903 { /* a-not-greater-or-equal-b is evaluated as b-not-less-or-equal-a, hence (__b, __a). */
 904   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
 905 }
 906
 907 /// Compares two 32-bit float values in the low-order bits of both
 908 ///    operands to determine if the value in the first operand is ordered with
 909 ///    respect to the corresponding value in the second operand and returns the
 910 ///    result of the comparison in the low-order bits of a vector of
 911 ///    [4 x float].
 912 ///
 913 /// \headerfile <x86intrin.h>
 914 ///
 915 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
 916 ///   instructions.
 917 ///
 918 /// \param __a
 919 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 920 ///    32 bits of this operand are used in the comparison.
 921 /// \param __b
 922 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 923 ///    32 bits of this operand are used in the comparison.
 924 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 925 ///    in the low-order bits.
 926 static __inline__ __m128 __DEFAULT_FN_ATTRS
 927 _mm_cmpord_ss(__m128 __a, __m128 __b)
 928 { /* Operands in source order; the casts only retype __m128 as __v4sf. */
 929   return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
 930 }
 931
 932 /// Compares each of the corresponding 32-bit float values of the
 933 ///    128-bit vectors of [4 x float] to determine if the values in the first
 934 ///    operand are ordered with respect to those in the second operand.
 935 ///
 936 /// \headerfile <x86intrin.h>
 937 ///
 938 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
 939 ///   instructions.
 940 ///
 941 /// \param __a
 942 ///    A 128-bit vector of [4 x float].
 943 /// \param __b
 944 ///    A 128-bit vector of [4 x float].
 945 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 946 static __inline__ __m128 __DEFAULT_FN_ATTRS
 947 _mm_cmpord_ps(__m128 __a, __m128 __b)
 948 { /* Packed counterpart of _mm_cmpord_ss; operands in source order. */
 949   return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
 950 }
 951
 952 /// Compares two 32-bit float values in the low-order bits of both
 953 ///    operands to determine if the value in the first operand is unordered
 954 ///    with respect to the corresponding value in the second operand and
 955 ///    returns the result of the comparison in the low-order bits of a vector
 956 ///    of [4 x float].
 957 ///
 958 /// \headerfile <x86intrin.h>
 959 ///
 960 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
 961 ///   instructions.
 962 ///
 963 /// \param __a
 964 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 965 ///    32 bits of this operand are used in the comparison.
 966 /// \param __b
 967 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
 968 ///    32 bits of this operand are used in the comparison.
 969 /// \returns A 128-bit vector of [4 x float] containing the comparison results
 970 ///    in the low-order bits.
 971 static __inline__ __m128 __DEFAULT_FN_ATTRS
 972 _mm_cmpunord_ss(__m128 __a, __m128 __b)
 973 { /* Operands in source order; the casts only retype __m128 as __v4sf. */
 974   return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
 975 }
 976
 977 /// Compares each of the corresponding 32-bit float values of the
 978 ///    128-bit vectors of [4 x float] to determine if the values in the first
 979 ///    operand are unordered with respect to those in the second operand.
 980 ///
 981 /// \headerfile <x86intrin.h>
 982 ///
 983 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
 984 ///   instructions.
 985 ///
 986 /// \param __a
 987 ///    A 128-bit vector of [4 x float].
 988 /// \param __b
 989 ///    A 128-bit vector of [4 x float].
 990 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 991 static __inline__ __m128 __DEFAULT_FN_ATTRS
 992 _mm_cmpunord_ps(__m128 __a, __m128 __b)
 993 { /* Packed counterpart of _mm_cmpunord_ss; operands in source order. */
 994   return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
 995 }
 996
 997 /// Compares two 32-bit float values in the low-order bits of both
 998 ///    operands for equality and returns the result of the comparison.
 999 ///
1000 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1001 ///
1002 /// \headerfile <x86intrin.h>
1003 ///
1004 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1005 ///   instructions.
1006 ///
1007 /// \param __a
1008 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1009 ///    used in the comparison.
1010 /// \param __b
1011 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1012 ///    used in the comparison.
1013 /// \returns An integer containing the comparison results. If either of the
1014 ///    two lower 32-bit values is NaN, 0 is returned.
1015 static __inline__ int __DEFAULT_FN_ATTRS
1016 _mm_comieq_ss(__m128 __a, __m128 __b)
1017 { /* Returns the comieq builtin result as-is; _mm_ucomieq_ss uses ucomieq. */
1018   return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1019 }
1020
1021 /// Compares two 32-bit float values in the low-order bits of both
1022 ///    operands to determine if the first operand is less than the second
1023 ///    operand and returns the result of the comparison.
1024 ///
1025 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1026 ///
1027 /// \headerfile <x86intrin.h>
1028 ///
1029 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1030 ///   instructions.
1031 ///
1032 /// \param __a
1033 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1034 ///    used in the comparison.
1035 /// \param __b
1036 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1037 ///    used in the comparison.
1038 /// \returns An integer containing the comparison results. If either of the two
1039 ///     lower 32-bit values is NaN, 0 is returned.
1040 static __inline__ int __DEFAULT_FN_ATTRS
1041 _mm_comilt_ss(__m128 __a, __m128 __b)
1042 { /* Returns the comilt builtin result as-is; _mm_ucomilt_ss uses ucomilt. */
1043   return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1044 }
1045
1046 /// Compares two 32-bit float values in the low-order bits of both
1047 ///    operands to determine if the first operand is less than or equal to the
1048 ///    second operand and returns the result of the comparison.
1049 ///
1050 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1051 ///
1052 /// \headerfile <x86intrin.h>
1053 ///
1054 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1055 ///
1056 /// \param __a
1057 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1058 ///    used in the comparison.
1059 /// \param __b
1060 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1061 ///    used in the comparison.
1062 /// \returns An integer containing the comparison results. If either of the two
1063 ///     lower 32-bit values is NaN, 0 is returned.
1064 static __inline__ int __DEFAULT_FN_ATTRS
1065 _mm_comile_ss(__m128 __a, __m128 __b)
1066 { /* Returns the comile builtin result as-is; _mm_ucomile_ss uses ucomile. */
1067   return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1068 }
1069
1070 /// Compares two 32-bit float values in the low-order bits of both
1071 ///    operands to determine if the first operand is greater than the second
1072 ///    operand and returns the result of the comparison.
1073 ///
1074 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1075 ///
1076 /// \headerfile <x86intrin.h>
1077 ///
1078 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1079 ///
1080 /// \param __a
1081 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1082 ///    used in the comparison.
1083 /// \param __b
1084 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1085 ///    used in the comparison.
1086 /// \returns An integer containing the comparison results. If either of the
1087 ///     two lower 32-bit values is NaN, 0 is returned.
1088 static __inline__ int __DEFAULT_FN_ATTRS
1089 _mm_comigt_ss(__m128 __a, __m128 __b)
1090 { /* Returns the comigt builtin result as-is; _mm_ucomigt_ss uses ucomigt. */
1091   return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1092 }
1093
1094 /// Compares two 32-bit float values in the low-order bits of both
1095 ///    operands to determine if the first operand is greater than or equal to
1096 ///    the second operand and returns the result of the comparison.
1097 ///
1098 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1099 ///
1100 /// \headerfile <x86intrin.h>
1101 ///
1102 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1103 ///
1104 /// \param __a
1105 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1106 ///    used in the comparison.
1107 /// \param __b
1108 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1109 ///    used in the comparison.
1110 /// \returns An integer containing the comparison results. If either of the two
1111 ///    lower 32-bit values is NaN, 0 is returned.
1112 static __inline__ int __DEFAULT_FN_ATTRS
1113 _mm_comige_ss(__m128 __a, __m128 __b)
1114 { /* Returns the comige builtin result as-is; _mm_ucomige_ss uses ucomige. */
1115   return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1116 }
1117
1118 /// Compares two 32-bit float values in the low-order bits of both
1119 ///    operands to determine if the first operand is not equal to the second
1120 ///    operand and returns the result of the comparison.
1121 ///
1122 ///    If either of the two lower 32-bit values is NaN, 1 is returned.
1123 ///
1124 /// \headerfile <x86intrin.h>
1125 ///
1126 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1127 ///
1128 /// \param __a
1129 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1130 ///    used in the comparison.
1131 /// \param __b
1132 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1133 ///    used in the comparison.
1134 /// \returns An integer containing the comparison results. If either of the
1135 ///     two lower 32-bit values is NaN, 1 is returned.
1136 static __inline__ int __DEFAULT_FN_ATTRS
1137 _mm_comineq_ss(__m128 __a, __m128 __b)
1138 { /* Returns the comineq builtin result as-is; _mm_ucomineq_ss uses ucomineq. */
1139   return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1140 }
1141
1142 /// Performs an unordered comparison of two 32-bit float values using
1143 ///    the low-order bits of both operands to determine equality and returns
1144 ///    the result of the comparison.
1145 ///
1146 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1147 ///
1148 /// \headerfile <x86intrin.h>
1149 ///
1150 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1151 ///
1152 /// \param __a
1153 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1154 ///    used in the comparison.
1155 /// \param __b
1156 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1157 ///    used in the comparison.
1158 /// \returns An integer containing the comparison results. If either of the two
1159 ///     lower 32-bit values is NaN, 0 is returned.
1160 static __inline__ int __DEFAULT_FN_ATTRS
1161 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1162 { /* Returns the ucomieq builtin result as-is; _mm_comieq_ss uses comieq. */
1163   return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1164 }
1165
1166 /// Performs an unordered comparison of two 32-bit float values using
1167 ///    the low-order bits of both operands to determine if the first operand is
1168 ///    less than the second operand and returns the result of the comparison.
1169 ///
1170 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1171 ///
1172 /// \headerfile <x86intrin.h>
1173 ///
1174 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1175 ///
1176 /// \param __a
1177 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1178 ///    used in the comparison.
1179 /// \param __b
1180 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1181 ///    used in the comparison.
1182 /// \returns An integer containing the comparison results. If either of the two
1183 ///    lower 32-bit values is NaN, 0 is returned.
1184 static __inline__ int __DEFAULT_FN_ATTRS
1185 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1186 { /* Returns the ucomilt builtin result as-is; _mm_comilt_ss uses comilt. */
1187   return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1188 }
1189
1190 /// Performs an unordered comparison of two 32-bit float values using
1191 ///    the low-order bits of both operands to determine if the first operand is
1192 ///    less than or equal to the second operand and returns the result of the
1193 ///    comparison.
1194 ///
1195 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1196 ///
1197 /// \headerfile <x86intrin.h>
1198 ///
1199 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1200 ///
1201 /// \param __a
1202 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1203 ///    used in the comparison.
1204 /// \param __b
1205 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1206 ///    used in the comparison.
1207 /// \returns An integer containing the comparison results. If either of the two
1208 ///     lower 32-bit values is NaN, 0 is returned.
1209 static __inline__ int __DEFAULT_FN_ATTRS
1210 _mm_ucomile_ss(__m128 __a, __m128 __b)
1211 { /* Returns the ucomile builtin result as-is; _mm_comile_ss uses comile. */
1212   return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1213 }
1214
1215 /// Performs an unordered comparison of two 32-bit float values using
1216 ///    the low-order bits of both operands to determine if the first operand is
1217 ///    greater than the second operand and returns the result of the
1218 ///    comparison.
1219 ///
1220 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1221 ///
1222 /// \headerfile <x86intrin.h>
1223 ///
1224 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1225 ///
1226 /// \param __a
1227 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1228 ///    used in the comparison.
1229 /// \param __b
1230 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1231 ///    used in the comparison.
1232 /// \returns An integer containing the comparison results. If either of the two
1233 ///     lower 32-bit values is NaN, 0 is returned.
1234 static __inline__ int __DEFAULT_FN_ATTRS
1235 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1236 { /* Returns the ucomigt builtin result as-is; _mm_comigt_ss uses comigt. */
1237   return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1238 }
1239
1240 /// Performs an unordered comparison of two 32-bit float values using
1241 ///    the low-order bits of both operands to determine if the first operand is
1242 ///    greater than or equal to the second operand and returns the result of
1243 ///    the comparison.
1244 ///
1245 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1246 ///
1247 /// \headerfile <x86intrin.h>
1248 ///
1249 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1250 ///
1251 /// \param __a
1252 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1253 ///    used in the comparison.
1254 /// \param __b
1255 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1256 ///    used in the comparison.
1257 /// \returns An integer containing the comparison results. If either of the two
1258 ///     lower 32-bit values is NaN, 0 is returned.
1259 static __inline__ int __DEFAULT_FN_ATTRS
1260 _mm_ucomige_ss(__m128 __a, __m128 __b)
1261 { /* Returns the ucomige builtin result as-is; _mm_comige_ss uses comige. */
1262   return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1263 }
1264
1265 /// Performs an unordered comparison of two 32-bit float values using
1266 ///    the low-order bits of both operands to determine inequality and returns
1267 ///    the result of the comparison.
1268 ///
1269 ///    If either of the two lower 32-bit values is NaN, 1 is returned.
1270 ///
1271 /// \headerfile <x86intrin.h>
1272 ///
1273 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1274 ///
1275 /// \param __a
1276 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1277 ///    used in the comparison.
1278 /// \param __b
1279 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1280 ///    used in the comparison.
1281 /// \returns An integer containing the comparison results. If either of the two
1282 ///    lower 32-bit values is NaN, 1 is returned.
1283 static __inline__ int __DEFAULT_FN_ATTRS
1284 _mm_ucomineq_ss(__m128 __a, __m128 __b)
1285 { /* Returns the ucomineq builtin result as-is; _mm_comineq_ss uses comineq. */
1286   return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1287 }
1288
1289 /// Converts a float value contained in the lower 32 bits of a vector of
1290 ///    [4 x float] into a 32-bit integer.
1291 ///
1292 /// \headerfile <x86intrin.h>
1293 ///
1294 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1295 ///   instructions.
1296 ///
1297 /// \param __a
1298 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1299 ///    used in the conversion.
1300 /// \returns A 32-bit integer containing the converted value.
1301 static __inline__ int __DEFAULT_FN_ATTRS
1302 _mm_cvtss_si32(__m128 __a)
1303 { /* Primary spelling; _mm_cvt_ss2si forwards to this function. */
1304   return __builtin_ia32_cvtss2si((__v4sf)__a);
1305 }
1306
1307 /// Converts a float value contained in the lower 32 bits of a vector of
1308 ///    [4 x float] into a 32-bit integer.
1309 ///
1310 /// \headerfile <x86intrin.h>
1311 ///
1312 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1313 ///   instructions.
1314 ///
1315 /// \param __a
1316 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1317 ///    used in the conversion.
1318 /// \returns A 32-bit integer containing the converted value.
1319 static __inline__ int __DEFAULT_FN_ATTRS
1320 _mm_cvt_ss2si(__m128 __a)
1321 { /* Alternate spelling: forwards __a unchanged to _mm_cvtss_si32. */
1322   return _mm_cvtss_si32(__a);
1323 }
1324
1325 #ifdef __x86_64__
1326
1327 /// Converts a float value contained in the lower 32 bits of a vector of
1328 ///    [4 x float] into a 64-bit integer.
1329 ///
1330 /// \headerfile <x86intrin.h>
1331 ///
1332 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1333 ///   instructions.
1334 ///
1335 /// \param __a
1336 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1337 ///    used in the conversion.
1338 /// \returns A 64-bit integer containing the converted value.
1339 static __inline__ long long __DEFAULT_FN_ATTRS
1340 _mm_cvtss_si64(__m128 __a)
1341 { /* long long result; this definition sits inside #ifdef __x86_64__. */
1342   return __builtin_ia32_cvtss2si64((__v4sf)__a);
1343 }
1344
1345 #endif
1346
1347 /// Converts two low-order float values in a 128-bit vector of
1348 ///    [4 x float] into a 64-bit vector of [2 x i32].
1349 ///
1350 /// \headerfile <x86intrin.h>
1351 ///
1352 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1353 ///
1354 /// \param __a
1355 ///    A 128-bit vector of [4 x float].
1356 /// \returns A 64-bit integer vector containing the converted values.
1357 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1358 _mm_cvtps_pi32(__m128 __a)
1359 { /* Primary spelling; _mm_cvt_ps2pi forwards here. Result is cast to __m64. */
1360   return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1361 }
1362
1363 /// Converts two low-order float values in a 128-bit vector of
1364 ///    [4 x float] into a 64-bit vector of [2 x i32].
1365 ///
1366 /// \headerfile <x86intrin.h>
1367 ///
1368 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1369 ///
1370 /// \param __a
1371 ///    A 128-bit vector of [4 x float].
1372 /// \returns A 64-bit integer vector containing the converted values.
1373 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1374 _mm_cvt_ps2pi(__m128 __a)
1375 { /* Alternate spelling: forwards __a unchanged to _mm_cvtps_pi32. */
1376   return _mm_cvtps_pi32(__a);
1377 }
1378
1379 /// Converts a float value contained in the lower 32 bits of a vector of
1380 ///    [4 x float] into a 32-bit integer, truncating the result when it is
1381 ///    inexact.
1382 ///
1383 /// \headerfile <x86intrin.h>
1384 ///
1385 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1386 ///   instructions.
1387 ///
1388 /// \param __a
1389 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1390 ///    used in the conversion.
1391 /// \returns A 32-bit integer containing the converted value.
1392 static __inline__ int __DEFAULT_FN_ATTRS
1393 _mm_cvttss_si32(__m128 __a)
1394 { /* Primary spelling; _mm_cvtt_ss2si forwards to this function. */
1395   return __builtin_ia32_cvttss2si((__v4sf)__a);
1396 }
1397
1398 /// Converts a float value contained in the lower 32 bits of a vector of
1399 ///    [4 x float] into a 32-bit integer, truncating the result when it is
1400 ///    inexact.
1401 ///
1402 /// \headerfile <x86intrin.h>
1403 ///
1404 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1405 ///   instructions.
1406 ///
1407 /// \param __a
1408 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1409 ///    used in the conversion.
1410 /// \returns A 32-bit integer containing the converted value.
1411 static __inline__ int __DEFAULT_FN_ATTRS
1412 _mm_cvtt_ss2si(__m128 __a)
1413 { /* Alternate spelling: forwards __a unchanged to _mm_cvttss_si32. */
1414   return _mm_cvttss_si32(__a);
1415 }
1416
1417 #ifdef __x86_64__
1418 /// Converts a float value contained in the lower 32 bits of a vector of
1419 ///    [4 x float] into a 64-bit integer, truncating the result when it is
1420 ///    inexact.
1421 ///
1422 /// \headerfile <x86intrin.h>
1423 ///
1424 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1425 ///   instructions.
1426 ///
1427 /// \param __a
1428 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1429 ///    used in the conversion.
1430 /// \returns A 64-bit integer containing the converted value.
1431 static __inline__ long long __DEFAULT_FN_ATTRS
1432 _mm_cvttss_si64(__m128 __a)
1433 { /* long long result; this definition sits inside #ifdef __x86_64__. */
1434   return __builtin_ia32_cvttss2si64((__v4sf)__a);
1435 }
1436 #endif
1437
1438 /// Converts two low-order float values in a 128-bit vector of
1439 ///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1440 ///    when it is inexact.
1441 ///
1442 /// \headerfile <x86intrin.h>
1443 ///
1444 /// This intrinsic corresponds to the <c> CVTTPS2PI </c>
1445 ///   instruction.
1446 ///
1447 /// \param __a
1448 ///    A 128-bit vector of [4 x float].
1449 /// \returns A 64-bit integer vector containing the converted values.
1450 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1451 _mm_cvttps_pi32(__m128 __a)
1452 { /* Primary spelling; _mm_cvtt_ps2pi forwards here. Result is cast to __m64. */
1453   return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1454 }
1455
1456 /// Converts two low-order float values in a 128-bit vector of [4 x
1457 ///    float] into a 64-bit vector of [2 x i32], truncating the result when it
1458 ///    is inexact.
1459 ///
1460 /// \headerfile <x86intrin.h>
1461 ///
1462 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1463 ///
1464 /// \param __a
1465 ///    A 128-bit vector of [4 x float].
1466 /// \returns A 64-bit integer vector containing the converted values.
1467 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1468 _mm_cvtt_ps2pi(__m128 __a)
1469 { /* Alternate spelling: forwards __a unchanged to _mm_cvttps_pi32. */
1470   return _mm_cvttps_pi32(__a);
1471 }
1472
1473 /// Converts a 32-bit signed integer value into a floating point value
1474 ///    and writes it to the lower 32 bits of the destination. The remaining
1475 ///    higher order elements of the destination vector are copied from the
1476 ///    corresponding elements in the first operand.
1477 ///
1478 /// \headerfile <x86intrin.h>
1479 ///
1480 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instructions.
1481 ///
1482 /// \param __a
1483 ///    A 128-bit vector of [4 x float].
1484 /// \param __b
1485 ///    A 32-bit signed integer operand containing the value to be converted.
1486 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1487 ///    converted value of the second operand. The upper 96 bits are copied from
1488 ///    the upper 96 bits of the first operand.
1489 static __inline__ __m128 __DEFAULT_FN_ATTRS
1490 _mm_cvtsi32_ss(__m128 __a, int __b)
1491 { /* __a is a by-value copy, so the vector held by the caller is not modified. */
1492   __a[0] = __b; /* int converts to float on assignment into element 0 */
1493   return __a;   /* elements 1-3 are returned as received */
1494 }
1495
1496 /// Converts a 32-bit signed integer value into a floating point value
1497 ///    and writes it to the lower 32 bits of the destination. The remaining
1498 ///    higher order elements of the destination are copied from the
1499 ///    corresponding elements in the first operand.
1500 ///
1501 /// \headerfile <x86intrin.h>
1502 ///
1503 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instructions.
1504 ///
1505 /// \param __a
1506 ///    A 128-bit vector of [4 x float].
1507 /// \param __b
1508 ///    A 32-bit signed integer operand containing the value to be converted.
1509 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1510 ///    converted value of the second operand. The upper 96 bits are copied from
1511 ///    the upper 96 bits of the first operand.
1512 static __inline__ __m128 __DEFAULT_FN_ATTRS
1513 _mm_cvt_si2ss(__m128 __a, int __b)
1514 { /* Alternate spelling: forwards both arguments unchanged to _mm_cvtsi32_ss. */
1515   return _mm_cvtsi32_ss(__a, __b);
1516 }
1517
1518 #ifdef __x86_64__
1519
1520 /// Converts a 64-bit signed integer value into a floating point value
1521 ///    and writes it to the lower 32 bits of the destination. The remaining
1522 ///    higher order elements of the destination are copied from the
1523 ///    corresponding elements in the first operand.
1524 ///
1525 /// \headerfile <x86intrin.h>
1526 ///
1527 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instructions.
1528 ///
1529 /// \param __a
1530 ///    A 128-bit vector of [4 x float].
1531 /// \param __b
1532 ///    A 64-bit signed integer operand containing the value to be converted.
1533 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1534 ///    converted value of the second operand. The upper 96 bits are copied from
1535 ///    the upper 96 bits of the first operand.
1536 static __inline__ __m128 __DEFAULT_FN_ATTRS
1537 _mm_cvtsi64_ss(__m128 __a, long long __b)
1538 { /* __a is a by-value copy; this definition sits inside #ifdef __x86_64__. */
1539   __a[0] = __b; /* long long converts to float on assignment into element 0 */
1540   return __a;   /* elements 1-3 are returned as received */
1541 }
1542
1543 #endif
1544
1545 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1546 ///    floating point values and writes them to the lower 64-bits of the
1547 ///    destination. The remaining higher order elements of the destination are
1548 ///    copied from the corresponding elements in the first operand.
1549 ///
1550 /// \headerfile <x86intrin.h>
1551 ///
1552 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1553 ///
1554 /// \param __a
1555 ///    A 128-bit vector of [4 x float].
1556 /// \param __b
1557 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1558 ///    and written to the corresponding low-order elements in the destination.
1559 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1560 ///    converted value of the second operand. The upper 64 bits are copied from
1561 ///    the upper 64 bits of the first operand.
1562 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1563 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
1564 {
1565   return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1566 }
1567
1568 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1569 ///    floating point values and writes them to the lower 64-bits of the
1570 ///    destination. The remaining higher order elements of the destination are
1571 ///    copied from the corresponding elements in the first operand.
1572 ///
1573 /// \headerfile <x86intrin.h>
1574 ///
1575 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1576 ///
1577 /// \param __a
1578 ///    A 128-bit vector of [4 x float].
1579 /// \param __b
1580 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1581 ///    and written to the corresponding low-order elements in the destination.
1582 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1583 ///    converted value from the second operand. The upper 64 bits are copied
1584 ///    from the upper 64 bits of the first operand.
1585 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1586 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
1587 {
1588   return _mm_cvtpi32_ps(__a, __b);
1589 }
1590
1591 /// Extracts a float value contained in the lower 32 bits of a vector of
1592 ///    [4 x float].
1593 ///
1594 /// \headerfile <x86intrin.h>
1595 ///
1596 /// This intrinsic has no corresponding instruction.
1597 ///
1598 /// \param __a
1599 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1600 ///    used in the extraction.
1601 /// \returns A 32-bit float containing the extracted value.
1602 static __inline__ float __DEFAULT_FN_ATTRS
1603 _mm_cvtss_f32(__m128 __a)
1604 {
1605   return __a[0];
1606 }
1607
1608 /// Loads two packed float values from the address \a __p into the
1609 ///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1610 ///     are copied from the low-order bits of the first operand.
1611 ///
1612 /// \headerfile <x86intrin.h>
1613 ///
1614 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1615 ///
1616 /// \param __a
1617 ///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1618 ///    of the destination.
1619 /// \param __p
1620 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1621 ///    [127:64] of the destination.
1622 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1623 static __inline__ __m128 __DEFAULT_FN_ATTRS
1624 _mm_loadh_pi(__m128 __a, const __m64 *__p)
1625 {
1626   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1627   struct __mm_loadh_pi_struct {
1628     __mm_loadh_pi_v2f32 __u;
1629   } __attribute__((__packed__, __may_alias__));
1630   __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1631   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1632   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1633 }
1634
1635 /// Loads two packed float values from the address \a __p into the
1636 ///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1637 ///    are copied from the high-order bits of the first operand.
1638 ///
1639 /// \headerfile <x86intrin.h>
1640 ///
1641 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1642 ///
1643 /// \param __a
1644 ///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1645 ///    [127:64] of the destination.
1646 /// \param __p
1647 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1648 ///    [63:0] of the destination.
1649 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1650 static __inline__ __m128 __DEFAULT_FN_ATTRS
1651 _mm_loadl_pi(__m128 __a, const __m64 *__p)
1652 {
1653   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1654   struct __mm_loadl_pi_struct {
1655     __mm_loadl_pi_v2f32 __u;
1656   } __attribute__((__packed__, __may_alias__));
1657   __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1658   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1659   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1660 }
1661
1662 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1663 ///    32 bits of the vector are initialized with the single-precision
1664 ///    floating-point value loaded from a specified memory location. The upper
1665 ///    96 bits are set to zero.
1666 ///
1667 /// \headerfile <x86intrin.h>
1668 ///
1669 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1670 ///
1671 /// \param __p
1672 ///    A pointer to a 32-bit memory location containing a single-precision
1673 ///    floating-point value.
1674 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1675 ///    lower 32 bits contain the value loaded from the memory location. The
1676 ///    upper 96 bits are set to zero.
1677 static __inline__ __m128 __DEFAULT_FN_ATTRS
1678 _mm_load_ss(const float *__p)
1679 {
1680   struct __mm_load_ss_struct {
1681     float __u;
1682   } __attribute__((__packed__, __may_alias__));
1683   float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1684   return __extension__ (__m128){ __u, 0, 0, 0 };
1685 }
1686
1687 /// Loads a 32-bit float value and duplicates it to all four vector
1688 ///    elements of a 128-bit vector of [4 x float].
1689 ///
1690 /// \headerfile <x86intrin.h>
1691 ///
1692 /// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1693 ///    instruction.
1694 ///
1695 /// \param __p
1696 ///    A pointer to a float value to be loaded and duplicated.
1697 /// \returns A 128-bit vector of [4 x float] containing the loaded and
1698 ///    duplicated values.
1699 static __inline__ __m128 __DEFAULT_FN_ATTRS
1700 _mm_load1_ps(const float *__p)
1701 {
1702   struct __mm_load1_ps_struct {
1703     float __u;
1704   } __attribute__((__packed__, __may_alias__));
1705   float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1706   return __extension__ (__m128){ __u, __u, __u, __u };
1707 }
1708
/* Alternate spelling of _mm_load1_ps(); the argument is forwarded unchanged. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
1710
1711 /// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1712 ///    memory location.
1713 ///
1714 /// \headerfile <x86intrin.h>
1715 ///
1716 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1717 ///
1718 /// \param __p
1719 ///    A pointer to a 128-bit memory location. The address of the memory
1720 ///    location has to be 128-bit aligned.
1721 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1722 static __inline__ __m128 __DEFAULT_FN_ATTRS
1723 _mm_load_ps(const float *__p)
1724 {
1725   return *(const __m128*)__p;
1726 }
1727
1728 /// Loads a 128-bit floating-point vector of [4 x float] from an
1729 ///    unaligned memory location.
1730 ///
1731 /// \headerfile <x86intrin.h>
1732 ///
1733 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1734 ///
1735 /// \param __p
1736 ///    A pointer to a 128-bit memory location. The address of the memory
1737 ///    location does not have to be aligned.
1738 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1739 static __inline__ __m128 __DEFAULT_FN_ATTRS
1740 _mm_loadu_ps(const float *__p)
1741 {
1742   struct __loadu_ps {
1743     __m128_u __v;
1744   } __attribute__((__packed__, __may_alias__));
1745   return ((const struct __loadu_ps*)__p)->__v;
1746 }
1747
1748 /// Loads four packed float values, in reverse order, from an aligned
1749 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1750 ///
1751 /// \headerfile <x86intrin.h>
1752 ///
1753 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1754 ///    instruction.
1755 ///
1756 /// \param __p
1757 ///    A pointer to a 128-bit memory location. The address of the memory
1758 ///    location has to be 128-bit aligned.
1759 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1760 ///    in reverse order.
1761 static __inline__ __m128 __DEFAULT_FN_ATTRS
1762 _mm_loadr_ps(const float *__p)
1763 {
1764   __m128 __a = _mm_load_ps(__p);
1765   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1766 }
1767
1768 /// Create a 128-bit vector of [4 x float] with undefined values.
1769 ///
1770 /// \headerfile <x86intrin.h>
1771 ///
1772 /// This intrinsic has no corresponding instruction.
1773 ///
1774 /// \returns A 128-bit vector of [4 x float] containing undefined values.
1775 static __inline__ __m128 __DEFAULT_FN_ATTRS
1776 _mm_undefined_ps(void)
1777 {
1778   return (__m128)__builtin_ia32_undef128();
1779 }
1780
1781 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1782 ///    32 bits of the vector are initialized with the specified single-precision
1783 ///    floating-point value. The upper 96 bits are set to zero.
1784 ///
1785 /// \headerfile <x86intrin.h>
1786 ///
1787 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1788 ///
1789 /// \param __w
1790 ///    A single-precision floating-point value used to initialize the lower 32
1791 ///    bits of the result.
1792 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1793 ///    lower 32 bits contain the value provided in the source operand. The
1794 ///    upper 96 bits are set to zero.
1795 static __inline__ __m128 __DEFAULT_FN_ATTRS
1796 _mm_set_ss(float __w)
1797 {
1798   return __extension__ (__m128){ __w, 0, 0, 0 };
1799 }
1800
1801 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1802 ///    of the four single-precision floating-point vector elements set to the
1803 ///    specified single-precision floating-point value.
1804 ///
1805 /// \headerfile <x86intrin.h>
1806 ///
1807 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1808 ///
1809 /// \param __w
1810 ///    A single-precision floating-point value used to initialize each vector
1811 ///    element of the result.
1812 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1813 static __inline__ __m128 __DEFAULT_FN_ATTRS
1814 _mm_set1_ps(float __w)
1815 {
1816   return __extension__ (__m128){ __w, __w, __w, __w };
1817 }
1818
1819 /* Microsoft specific. */
1820 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1821 ///    of the four single-precision floating-point vector elements set to the
1822 ///    specified single-precision floating-point value.
1823 ///
1824 /// \headerfile <x86intrin.h>
1825 ///
1826 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1827 ///
1828 /// \param __w
1829 ///    A single-precision floating-point value used to initialize each vector
1830 ///    element of the result.
1831 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1832 static __inline__ __m128 __DEFAULT_FN_ATTRS
1833 _mm_set_ps1(float __w)
1834 {
1835     return _mm_set1_ps(__w);
1836 }
1837
1838 /// Constructs a 128-bit floating-point vector of [4 x float]
1839 ///    initialized with the specified single-precision floating-point values.
1840 ///
1841 /// \headerfile <x86intrin.h>
1842 ///
1843 /// This intrinsic is a utility function and does not correspond to a specific
1844 ///    instruction.
1845 ///
1846 /// \param __z
1847 ///    A single-precision floating-point value used to initialize bits [127:96]
1848 ///    of the result.
1849 /// \param __y
1850 ///    A single-precision floating-point value used to initialize bits [95:64]
1851 ///    of the result.
1852 /// \param __x
1853 ///    A single-precision floating-point value used to initialize bits [63:32]
1854 ///    of the result.
1855 /// \param __w
1856 ///    A single-precision floating-point value used to initialize bits [31:0]
1857 ///    of the result.
1858 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1859 static __inline__ __m128 __DEFAULT_FN_ATTRS
1860 _mm_set_ps(float __z, float __y, float __x, float __w)
1861 {
1862   return __extension__ (__m128){ __w, __x, __y, __z };
1863 }
1864
1865 /// Constructs a 128-bit floating-point vector of [4 x float],
1866 ///    initialized in reverse order with the specified 32-bit single-precision
1867 ///    float-point values.
1868 ///
1869 /// \headerfile <x86intrin.h>
1870 ///
1871 /// This intrinsic is a utility function and does not correspond to a specific
1872 ///    instruction.
1873 ///
1874 /// \param __z
1875 ///    A single-precision floating-point value used to initialize bits [31:0]
1876 ///    of the result.
1877 /// \param __y
1878 ///    A single-precision floating-point value used to initialize bits [63:32]
1879 ///    of the result.
1880 /// \param __x
1881 ///    A single-precision floating-point value used to initialize bits [95:64]
1882 ///    of the result.
1883 /// \param __w
1884 ///    A single-precision floating-point value used to initialize bits [127:96]
1885 ///    of the result.
1886 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1887 static __inline__ __m128 __DEFAULT_FN_ATTRS
1888 _mm_setr_ps(float __z, float __y, float __x, float __w)
1889 {
1890   return __extension__ (__m128){ __z, __y, __x, __w };
1891 }
1892
1893 /// Constructs a 128-bit floating-point vector of [4 x float] initialized
1894 ///    to zero.
1895 ///
1896 /// \headerfile <x86intrin.h>
1897 ///
1898 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1899 ///
1900 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
1901 ///    all elements set to zero.
1902 static __inline__ __m128 __DEFAULT_FN_ATTRS
1903 _mm_setzero_ps(void)
1904 {
1905   return __extension__ (__m128){ 0, 0, 0, 0 };
1906 }
1907
1908 /// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1909 ///    memory location.
1910 ///
1911 /// \headerfile <x86intrin.h>
1912 ///
1913 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
1914 ///
1915 /// \param __p
1916 ///    A pointer to a 64-bit memory location.
1917 /// \param __a
1918 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1919 static __inline__ void __DEFAULT_FN_ATTRS
1920 _mm_storeh_pi(__m64 *__p, __m128 __a)
1921 {
1922   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1923   struct __mm_storeh_pi_struct {
1924     __mm_storeh_pi_v2f32 __u;
1925   } __attribute__((__packed__, __may_alias__));
1926   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
1927 }
1928
1929 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1930 ///     memory location.
1931 ///
1932 /// \headerfile <x86intrin.h>
1933 ///
1934 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1935 ///
1936 /// \param __p
1937 ///    A pointer to a memory location that will receive the float values.
1938 /// \param __a
1939 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1940 static __inline__ void __DEFAULT_FN_ATTRS
1941 _mm_storel_pi(__m64 *__p, __m128 __a)
1942 {
1943   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1944   struct __mm_storeh_pi_struct {
1945     __mm_storeh_pi_v2f32 __u;
1946   } __attribute__((__packed__, __may_alias__));
1947   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
1948 }
1949
1950 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1951 ///     memory location.
1952 ///
1953 /// \headerfile <x86intrin.h>
1954 ///
1955 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1956 ///
1957 /// \param __p
1958 ///    A pointer to a 32-bit memory location.
1959 /// \param __a
1960 ///    A 128-bit vector of [4 x float] containing the value to be stored.
1961 static __inline__ void __DEFAULT_FN_ATTRS
1962 _mm_store_ss(float *__p, __m128 __a)
1963 {
1964   struct __mm_store_ss_struct {
1965     float __u;
1966   } __attribute__((__packed__, __may_alias__));
1967   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1968 }
1969
1970 /// Stores a 128-bit vector of [4 x float] to an unaligned memory
1971 ///    location.
1972 ///
1973 /// \headerfile <x86intrin.h>
1974 ///
1975 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1976 ///
1977 /// \param __p
1978 ///    A pointer to a 128-bit memory location. The address of the memory
1979 ///    location does not have to be aligned.
1980 /// \param __a
1981 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1982 static __inline__ void __DEFAULT_FN_ATTRS
1983 _mm_storeu_ps(float *__p, __m128 __a)
1984 {
1985   struct __storeu_ps {
1986     __m128_u __v;
1987   } __attribute__((__packed__, __may_alias__));
1988   ((struct __storeu_ps*)__p)->__v = __a;
1989 }
1990
1991 /// Stores a 128-bit vector of [4 x float] into an aligned memory
1992 ///    location.
1993 ///
1994 /// \headerfile <x86intrin.h>
1995 ///
1996 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1997 ///
1998 /// \param __p
1999 ///    A pointer to a 128-bit memory location. The address of the memory
2000 ///    location has to be 16-byte aligned.
2001 /// \param __a
2002 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2003 static __inline__ void __DEFAULT_FN_ATTRS
2004 _mm_store_ps(float *__p, __m128 __a)
2005 {
2006   *(__m128*)__p = __a;
2007 }
2008
2009 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2010 ///    four contiguous elements in an aligned memory location.
2011 ///
2012 /// \headerfile <x86intrin.h>
2013 ///
2014 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2015 ///    instruction.
2016 ///
2017 /// \param __p
2018 ///    A pointer to a 128-bit memory location.
2019 /// \param __a
2020 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2021 ///    of the four contiguous elements pointed by \a __p.
2022 static __inline__ void __DEFAULT_FN_ATTRS
2023 _mm_store1_ps(float *__p, __m128 __a)
2024 {
2025   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2026   _mm_store_ps(__p, __a);
2027 }
2028
2029 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2030 ///    four contiguous elements in an aligned memory location.
2031 ///
2032 /// \headerfile <x86intrin.h>
2033 ///
2034 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2035 ///    instruction.
2036 ///
2037 /// \param __p
2038 ///    A pointer to a 128-bit memory location.
2039 /// \param __a
2040 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2041 ///    of the four contiguous elements pointed by \a __p.
2042 static __inline__ void __DEFAULT_FN_ATTRS
2043 _mm_store_ps1(float *__p, __m128 __a)
2044 {
2045   _mm_store1_ps(__p, __a);
2046 }
2047
2048 /// Stores float values from a 128-bit vector of [4 x float] to an
2049 ///    aligned memory location in reverse order.
2050 ///
2051 /// \headerfile <x86intrin.h>
2052 ///
2053 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2054 ///    instruction.
2055 ///
2056 /// \param __p
2057 ///    A pointer to a 128-bit memory location. The address of the memory
2058 ///    location has to be 128-bit aligned.
2059 /// \param __a
2060 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2061 static __inline__ void __DEFAULT_FN_ATTRS
2062 _mm_storer_ps(float *__p, __m128 __a)
2063 {
2064   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2065   _mm_store_ps(__p, __a);
2066 }
2067
/* Hint selectors for _mm_prefetch(), in ascending numeric order. The macro
   decodes a selector as: bits [1:0] -> third argument of __builtin_prefetch,
   bit 2 -> second argument of __builtin_prefetch. */
#define _MM_HINT_NTA  0
#define _MM_HINT_T2   1
#define _MM_HINT_T1   2
#define _MM_HINT_T0   3
#define _MM_HINT_ET1  6
#define _MM_HINT_ET0  7
2074
#ifndef _MSC_VER
/* FIXME: This has to be a macro because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Requests that the cache line containing the specified address be
///    brought to a location closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c>, <c> PREFETCHT0 </c>,
///    <c> PREFETCHT1 </c> or <c> PREFETCHT2 </c> instruction, as selected by
///    \a sel.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated. \n
///    Bit 2 of \a sel is passed as the second argument of __builtin_prefetch
///    and bits [1:0] as the third argument.
#define _mm_prefetch(a, sel) \
  (__builtin_prefetch((const void *)(a), ((sel) >> 2) & 1, (sel) & 3))
#endif
2106
2107 /// Stores a 64-bit integer in the specified aligned memory location. To
2108 ///    minimize caching, the data is flagged as non-temporal (unlikely to be
2109 ///    used again soon).
2110 ///
2111 /// \headerfile <x86intrin.h>
2112 ///
2113 /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2114 ///
2115 /// \param __p
2116 ///    A pointer to an aligned memory location used to store the register value.
2117 /// \param __a
2118 ///    A 64-bit integer containing the value to be stored.
2119 static __inline__ void __DEFAULT_FN_ATTRS_MMX
2120 _mm_stream_pi(__m64 *__p, __m64 __a)
2121 {
2122   __builtin_ia32_movntq(__p, __a);
2123 }
2124
2125 /// Moves packed float values from a 128-bit vector of [4 x float] to a
2126 ///    128-bit aligned memory location. To minimize caching, the data is flagged
2127 ///    as non-temporal (unlikely to be used again soon).
2128 ///
2129 /// \headerfile <x86intrin.h>
2130 ///
2131 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2132 ///
2133 /// \param __p
2134 ///    A pointer to a 128-bit aligned memory location that will receive the
2135 ///    single-precision floating-point values.
2136 /// \param __a
2137 ///    A 128-bit vector of [4 x float] containing the values to be moved.
2138 static __inline__ void __DEFAULT_FN_ATTRS
2139 _mm_stream_ps(float *__p, __m128 __a)
2140 {
2141   __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2142 }
2143
#ifdef __cplusplus
extern "C" {
#endif

/// Enforces strong memory ordering (serialization) between the store
///    instructions that precede this instruction and those that follow it:
///    all earlier stores are completed before any later store executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2162
/// Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it zero-extended, as selected by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns An integer whose bits [15:0] contain the extracted 16 bits of
///    packed data and whose upper bits are zero.
#define _mm_extract_pi16(a, n) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(a), \
                                                    (int)(n)))
2185
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the lower 16 bits of \a d. \n
///    0: Bits [15:0] of the destination are written. \n
///    1: Bits [31:16] of the destination are written. \n
///    2: Bits [47:32] of the destination are written. \n
///    3: Bits [63:48] of the destination are written. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(a), (int)(d), (int)(n)))
2216
2217 /// Compares each of the corresponding packed 16-bit integer values of
2218 ///    the 64-bit integer vectors, and writes the greater value to the
2219 ///    corresponding bits in the destination.
2220 ///
2221 /// \headerfile <x86intrin.h>
2222 ///
2223 /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2224 ///
2225 /// \param __a
2226 ///    A 64-bit integer vector containing one of the source operands.
2227 /// \param __b
2228 ///    A 64-bit integer vector containing one of the source operands.
2229 /// \returns A 64-bit integer vector containing the comparison results.
2230 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2231 _mm_max_pi16(__m64 __a, __m64 __b)
2232 {
2233   return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2234 }
2235
2236 /// Compares each of the corresponding packed 8-bit unsigned integer
2237 ///    values of the 64-bit integer vectors, and writes the greater value to the
2238 ///    corresponding bits in the destination.
2239 ///
2240 /// \headerfile <x86intrin.h>
2241 ///
2242 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2243 ///
2244 /// \param __a
2245 ///    A 64-bit integer vector containing one of the source operands.
2246 /// \param __b
2247 ///    A 64-bit integer vector containing one of the source operands.
2248 /// \returns A 64-bit integer vector containing the comparison results.
2249 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2250 _mm_max_pu8(__m64 __a, __m64 __b)
2251 {
2252   return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2253 }
2254
2255 /// Compares each of the corresponding packed 16-bit integer values of
2256 ///    the 64-bit integer vectors, and writes the lesser value to the
2257 ///    corresponding bits in the destination.
2258 ///
2259 /// \headerfile <x86intrin.h>
2260 ///
2261 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2262 ///
2263 /// \param __a
2264 ///    A 64-bit integer vector containing one of the source operands.
2265 /// \param __b
2266 ///    A 64-bit integer vector containing one of the source operands.
2267 /// \returns A 64-bit integer vector containing the comparison results.
2268 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2269 _mm_min_pi16(__m64 __a, __m64 __b)
2270 {
2271   return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2272 }
2273
2274 /// Compares each of the corresponding packed 8-bit unsigned integer
2275 ///    values of the 64-bit integer vectors, and writes the lesser value to the
2276 ///    corresponding bits in the destination.
2277 ///
2278 /// \headerfile <x86intrin.h>
2279 ///
2280 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2281 ///
2282 /// \param __a
2283 ///    A 64-bit integer vector containing one of the source operands.
2284 /// \param __b
2285 ///    A 64-bit integer vector containing one of the source operands.
2286 /// \returns A 64-bit integer vector containing the comparison results.
2287 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2288 _mm_min_pu8(__m64 __a, __m64 __b)
2289 {
2290   return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2291 }
2292
2293 /// Takes the most significant bit from each 8-bit element in a 64-bit
2294 ///    integer vector to create an 8-bit mask value. Zero-extends the value to
2295 ///    32-bit integer and writes it to the destination.
2296 ///
2297 /// \headerfile <x86intrin.h>
2298 ///
2299 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2300 ///
2301 /// \param __a
2302 ///    A 64-bit integer vector containing the values with bits to be extracted.
2303 /// \returns The most significant bit from each 8-bit element in \a __a,
2304 ///    written to bits [7:0].
2305 static __inline__ int __DEFAULT_FN_ATTRS_MMX
2306 _mm_movemask_pi8(__m64 __a)
2307 {
2308   return __builtin_ia32_pmovmskb((__v8qi)__a);
2309 }
2310
2311 /// Multiplies packed 16-bit unsigned integer values and writes the
2312 ///    high-order 16 bits of each 32-bit product to the corresponding bits in
2313 ///    the destination.
2314 ///
2315 /// \headerfile <x86intrin.h>
2316 ///
2317 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2318 ///
2319 /// \param __a
2320 ///    A 64-bit integer vector containing one of the source operands.
2321 /// \param __b
2322 ///    A 64-bit integer vector containing one of the source operands.
2323 /// \returns A 64-bit integer vector containing the products of both operands.
2324 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2325 _mm_mulhi_pu16(__m64 __a, __m64 __b)
2326 {
2327   return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2328 }
2329
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
///    destination, as specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 64-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [15:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [31:16] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [47:32] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [63:48] in the
///    destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2363
2364 /// Conditionally copies the values from each 8-bit element in the first
2365 ///    64-bit integer vector operand to the specified memory location, as
2366 ///    specified by the most significant bit in the corresponding element in the
2367 ///    second 64-bit integer vector operand.
2368 ///
2369 ///    To minimize caching, the data is flagged as non-temporal
2370 ///    (unlikely to be used again soon).
2371 ///
2372 /// \headerfile <x86intrin.h>
2373 ///
2374 /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2375 ///
2376 /// \param __d
2377 ///    A 64-bit integer vector containing the values with elements to be copied.
2378 /// \param __n
2379 ///    A 64-bit integer vector operand. The most significant bit from each 8-bit
2380 ///    element determines whether the corresponding element in operand \a __d
2381 ///    is copied. If the most significant bit of a given element is 1, the
2382 ///    corresponding element in operand \a __d is copied.
2383 /// \param __p
2384 ///    A pointer to a 64-bit memory location that will receive the conditionally
2385 ///    copied integer values. The address of the memory location does not have
2386 ///    to be aligned.
2387 static __inline__ void __DEFAULT_FN_ATTRS_MMX
2388 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2389 {
2390   __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2391 }
2392
2393 /// Computes the rounded averages of the packed unsigned 8-bit integer
2394 ///    values and writes the averages to the corresponding bits in the
2395 ///    destination.
2396 ///
2397 /// \headerfile <x86intrin.h>
2398 ///
2399 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2400 ///
2401 /// \param __a
2402 ///    A 64-bit integer vector containing one of the source operands.
2403 /// \param __b
2404 ///    A 64-bit integer vector containing one of the source operands.
2405 /// \returns A 64-bit integer vector containing the averages of both operands.
2406 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2407 _mm_avg_pu8(__m64 __a, __m64 __b)
2408 {
2409   return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2410 }
2411
2412 /// Computes the rounded averages of the packed unsigned 16-bit integer
2413 ///    values and writes the averages to the corresponding bits in the
2414 ///    destination.
2415 ///
2416 /// \headerfile <x86intrin.h>
2417 ///
2418 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2419 ///
2420 /// \param __a
2421 ///    A 64-bit integer vector containing one of the source operands.
2422 /// \param __b
2423 ///    A 64-bit integer vector containing one of the source operands.
2424 /// \returns A 64-bit integer vector containing the averages of both operands.
2425 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2426 _mm_avg_pu16(__m64 __a, __m64 __b)
2427 {
2428   return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2429 }
2430
2431 /// Subtracts the corresponding 8-bit unsigned integer values of the two
2432 ///    64-bit vector operands and computes the absolute value for each of the
2433 ///    difference. Then sum of the 8 absolute differences is written to the
2434 ///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2435 ///
2436 /// \headerfile <x86intrin.h>
2437 ///
2438 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2439 ///
2440 /// \param __a
2441 ///    A 64-bit integer vector containing one of the source operands.
2442 /// \param __b
2443 ///    A 64-bit integer vector containing one of the source operands.
2444 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2445 ///    sets of absolute differences between both operands. The upper bits are
2446 ///    cleared.
2447 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2448 _mm_sad_pu8(__m64 __a, __m64 __b)
2449 {
2450   return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2451 }
2452
2453 #if defined(__cplusplus)
2454 extern "C" {
2455 #endif
2456
2457 /// Returns the contents of the MXCSR register as a 32-bit unsigned
2458 ///    integer value.
2459 ///
2460 ///    There are several groups of macros associated with this
2461 ///    intrinsic, including:
2462 ///    <ul>
2463 ///    <li>
2464 ///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2465 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2466 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2467 ///      _MM_GET_EXCEPTION_STATE().
2468 ///    </li>
2469 ///    <li>
2470 ///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2471 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2472 ///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2473 ///    </li>
2474 ///    <li>
2475 ///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2476 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2477 ///      _MM_GET_ROUNDING_MODE().
2478 ///    </li>
2479 ///    <li>
2480 ///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2481 ///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2482 ///    </li>
2483 ///    <li>
2484 ///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2485 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2486 ///      _MM_GET_DENORMALS_ZERO_MODE().
2487 ///    </li>
2488 ///    </ul>
2489 ///
2490 ///    For example, the following expression checks if an overflow exception has
2491 ///    occurred:
2492 ///    \code
2493 ///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2494 ///    \endcode
2495 ///
2496 ///    The following expression gets the current rounding mode:
2497 ///    \code
2498 ///      _MM_GET_ROUNDING_MODE()
2499 ///    \endcode
///    That wrapper is defined in this header as
///    <c>(_mm_getcsr() & _MM_ROUND_MASK)</c>; the other _MM_GET_*() wrappers
///    defined in this header mask the register value in the same way.
2500 ///
2501 /// \headerfile <x86intrin.h>
2502 ///
2503 /// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
2504 ///
2505 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2506 ///    register.
2507 unsigned int _mm_getcsr(void);
2508
2509 /// Sets the MXCSR register with the 32-bit unsigned integer value.
2510 ///
2511 ///    There are several groups of macros associated with this intrinsic,
2512 ///    including:
2513 ///    <ul>
2514 ///    <li>
2515 ///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2516 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2517 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2518 ///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2519 ///    </li>
2520 ///    <li>
2521 ///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2522 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2523 ///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2524 ///      of these macros.
2525 ///    </li>
2526 ///    <li>
2527 ///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2528 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2529 ///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2530 ///    </li>
2531 ///    <li>
2532 ///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2533 ///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2534 ///      one of these macros.
2535 ///    </li>
2536 ///    <li>
2537 ///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2538 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2539 ///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2540 ///    </li>
2541 ///    </ul>
2542 ///
2543 ///    For example, the following expression causes subsequent floating-point
2544 ///    operations to round up (the current rounding bits are cleared first):
///    \code
2545 ///      _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_UP)
///    \endcode
///    This is what _MM_SET_ROUNDING_MODE(_MM_ROUND_UP) expands to. Simply
///    OR-ing _MM_ROUND_UP (0x4000) into the register is not enough: if
///    _MM_ROUND_DOWN (0x2000) is currently set, the result is
///    _MM_ROUND_TOWARD_ZERO (0x6000).
2546 ///
2547 ///    The following example sets the DAZ and FTZ flags:
2548 ///    \code
2549 ///    void setFlags() {
2550 ///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2551 ///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2552 ///    }
2553 ///    \endcode
2554 ///
2555 /// \headerfile <x86intrin.h>
2556 ///
2557 /// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2558 ///
2559 /// \param __i
2560 ///    A 32-bit unsigned integer value to be written to the MXCSR register.
2561 void _mm_setcsr(unsigned int __i);
2562
2563 #if defined(__cplusplus)
2564 } // extern "C"
2565 #endif
2566
/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///    destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
///
/// The whole expansion is wrapped in parentheses so that the macro behaves
/// like a function call inside larger expressions; for example, a subscript
/// written after the macro applies to the resulting __m128 value rather than
/// to the builtin call inside the cast.
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2606
2607 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2608 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2609 ///
2610 /// \headerfile <x86intrin.h>
2611 ///
2612 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2613 ///
2614 /// \param __a
2615 ///    A 128-bit vector of [4 x float]. \n
2616 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
2617 ///    Bits [127:96] are written to bits [95:64] of the destination.
2618 /// \param __b
2619 ///    A 128-bit vector of [4 x float].
2620 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
2621 ///    Bits [127:96] are written to bits [127:96] of the destination.
2622 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2623 static __inline__ __m128 __DEFAULT_FN_ATTRS
2624 _mm_unpackhi_ps(__m128 __a, __m128 __b)
2625 {
2626   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2627 }
2628
2629 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2630 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2631 ///
2632 /// \headerfile <x86intrin.h>
2633 ///
2634 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2635 ///
2636 /// \param __a
2637 ///    A 128-bit vector of [4 x float]. \n
2638 ///    Bits [31:0] are written to bits [31:0] of the destination.  \n
2639 ///    Bits [63:32] are written to bits [95:64] of the destination.
2640 /// \param __b
2641 ///    A 128-bit vector of [4 x float]. \n
2642 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
2643 ///    Bits [63:32] are written to bits [127:96] of the destination.
2644 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2645 static __inline__ __m128 __DEFAULT_FN_ATTRS
2646 _mm_unpacklo_ps(__m128 __a, __m128 __b)
2647 {
2648   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2649 }
2650
2651 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2652 ///    32 bits are set to the lower 32 bits of the second parameter. The upper
2653 ///    96 bits are set to the upper 96 bits of the first parameter.
2654 ///
2655 /// \headerfile <x86intrin.h>
2656 ///
2657 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2658 ///    instruction.
2659 ///
2660 /// \param __a
2661 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2662 ///    written to the upper 96 bits of the result.
2663 /// \param __b
2664 ///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2665 ///    written to the lower 32 bits of the result.
2666 /// \returns A 128-bit floating-point vector of [4 x float].
2667 static __inline__ __m128 __DEFAULT_FN_ATTRS
2668 _mm_move_ss(__m128 __a, __m128 __b)
2669 {
2670   __a[0] = __b[0];
2671   return __a;
2672 }
2673
2674 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2675 ///    64 bits are set to the upper 64 bits of the second parameter. The upper
2676 ///    64 bits are set to the upper 64 bits of the first parameter.
2677 ///
2678 /// \headerfile <x86intrin.h>
2679 ///
2680 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2681 ///
2682 /// \param __a
2683 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2684 ///    written to the upper 64 bits of the result.
2685 /// \param __b
2686 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2687 ///    written to the lower 64 bits of the result.
2688 /// \returns A 128-bit floating-point vector of [4 x float].
2689 static __inline__ __m128 __DEFAULT_FN_ATTRS
2690 _mm_movehl_ps(__m128 __a, __m128 __b)
2691 {
2692   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2693 }
2694
2695 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2696 ///    64 bits are set to the lower 64 bits of the first parameter. The upper
2697 ///    64 bits are set to the lower 64 bits of the second parameter.
2698 ///
2699 /// \headerfile <x86intrin.h>
2700 ///
2701 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2702 ///
2703 /// \param __a
2704 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2705 ///    written to the lower 64 bits of the result.
2706 /// \param __b
2707 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2708 ///    written to the upper 64 bits of the result.
2709 /// \returns A 128-bit floating-point vector of [4 x float].
2710 static __inline__ __m128 __DEFAULT_FN_ATTRS
2711 _mm_movelh_ps(__m128 __a, __m128 __b)
2712 {
2713   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2714 }
2715
2716 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2717 ///    float].
2718 ///
2719 /// \headerfile <x86intrin.h>
2720 ///
2721 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2722 ///
2723 /// \param __a
2724 ///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
2725 ///    from the corresponding elements in this operand.
2726 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2727 ///    values from the operand.
2728 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2729 _mm_cvtpi16_ps(__m64 __a)
2730 {
2731   __m64 __b, __c;
2732   __m128 __r;
2733
2734   __b = _mm_setzero_si64();
2735   __b = _mm_cmpgt_pi16(__b, __a);
2736   __c = _mm_unpackhi_pi16(__a, __b);
2737   __r = _mm_setzero_ps();
2738   __r = _mm_cvtpi32_ps(__r, __c);
2739   __r = _mm_movelh_ps(__r, __r);
2740   __c = _mm_unpacklo_pi16(__a, __b);
2741   __r = _mm_cvtpi32_ps(__r, __c);
2742
2743   return __r;
2744 }
2745
2746 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
2747 ///    128-bit vector of [4 x float].
2748 ///
2749 /// \headerfile <x86intrin.h>
2750 ///
2751 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2752 ///
2753 /// \param __a
2754 ///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
2755 ///    destination are copied from the corresponding elements in this operand.
2756 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2757 ///    values from the operand.
2758 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2759 _mm_cvtpu16_ps(__m64 __a)
2760 {
2761   __m64 __b, __c;
2762   __m128 __r;
2763
2764   __b = _mm_setzero_si64();
2765   __c = _mm_unpackhi_pi16(__a, __b);
2766   __r = _mm_setzero_ps();
2767   __r = _mm_cvtpi32_ps(__r, __c);
2768   __r = _mm_movelh_ps(__r, __r);
2769   __c = _mm_unpacklo_pi16(__a, __b);
2770   __r = _mm_cvtpi32_ps(__r, __c);
2771
2772   return __r;
2773 }
2774
2775 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2776 ///    into a 128-bit vector of [4 x float].
2777 ///
2778 /// \headerfile <x86intrin.h>
2779 ///
2780 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2781 ///
2782 /// \param __a
2783 ///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
2784 ///    from the corresponding lower 4 elements in this operand.
2785 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2786 ///    values from the operand.
2787 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2788 _mm_cvtpi8_ps(__m64 __a)
2789 {
2790   __m64 __b;
2791
2792   __b = _mm_setzero_si64();
2793   __b = _mm_cmpgt_pi8(__b, __a);
2794   __b = _mm_unpacklo_pi8(__a, __b);
2795
2796   return _mm_cvtpi16_ps(__b);
2797 }
2798
2799 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
2800 ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
2801 ///
2802 /// \headerfile <x86intrin.h>
2803 ///
2804 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2805 ///
2806 /// \param __a
2807 ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
2808 ///    destination are copied from the corresponding lower 4 elements in this
2809 ///    operand.
2810 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2811 ///    values from the source operand.
2812 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2813 _mm_cvtpu8_ps(__m64 __a)
2814 {
2815   __m64 __b;
2816
2817   __b = _mm_setzero_si64();
2818   __b = _mm_unpacklo_pi8(__a, __b);
2819
2820   return _mm_cvtpi16_ps(__b);
2821 }
2822
2823 /// Converts the two 32-bit signed integer values from each 64-bit vector
2824 ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
2825 ///
2826 /// \headerfile <x86intrin.h>
2827 ///
2828 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2829 ///
2830 /// \param __a
2831 ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
2832 ///    copied from the elements in this operand.
2833 /// \param __b
2834 ///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
2835 ///    copied from the elements in this operand.
2836 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2837 ///    copied and converted values from the first operand. The upper 64 bits
2838 ///    contain the copied and converted values from the second operand.
2839 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2840 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2841 {
2842   __m128 __c;
2843
2844   __c = _mm_setzero_ps();
2845   __c = _mm_cvtpi32_ps(__c, __b);
2846   __c = _mm_movelh_ps(__c, __c);
2847
2848   return _mm_cvtpi32_ps(__c, __a);
2849 }
2850
2851 /// Converts each single-precision floating-point element of a 128-bit
2852 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
2853 ///    packs the results into a 64-bit integer vector of [4 x i16].
2854 ///
2855 ///    If the floating-point element is NaN or infinity, or if the
2856 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2857 ///    it is converted to 0x8000. Otherwise if the floating-point element is
2858 ///    greater than 0x7FFF, it is converted to 0x7FFF.
2859 ///
2860 /// \headerfile <x86intrin.h>
2861 ///
2862 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2863 ///
2864 /// \param __a
2865 ///    A 128-bit floating-point vector of [4 x float].
2866 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2867 ///    values.
2868 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2869 _mm_cvtps_pi16(__m128 __a)
2870 {
2871   __m64 __b, __c;
2872
2873   __b = _mm_cvtps_pi32(__a);
2874   __a = _mm_movehl_ps(__a, __a);
2875   __c = _mm_cvtps_pi32(__a);
2876
2877   return _mm_packs_pi32(__b, __c);
2878 }
2879
2880 /// Converts each single-precision floating-point element of a 128-bit
2881 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
2882 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
2883 ///    [8 x i8]. The upper 32 bits of the vector are set to 0.
2884 ///
2885 ///    If the floating-point element is NaN or infinity, or if the
2886 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2887 ///    is converted to 0x80. Otherwise if the floating-point element is greater
2888 ///    than 0x7F, it is converted to 0x7F.
2889 ///
2890 /// \headerfile <x86intrin.h>
2891 ///
2892 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2893 ///
2894 /// \param __a
2895 ///    128-bit floating-point vector of [4 x float].
2896 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2897 ///    converted values and the uppper 32 bits are set to zero.
2898 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2899 _mm_cvtps_pi8(__m128 __a)
2900 {
2901   __m64 __b, __c;
2902
2903   __b = _mm_cvtps_pi16(__a);
2904   __c = _mm_setzero_si64();
2905
2906   return _mm_packs_pi16(__b, __c);
2907 }
2908
2909 /// Extracts the sign bits from each single-precision floating-point
2910 ///    element of a 128-bit floating-point vector of [4 x float] and returns the
2911 ///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2912 ///    to zero.
2913 ///
2914 /// \headerfile <x86intrin.h>
2915 ///
2916 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
2917 ///
2918 /// \param __a
2919 ///    A 128-bit floating-point vector of [4 x float].
2920 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2921 ///    single-precision floating-point element of the parameter. Bits [31:4] are
2922 ///    set to zero.
2923 static __inline__ int __DEFAULT_FN_ATTRS
2924 _mm_movemask_ps(__m128 __a)
2925 {
2926   return __builtin_ia32_movmskps((__v4sf)__a);
2927 }
2928
2929
/* Declaration attribute requesting 16-byte alignment. The reserved spelling
 * __aligned__ is used (as in the vector typedefs of this header) so that a
 * user macro named "aligned" cannot break the expansion. */
#define _MM_ALIGN16 __attribute__((__aligned__(16)))
2931
/* Combines four selectors into one immediate value: z is placed at bits [7:6],
 * y at bits [5:4], x at bits [3:2] and w at bits [1:0]. The arguments are not
 * masked, so each one is expected to fit in 2 bits. The result has the layout
 * described for the mask operand of _mm_shuffle_ps and the n operand of
 * _mm_shuffle_pi16. */
2932 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
2933
/* Exception-state flags of the MXCSR value handled by _mm_getcsr() and
 * _mm_setcsr(). _MM_EXCEPT_MASK is the bitwise OR of all six flags. */
2934 #define _MM_EXCEPT_INVALID    (0x0001)
2935 #define _MM_EXCEPT_DENORM     (0x0002)
2936 #define _MM_EXCEPT_DIV_ZERO   (0x0004)
2937 #define _MM_EXCEPT_OVERFLOW   (0x0008)
2938 #define _MM_EXCEPT_UNDERFLOW  (0x0010)
2939 #define _MM_EXCEPT_INEXACT    (0x0020)
2940 #define _MM_EXCEPT_MASK       (0x003f)
2941
/* Exception-mask bits. _MM_MASK_MASK is the bitwise OR of all six bits. */
2942 #define _MM_MASK_INVALID      (0x0080)
2943 #define _MM_MASK_DENORM       (0x0100)
2944 #define _MM_MASK_DIV_ZERO     (0x0200)
2945 #define _MM_MASK_OVERFLOW     (0x0400)
2946 #define _MM_MASK_UNDERFLOW    (0x0800)
2947 #define _MM_MASK_INEXACT      (0x1000)
2948 #define _MM_MASK_MASK         (0x1f80)
2949
/* Rounding-mode values. They share a two-bit field selected by
 * _MM_ROUND_MASK, so they are alternatives rather than independent flags. */
2950 #define _MM_ROUND_NEAREST     (0x0000)
2951 #define _MM_ROUND_DOWN        (0x2000)
2952 #define _MM_ROUND_UP          (0x4000)
2953 #define _MM_ROUND_TOWARD_ZERO (0x6000)
2954 #define _MM_ROUND_MASK        (0x6000)
2955
/* Flush-to-zero mode: a single bit selected by _MM_FLUSH_ZERO_MASK. */
2956 #define _MM_FLUSH_ZERO_MASK   (0x8000)
2957 #define _MM_FLUSH_ZERO_ON     (0x8000)
2958 #define _MM_FLUSH_ZERO_OFF    (0x0000)
2959
/* Getters: read the register with _mm_getcsr() and keep only the bits of one
 * field. The result stays in register position (it is not shifted down), so
 * it can be compared directly with the constants of that field. */
2960 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
2961 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
2962 #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
2963 #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
2964
/* Setters: read the register, clear the bits of one field, OR in x and write
 * the result back with _mm_setcsr(). x is not masked, so any bits of x outside
 * the field are written to the register as well. */
2965 #define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
2966 #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
2967 #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
2968 #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
2969
/* Transposes, in place, the 4x4 matrix of floats whose rows are the four
 * __m128 lvalues row0..row3. Every argument is both read and assigned, and is
 * evaluated more than once, so arguments must be side-effect-free lvalues.
 *
 * The temporaries use reserved (double-underscore) names so that they cannot
 * shadow the caller's arguments; with plain names such as tmp0, a call like
 * _MM_TRANSPOSE4_PS(tmp0, tmp1, tmp2, tmp3) would read the macro's own
 * uninitialized locals instead of the caller's rows. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
2982
/* Aliases for compatibility: each _m_* name expands to the _mm_* intrinsic
 * listed next to it. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* Defined once; the second, identical definition was redundant. */
#define _m_ _mm_
2999
3000 #undef __DEFAULT_FN_ATTRS
3001 #undef __DEFAULT_FN_ATTRS_MMX
3002
3003 /* Ugly hack for backwards-compatibility (compatible with gcc) */
3004 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3005 #include <emmintrin.h>
3006 #endif
3007
3008 #endif /* __XMMINTRIN_H */