contrib/llvm/tools/clang/lib/Headers/emmintrin.h

   1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
   2  *
   3  * Permission is hereby granted, free of charge, to any person obtaining a copy
   4  * of this software and associated documentation files (the "Software"), to deal
   5  * in the Software without restriction, including without limitation the rights
   6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   7  * copies of the Software, and to permit persons to whom the Software is
   8  * furnished to do so, subject to the following conditions:
   9  *
  10  * The above copyright notice and this permission notice shall be included in
  11  * all copies or substantial portions of the Software.
  12  *
  13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19  * THE SOFTWARE.
  20  *
  21  *===-----------------------------------------------------------------------===
  22  */
  23
  24 #ifndef __EMMINTRIN_H
  25 #define __EMMINTRIN_H
  26
  27 #include <xmmintrin.h>
  28
/* Public 128-bit vector types: a 16-byte vector of double and a 16-byte
 * vector of long long. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Type defines.  Element-typed 16-byte vector views that the intrinsics
 * below cast their operands to. */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types: unsigned counterparts of the element-typed views above. */
typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
  46
  47 #include <f16cintrin.h>
  48
/* Define the default attributes for the functions in this file: every
 * intrinsic is always inlined, carries no debug info, and is compiled for
 * the "sse2" target feature. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
  51
  52 static __inline__ __m128d __DEFAULT_FN_ATTRS
  53 _mm_add_sd(__m128d __a, __m128d __b)
  54 {
  55   __a[0] += __b[0];
  56   return __a;
  57 }
  58
  59 static __inline__ __m128d __DEFAULT_FN_ATTRS
  60 _mm_add_pd(__m128d __a, __m128d __b)
  61 {
  62   return (__m128d)((__v2df)__a + (__v2df)__b);
  63 }
  64
  65 static __inline__ __m128d __DEFAULT_FN_ATTRS
  66 _mm_sub_sd(__m128d __a, __m128d __b)
  67 {
  68   __a[0] -= __b[0];
  69   return __a;
  70 }
  71
  72 static __inline__ __m128d __DEFAULT_FN_ATTRS
  73 _mm_sub_pd(__m128d __a, __m128d __b)
  74 {
  75   return (__m128d)((__v2df)__a - (__v2df)__b);
  76 }
  77
  78 static __inline__ __m128d __DEFAULT_FN_ATTRS
  79 _mm_mul_sd(__m128d __a, __m128d __b)
  80 {
  81   __a[0] *= __b[0];
  82   return __a;
  83 }
  84
  85 static __inline__ __m128d __DEFAULT_FN_ATTRS
  86 _mm_mul_pd(__m128d __a, __m128d __b)
  87 {
  88   return (__m128d)((__v2df)__a * (__v2df)__b);
  89 }
  90
  91 static __inline__ __m128d __DEFAULT_FN_ATTRS
  92 _mm_div_sd(__m128d __a, __m128d __b)
  93 {
  94   __a[0] /= __b[0];
  95   return __a;
  96 }
  97
  98 static __inline__ __m128d __DEFAULT_FN_ATTRS
  99 _mm_div_pd(__m128d __a, __m128d __b)
 100 {
 101   return (__m128d)((__v2df)__a / (__v2df)__b);
 102 }
 103
 104 static __inline__ __m128d __DEFAULT_FN_ATTRS
 105 _mm_sqrt_sd(__m128d __a, __m128d __b)
 106 {
 107   __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
 108   return (__m128d) { __c[0], __a[1] };
 109 }
 110
 111 static __inline__ __m128d __DEFAULT_FN_ATTRS
 112 _mm_sqrt_pd(__m128d __a)
 113 {
 114   return __builtin_ia32_sqrtpd((__v2df)__a);
 115 }
 116
 117 static __inline__ __m128d __DEFAULT_FN_ATTRS
 118 _mm_min_sd(__m128d __a, __m128d __b)
 119 {
 120   return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
 121 }
 122
 123 static __inline__ __m128d __DEFAULT_FN_ATTRS
 124 _mm_min_pd(__m128d __a, __m128d __b)
 125 {
 126   return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
 127 }
 128
 129 static __inline__ __m128d __DEFAULT_FN_ATTRS
 130 _mm_max_sd(__m128d __a, __m128d __b)
 131 {
 132   return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
 133 }
 134
 135 static __inline__ __m128d __DEFAULT_FN_ATTRS
 136 _mm_max_pd(__m128d __a, __m128d __b)
 137 {
 138   return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
 139 }
 140
 141 static __inline__ __m128d __DEFAULT_FN_ATTRS
 142 _mm_and_pd(__m128d __a, __m128d __b)
 143 {
 144   return (__m128d)((__v4su)__a & (__v4su)__b);
 145 }
 146
 147 static __inline__ __m128d __DEFAULT_FN_ATTRS
 148 _mm_andnot_pd(__m128d __a, __m128d __b)
 149 {
 150   return (__m128d)(~(__v4su)__a & (__v4su)__b);
 151 }
 152
 153 static __inline__ __m128d __DEFAULT_FN_ATTRS
 154 _mm_or_pd(__m128d __a, __m128d __b)
 155 {
 156   return (__m128d)((__v4su)__a | (__v4su)__b);
 157 }
 158
 159 static __inline__ __m128d __DEFAULT_FN_ATTRS
 160 _mm_xor_pd(__m128d __a, __m128d __b)
 161 {
 162   return (__m128d)((__v4su)__a ^ (__v4su)__b);
 163 }
 164
 165 static __inline__ __m128d __DEFAULT_FN_ATTRS
 166 _mm_cmpeq_pd(__m128d __a, __m128d __b)
 167 {
 168   return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
 169 }
 170
 171 static __inline__ __m128d __DEFAULT_FN_ATTRS
 172 _mm_cmplt_pd(__m128d __a, __m128d __b)
 173 {
 174   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
 175 }
 176
 177 static __inline__ __m128d __DEFAULT_FN_ATTRS
 178 _mm_cmple_pd(__m128d __a, __m128d __b)
 179 {
 180   return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
 181 }
 182
 183 static __inline__ __m128d __DEFAULT_FN_ATTRS
 184 _mm_cmpgt_pd(__m128d __a, __m128d __b)
 185 {
 186   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
 187 }
 188
 189 static __inline__ __m128d __DEFAULT_FN_ATTRS
 190 _mm_cmpge_pd(__m128d __a, __m128d __b)
 191 {
 192   return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
 193 }
 194
 195 static __inline__ __m128d __DEFAULT_FN_ATTRS
 196 _mm_cmpord_pd(__m128d __a, __m128d __b)
 197 {
 198   return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
 199 }
 200
 201 static __inline__ __m128d __DEFAULT_FN_ATTRS
 202 _mm_cmpunord_pd(__m128d __a, __m128d __b)
 203 {
 204   return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
 205 }
 206
 207 static __inline__ __m128d __DEFAULT_FN_ATTRS
 208 _mm_cmpneq_pd(__m128d __a, __m128d __b)
 209 {
 210   return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
 211 }
 212
 213 static __inline__ __m128d __DEFAULT_FN_ATTRS
 214 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
 215 {
 216   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
 217 }
 218
 219 static __inline__ __m128d __DEFAULT_FN_ATTRS
 220 _mm_cmpnle_pd(__m128d __a, __m128d __b)
 221 {
 222   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
 223 }
 224
 225 static __inline__ __m128d __DEFAULT_FN_ATTRS
 226 _mm_cmpngt_pd(__m128d __a, __m128d __b)
 227 {
 228   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
 229 }
 230
 231 static __inline__ __m128d __DEFAULT_FN_ATTRS
 232 _mm_cmpnge_pd(__m128d __a, __m128d __b)
 233 {
 234   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
 235 }
 236
 237 static __inline__ __m128d __DEFAULT_FN_ATTRS
 238 _mm_cmpeq_sd(__m128d __a, __m128d __b)
 239 {
 240   return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
 241 }
 242
 243 static __inline__ __m128d __DEFAULT_FN_ATTRS
 244 _mm_cmplt_sd(__m128d __a, __m128d __b)
 245 {
 246   return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
 247 }
 248
 249 static __inline__ __m128d __DEFAULT_FN_ATTRS
 250 _mm_cmple_sd(__m128d __a, __m128d __b)
 251 {
 252   return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
 253 }
 254
 255 static __inline__ __m128d __DEFAULT_FN_ATTRS
 256 _mm_cmpgt_sd(__m128d __a, __m128d __b)
 257 {
 258   __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
 259   return (__m128d) { __c[0], __a[1] };
 260 }
 261
 262 static __inline__ __m128d __DEFAULT_FN_ATTRS
 263 _mm_cmpge_sd(__m128d __a, __m128d __b)
 264 {
 265   __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
 266   return (__m128d) { __c[0], __a[1] };
 267 }
 268
 269 static __inline__ __m128d __DEFAULT_FN_ATTRS
 270 _mm_cmpord_sd(__m128d __a, __m128d __b)
 271 {
 272   return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
 273 }
 274
 275 static __inline__ __m128d __DEFAULT_FN_ATTRS
 276 _mm_cmpunord_sd(__m128d __a, __m128d __b)
 277 {
 278   return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
 279 }
 280
 281 static __inline__ __m128d __DEFAULT_FN_ATTRS
 282 _mm_cmpneq_sd(__m128d __a, __m128d __b)
 283 {
 284   return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
 285 }
 286
 287 static __inline__ __m128d __DEFAULT_FN_ATTRS
 288 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
 289 {
 290   return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
 291 }
 292
 293 static __inline__ __m128d __DEFAULT_FN_ATTRS
 294 _mm_cmpnle_sd(__m128d __a, __m128d __b)
 295 {
 296   return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
 297 }
 298
 299 static __inline__ __m128d __DEFAULT_FN_ATTRS
 300 _mm_cmpngt_sd(__m128d __a, __m128d __b)
 301 {
 302   __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
 303   return (__m128d) { __c[0], __a[1] };
 304 }
 305
 306 static __inline__ __m128d __DEFAULT_FN_ATTRS
 307 _mm_cmpnge_sd(__m128d __a, __m128d __b)
 308 {
 309   __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
 310   return (__m128d) { __c[0], __a[1] };
 311 }
 312
 313 static __inline__ int __DEFAULT_FN_ATTRS
 314 _mm_comieq_sd(__m128d __a, __m128d __b)
 315 {
 316   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
 317 }
 318
 319 static __inline__ int __DEFAULT_FN_ATTRS
 320 _mm_comilt_sd(__m128d __a, __m128d __b)
 321 {
 322   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
 323 }
 324
 325 static __inline__ int __DEFAULT_FN_ATTRS
 326 _mm_comile_sd(__m128d __a, __m128d __b)
 327 {
 328   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
 329 }
 330
 331 static __inline__ int __DEFAULT_FN_ATTRS
 332 _mm_comigt_sd(__m128d __a, __m128d __b)
 333 {
 334   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
 335 }
 336
 337 static __inline__ int __DEFAULT_FN_ATTRS
 338 _mm_comige_sd(__m128d __a, __m128d __b)
 339 {
 340   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
 341 }
 342
 343 static __inline__ int __DEFAULT_FN_ATTRS
 344 _mm_comineq_sd(__m128d __a, __m128d __b)
 345 {
 346   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
 347 }
 348
 349 static __inline__ int __DEFAULT_FN_ATTRS
 350 _mm_ucomieq_sd(__m128d __a, __m128d __b)
 351 {
 352   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
 353 }
 354
 355 static __inline__ int __DEFAULT_FN_ATTRS
 356 _mm_ucomilt_sd(__m128d __a, __m128d __b)
 357 {
 358   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
 359 }
 360
 361 static __inline__ int __DEFAULT_FN_ATTRS
 362 _mm_ucomile_sd(__m128d __a, __m128d __b)
 363 {
 364   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
 365 }
 366
 367 static __inline__ int __DEFAULT_FN_ATTRS
 368 _mm_ucomigt_sd(__m128d __a, __m128d __b)
 369 {
 370   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
 371 }
 372
 373 static __inline__ int __DEFAULT_FN_ATTRS
 374 _mm_ucomige_sd(__m128d __a, __m128d __b)
 375 {
 376   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
 377 }
 378
 379 static __inline__ int __DEFAULT_FN_ATTRS
 380 _mm_ucomineq_sd(__m128d __a, __m128d __b)
 381 {
 382   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
 383 }
 384
 385 static __inline__ __m128 __DEFAULT_FN_ATTRS
 386 _mm_cvtpd_ps(__m128d __a)
 387 {
 388   return __builtin_ia32_cvtpd2ps((__v2df)__a);
 389 }
 390
 391 static __inline__ __m128d __DEFAULT_FN_ATTRS
 392 _mm_cvtps_pd(__m128 __a)
 393 {
 394   return (__m128d) __builtin_convertvector(
 395       __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
 396 }
 397
 398 static __inline__ __m128d __DEFAULT_FN_ATTRS
 399 _mm_cvtepi32_pd(__m128i __a)
 400 {
 401   return (__m128d) __builtin_convertvector(
 402       __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
 403 }
 404
 405 static __inline__ __m128i __DEFAULT_FN_ATTRS
 406 _mm_cvtpd_epi32(__m128d __a)
 407 {
 408   return __builtin_ia32_cvtpd2dq((__v2df)__a);
 409 }
 410
 411 static __inline__ int __DEFAULT_FN_ATTRS
 412 _mm_cvtsd_si32(__m128d __a)
 413 {
 414   return __builtin_ia32_cvtsd2si((__v2df)__a);
 415 }
 416
 417 static __inline__ __m128 __DEFAULT_FN_ATTRS
 418 _mm_cvtsd_ss(__m128 __a, __m128d __b)
 419 {
 420   __a[0] = __b[0];
 421   return __a;
 422 }
 423
 424 static __inline__ __m128d __DEFAULT_FN_ATTRS
 425 _mm_cvtsi32_sd(__m128d __a, int __b)
 426 {
 427   __a[0] = __b;
 428   return __a;
 429 }
 430
 431 static __inline__ __m128d __DEFAULT_FN_ATTRS
 432 _mm_cvtss_sd(__m128d __a, __m128 __b)
 433 {
 434   __a[0] = __b[0];
 435   return __a;
 436 }
 437
 438 static __inline__ __m128i __DEFAULT_FN_ATTRS
 439 _mm_cvttpd_epi32(__m128d __a)
 440 {
 441   return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
 442 }
 443
 444 static __inline__ int __DEFAULT_FN_ATTRS
 445 _mm_cvttsd_si32(__m128d __a)
 446 {
 447   return __a[0];
 448 }
 449
 450 static __inline__ __m64 __DEFAULT_FN_ATTRS
 451 _mm_cvtpd_pi32(__m128d __a)
 452 {
 453   return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
 454 }
 455
 456 static __inline__ __m64 __DEFAULT_FN_ATTRS
 457 _mm_cvttpd_pi32(__m128d __a)
 458 {
 459   return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
 460 }
 461
 462 static __inline__ __m128d __DEFAULT_FN_ATTRS
 463 _mm_cvtpi32_pd(__m64 __a)
 464 {
 465   return __builtin_ia32_cvtpi2pd((__v2si)__a);
 466 }
 467
 468 static __inline__ double __DEFAULT_FN_ATTRS
 469 _mm_cvtsd_f64(__m128d __a)
 470 {
 471   return __a[0];
 472 }
 473
 474 static __inline__ __m128d __DEFAULT_FN_ATTRS
 475 _mm_load_pd(double const *__dp)
 476 {
 477   return *(__m128d*)__dp;
 478 }
 479
 480 static __inline__ __m128d __DEFAULT_FN_ATTRS
 481 _mm_load1_pd(double const *__dp)
 482 {
 483   struct __mm_load1_pd_struct {
 484     double __u;
 485   } __attribute__((__packed__, __may_alias__));
 486   double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
 487   return (__m128d){ __u, __u };
 488 }
 489
/* Alternate spelling: _mm_load_pd1(dp) expands to _mm_load1_pd(dp). */
#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
 491
 492 static __inline__ __m128d __DEFAULT_FN_ATTRS
 493 _mm_loadr_pd(double const *__dp)
 494 {
 495   __m128d __u = *(__m128d*)__dp;
 496   return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
 497 }
 498
 499 static __inline__ __m128d __DEFAULT_FN_ATTRS
 500 _mm_loadu_pd(double const *__dp)
 501 {
 502   struct __loadu_pd {
 503     __m128d __v;
 504   } __attribute__((__packed__, __may_alias__));
 505   return ((struct __loadu_pd*)__dp)->__v;
 506 }
 507
 508 static __inline__ __m128i __DEFAULT_FN_ATTRS
 509 _mm_loadu_si64(void const *__a)
 510 {
 511   struct __loadu_si64 {
 512     long long __v;
 513   } __attribute__((__packed__, __may_alias__));
 514   long long __u = ((struct __loadu_si64*)__a)->__v;
 515   return (__m128i){__u, 0L};
 516 }
 517
 518 static __inline__ __m128d __DEFAULT_FN_ATTRS
 519 _mm_load_sd(double const *__dp)
 520 {
 521   struct __mm_load_sd_struct {
 522     double __u;
 523   } __attribute__((__packed__, __may_alias__));
 524   double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
 525   return (__m128d){ __u, 0 };
 526 }
 527
 528 static __inline__ __m128d __DEFAULT_FN_ATTRS
 529 _mm_loadh_pd(__m128d __a, double const *__dp)
 530 {
 531   struct __mm_loadh_pd_struct {
 532     double __u;
 533   } __attribute__((__packed__, __may_alias__));
 534   double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
 535   return (__m128d){ __a[0], __u };
 536 }
 537
 538 static __inline__ __m128d __DEFAULT_FN_ATTRS
 539 _mm_loadl_pd(__m128d __a, double const *__dp)
 540 {
 541   struct __mm_loadl_pd_struct {
 542     double __u;
 543   } __attribute__((__packed__, __may_alias__));
 544   double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
 545   return (__m128d){ __u, __a[1] };
 546 }
 547
 548 static __inline__ __m128d __DEFAULT_FN_ATTRS
 549 _mm_undefined_pd(void)
 550 {
 551   return (__m128d)__builtin_ia32_undef128();
 552 }
 553
 554 static __inline__ __m128d __DEFAULT_FN_ATTRS
 555 _mm_set_sd(double __w)
 556 {
 557   return (__m128d){ __w, 0 };
 558 }
 559
 560 static __inline__ __m128d __DEFAULT_FN_ATTRS
 561 _mm_set1_pd(double __w)
 562 {
 563   return (__m128d){ __w, __w };
 564 }
 565
 566 static __inline__ __m128d __DEFAULT_FN_ATTRS
 567 _mm_set_pd(double __w, double __x)
 568 {
 569   return (__m128d){ __x, __w };
 570 }
 571
 572 static __inline__ __m128d __DEFAULT_FN_ATTRS
 573 _mm_setr_pd(double __w, double __x)
 574 {
 575   return (__m128d){ __w, __x };
 576 }
 577
 578 static __inline__ __m128d __DEFAULT_FN_ATTRS
 579 _mm_setzero_pd(void)
 580 {
 581   return (__m128d){ 0, 0 };
 582 }
 583
 584 static __inline__ __m128d __DEFAULT_FN_ATTRS
 585 _mm_move_sd(__m128d __a, __m128d __b)
 586 {
 587   return (__m128d){ __b[0], __a[1] };
 588 }
 589
 590 static __inline__ void __DEFAULT_FN_ATTRS
 591 _mm_store_sd(double *__dp, __m128d __a)
 592 {
 593   struct __mm_store_sd_struct {
 594     double __u;
 595   } __attribute__((__packed__, __may_alias__));
 596   ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
 597 }
 598
 599 static __inline__ void __DEFAULT_FN_ATTRS
 600 _mm_store_pd(double *__dp, __m128d __a)
 601 {
 602   *(__m128d*)__dp = __a;
 603 }
 604
 605 static __inline__ void __DEFAULT_FN_ATTRS
 606 _mm_store1_pd(double *__dp, __m128d __a)
 607 {
 608   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
 609   _mm_store_pd(__dp, __a);
 610 }
 611
 612 static __inline__ void __DEFAULT_FN_ATTRS
 613 _mm_store_pd1(double *__dp, __m128d __a)
 614 {
 615   return _mm_store1_pd(__dp, __a);
 616 }
 617
 618 static __inline__ void __DEFAULT_FN_ATTRS
 619 _mm_storeu_pd(double *__dp, __m128d __a)
 620 {
 621   struct __storeu_pd {
 622     __m128d __v;
 623   } __attribute__((__packed__, __may_alias__));
 624   ((struct __storeu_pd*)__dp)->__v = __a;
 625 }
 626
 627 static __inline__ void __DEFAULT_FN_ATTRS
 628 _mm_storer_pd(double *__dp, __m128d __a)
 629 {
 630   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
 631   *(__m128d *)__dp = __a;
 632 }
 633
 634 static __inline__ void __DEFAULT_FN_ATTRS
 635 _mm_storeh_pd(double *__dp, __m128d __a)
 636 {
 637   struct __mm_storeh_pd_struct {
 638     double __u;
 639   } __attribute__((__packed__, __may_alias__));
 640   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
 641 }
 642
 643 static __inline__ void __DEFAULT_FN_ATTRS
 644 _mm_storel_pd(double *__dp, __m128d __a)
 645 {
 646   struct __mm_storeh_pd_struct {
 647     double __u;
 648   } __attribute__((__packed__, __may_alias__));
 649   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
 650 }
 651
 652 static __inline__ __m128i __DEFAULT_FN_ATTRS
 653 _mm_add_epi8(__m128i __a, __m128i __b)
 654 {
 655   return (__m128i)((__v16qu)__a + (__v16qu)__b);
 656 }
 657
 658 static __inline__ __m128i __DEFAULT_FN_ATTRS
 659 _mm_add_epi16(__m128i __a, __m128i __b)
 660 {
 661   return (__m128i)((__v8hu)__a + (__v8hu)__b);
 662 }
 663
 664 static __inline__ __m128i __DEFAULT_FN_ATTRS
 665 _mm_add_epi32(__m128i __a, __m128i __b)
 666 {
 667   return (__m128i)((__v4su)__a + (__v4su)__b);
 668 }
 669
 670 static __inline__ __m64 __DEFAULT_FN_ATTRS
 671 _mm_add_si64(__m64 __a, __m64 __b)
 672 {
 673   return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
 674 }
 675
 676 static __inline__ __m128i __DEFAULT_FN_ATTRS
 677 _mm_add_epi64(__m128i __a, __m128i __b)
 678 {
 679   return (__m128i)((__v2du)__a + (__v2du)__b);
 680 }
 681
 682 static __inline__ __m128i __DEFAULT_FN_ATTRS
 683 _mm_adds_epi8(__m128i __a, __m128i __b)
 684 {
 685   return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
 686 }
 687
 688 static __inline__ __m128i __DEFAULT_FN_ATTRS
 689 _mm_adds_epi16(__m128i __a, __m128i __b)
 690 {
 691   return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
 692 }
 693
 694 static __inline__ __m128i __DEFAULT_FN_ATTRS
 695 _mm_adds_epu8(__m128i __a, __m128i __b)
 696 {
 697   return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
 698 }
 699
 700 static __inline__ __m128i __DEFAULT_FN_ATTRS
 701 _mm_adds_epu16(__m128i __a, __m128i __b)
 702 {
 703   return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
 704 }
 705
 706 static __inline__ __m128i __DEFAULT_FN_ATTRS
 707 _mm_avg_epu8(__m128i __a, __m128i __b)
 708 {
 709   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
 710 }
 711
 712 static __inline__ __m128i __DEFAULT_FN_ATTRS
 713 _mm_avg_epu16(__m128i __a, __m128i __b)
 714 {
 715   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
 716 }
 717
 718 static __inline__ __m128i __DEFAULT_FN_ATTRS
 719 _mm_madd_epi16(__m128i __a, __m128i __b)
 720 {
 721   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
 722 }
 723
 724 static __inline__ __m128i __DEFAULT_FN_ATTRS
 725 _mm_max_epi16(__m128i __a, __m128i __b)
 726 {
 727   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
 728 }
 729
 730 static __inline__ __m128i __DEFAULT_FN_ATTRS
 731 _mm_max_epu8(__m128i __a, __m128i __b)
 732 {
 733   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
 734 }
 735
 736 static __inline__ __m128i __DEFAULT_FN_ATTRS
 737 _mm_min_epi16(__m128i __a, __m128i __b)
 738 {
 739   return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
 740 }
 741
 742 static __inline__ __m128i __DEFAULT_FN_ATTRS
 743 _mm_min_epu8(__m128i __a, __m128i __b)
 744 {
 745   return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
 746 }
 747
 748 static __inline__ __m128i __DEFAULT_FN_ATTRS
 749 _mm_mulhi_epi16(__m128i __a, __m128i __b)
 750 {
 751   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
 752 }
 753
 754 static __inline__ __m128i __DEFAULT_FN_ATTRS
 755 _mm_mulhi_epu16(__m128i __a, __m128i __b)
 756 {
 757   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
 758 }
 759
 760 /// \brief Multiplies the corresponding elements of two [8 x short] vectors and
 761 ///    returns a vector containing the low-order 16 bits of each 32-bit product
 762 ///    in the corresponding element.
 763 ///
 764 /// \headerfile <x86intrin.h>
 765 ///
 766 /// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction.
 767 ///
 768 /// \param __a
 769 ///    A 128-bit integer vector containing one of the source operands.
 770 /// \param __b
 771 ///    A 128-bit integer vector containing one of the source operands.
 772 /// \returns A 128-bit integer vector containing the products of both operands.
 773 static __inline__ __m128i __DEFAULT_FN_ATTRS
 774 _mm_mullo_epi16(__m128i __a, __m128i __b)
 775 {
 776   return (__m128i)((__v8hu)__a * (__v8hu)__b);
 777 }
 778
 779 /// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
 780 ///    of the two 64-bit integer vectors and returns the 64-bit unsigned
 781 ///    product.
 782 ///
 783 /// \headerfile <x86intrin.h>
 784 ///
 785 /// This intrinsic corresponds to the \c PMULUDQ instruction.
 786 ///
 787 /// \param __a
 788 ///    A 64-bit integer containing one of the source operands.
 789 /// \param __b
 790 ///    A 64-bit integer containing one of the source operands.
 791 /// \returns A 64-bit integer vector containing the product of both operands.
 792 static __inline__ __m64 __DEFAULT_FN_ATTRS
 793 _mm_mul_su32(__m64 __a, __m64 __b)
 794 {
 795   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
 796 }
 797
 798 /// \brief Multiplies 32-bit unsigned integer values contained in the lower
 799 ///    bits of the corresponding elements of two [2 x i64] vectors, and returns
 800 ///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
 801 ///
 802 /// \headerfile <x86intrin.h>
 803 ///
 804 /// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction.
 805 ///
 806 /// \param __a
 807 ///    A [2 x i64] vector containing one of the source operands.
 808 /// \param __b
 809 ///    A [2 x i64] vector containing one of the source operands.
 810 /// \returns A [2 x i64] vector containing the product of both operands.
 811 static __inline__ __m128i __DEFAULT_FN_ATTRS
 812 _mm_mul_epu32(__m128i __a, __m128i __b)
 813 {
 814   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
 815 }
 816
 817 /// \brief Computes the absolute differences of corresponding 8-bit integer
 818 ///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
 819 ///    separately sums the second 8 absolute differences. Packss these two
 820 ///    unsigned 16-bit integer sums into the upper and lower elements of a
 821 ///    [2 x i64] vector.
 822 ///
 823 /// \headerfile <x86intrin.h>
 824 ///
 825 /// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction.
 826 ///
 827 /// \param __a
 828 ///    A 128-bit integer vector containing one of the source operands.
 829 /// \param __b
 830 ///    A 128-bit integer vector containing one of the source operands.
 831 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
 832 ///    differences between both operands.
 833 static __inline__ __m128i __DEFAULT_FN_ATTRS
 834 _mm_sad_epu8(__m128i __a, __m128i __b)
 835 {
 836   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
 837 }
 838
 839 /// \brief Subtracts the corresponding 8-bit integer values in the operands.
 840 ///
 841 /// \headerfile <x86intrin.h>
 842 ///
 843 /// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction.
 844 ///
 845 /// \param __a
 846 ///    A 128-bit integer vector containing the minuends.
 847 /// \param __b
 848 ///    A 128-bit integer vector containing the subtrahends.
 849 /// \returns A 128-bit integer vector containing the differences of the values
 850 ///    in the operands.
 851 static __inline__ __m128i __DEFAULT_FN_ATTRS
 852 _mm_sub_epi8(__m128i __a, __m128i __b)
 853 {
 854   return (__m128i)((__v16qu)__a - (__v16qu)__b);
 855 }
 856
 857 /// \brief Subtracts the corresponding 16-bit integer values in the operands.
 858 ///
 859 /// \headerfile <x86intrin.h>
 860 ///
 861 /// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction.
 862 ///
 863 /// \param __a
 864 ///    A 128-bit integer vector containing the minuends.
 865 /// \param __b
 866 ///    A 128-bit integer vector containing the subtrahends.
 867 /// \returns A 128-bit integer vector containing the differences of the values
 868 ///    in the operands.
 869 static __inline__ __m128i __DEFAULT_FN_ATTRS
 870 _mm_sub_epi16(__m128i __a, __m128i __b)
 871 {
 872   return (__m128i)((__v8hu)__a - (__v8hu)__b);
 873 }
 874
 875 /// \brief Subtracts the corresponding 32-bit integer values in the operands.
 876 ///
 877 /// \headerfile <x86intrin.h>
 878 ///
 879 /// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction.
 880 ///
 881 /// \param __a
 882 ///    A 128-bit integer vector containing the minuends.
 883 /// \param __b
 884 ///    A 128-bit integer vector containing the subtrahends.
 885 /// \returns A 128-bit integer vector containing the differences of the values
 886 ///    in the operands.
 887 static __inline__ __m128i __DEFAULT_FN_ATTRS
 888 _mm_sub_epi32(__m128i __a, __m128i __b)
 889 {
 890   return (__m128i)((__v4su)__a - (__v4su)__b);
 891 }
 892
 893 /// \brief Subtracts signed or unsigned 64-bit integer values and writes the
 894 ///    difference to the corresponding bits in the destination.
 895 ///
 896 /// \headerfile <x86intrin.h>
 897 ///
 898 /// This intrinsic corresponds to the \c PSUBQ instruction.
 899 ///
 900 /// \param __a
 901 ///    A 64-bit integer vector containing the minuend.
 902 /// \param __b
 903 ///    A 64-bit integer vector containing the subtrahend.
 904 /// \returns A 64-bit integer vector containing the difference of the values in
 905 ///    the operands.
 906 static __inline__ __m64 __DEFAULT_FN_ATTRS
 907 _mm_sub_si64(__m64 __a, __m64 __b)
 908 {
 909   return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
 910 }
 911
 912 /// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
 913 ///
 914 /// \headerfile <x86intrin.h>
 915 ///
 916 /// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction.
 917 ///
 918 /// \param __a
 919 ///    A 128-bit integer vector containing the minuends.
 920 /// \param __b
 921 ///    A 128-bit integer vector containing the subtrahends.
 922 /// \returns A 128-bit integer vector containing the differences of the values
 923 ///    in the operands.
 924 static __inline__ __m128i __DEFAULT_FN_ATTRS
 925 _mm_sub_epi64(__m128i __a, __m128i __b)
 926 {
 927   return (__m128i)((__v2du)__a - (__v2du)__b);
 928 }
 929
 930 /// \brief Subtracts corresponding 8-bit signed integer values in the input and
 931 ///    returns the differences in the corresponding bytes in the destination.
 932 ///    Differences greater than 7Fh are saturated to 7Fh, and differences less
 933 ///    than 80h are saturated to 80h.
 934 ///
 935 /// \headerfile <x86intrin.h>
 936 ///
 937 /// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction.
 938 ///
 939 /// \param __a
 940 ///    A 128-bit integer vector containing the minuends.
 941 /// \param __b
 942 ///    A 128-bit integer vector containing the subtrahends.
 943 /// \returns A 128-bit integer vector containing the differences of the values
 944 ///    in the operands.
 945 static __inline__ __m128i __DEFAULT_FN_ATTRS
 946 _mm_subs_epi8(__m128i __a, __m128i __b)
 947 {
 948   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
 949 }
 950
 951 /// \brief Subtracts corresponding 16-bit signed integer values in the input and
 952 ///    returns the differences in the corresponding bytes in the destination.
 953 ///    Differences greater than 7FFFh are saturated to 7FFFh, and values less
 954 ///    than 8000h are saturated to 8000h.
 955 ///
 956 /// \headerfile <x86intrin.h>
 957 ///
 958 /// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction.
 959 ///
 960 /// \param __a
 961 ///    A 128-bit integer vector containing the minuends.
 962 /// \param __b
 963 ///    A 128-bit integer vector containing the subtrahends.
 964 /// \returns A 128-bit integer vector containing the differences of the values
 965 ///    in the operands.
 966 static __inline__ __m128i __DEFAULT_FN_ATTRS
 967 _mm_subs_epi16(__m128i __a, __m128i __b)
 968 {
 969   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
 970 }
 971
 972 /// \brief Subtracts corresponding 8-bit unsigned integer values in the input
 973 ///    and returns the differences in the corresponding bytes in the
 974 ///    destination. Differences less than 00h are saturated to 00h.
 975 ///
 976 /// \headerfile <x86intrin.h>
 977 ///
 978 /// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction.
 979 ///
 980 /// \param __a
 981 ///    A 128-bit integer vector containing the minuends.
 982 /// \param __b
 983 ///    A 128-bit integer vector containing the subtrahends.
 984 /// \returns A 128-bit integer vector containing the unsigned integer
 985 ///    differences of the values in the operands.
 986 static __inline__ __m128i __DEFAULT_FN_ATTRS
 987 _mm_subs_epu8(__m128i __a, __m128i __b)
 988 {
 989   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
 990 }
 991
 992 /// \brief Subtracts corresponding 16-bit unsigned integer values in the input
 993 ///    and returns the differences in the corresponding bytes in the
 994 ///    destination. Differences less than 0000h are saturated to 0000h.
 995 ///
 996 /// \headerfile <x86intrin.h>
 997 ///
 998 /// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction.
 999 ///
1000 /// \param __a
1001 ///    A 128-bit integer vector containing the minuends.
1002 /// \param __b
1003 ///    A 128-bit integer vector containing the subtrahends.
1004 /// \returns A 128-bit integer vector containing the unsigned integer
1005 ///    differences of the values in the operands.
1006 static __inline__ __m128i __DEFAULT_FN_ATTRS
1007 _mm_subs_epu16(__m128i __a, __m128i __b)
1008 {
1009   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
1010 }
1011
1012 /// \brief Performs a bitwise AND of two 128-bit integer vectors.
1013 ///
1014 /// \headerfile <x86intrin.h>
1015 ///
1016 /// This intrinsic corresponds to the \c VPAND / PAND instruction.
1017 ///
1018 /// \param __a
1019 ///    A 128-bit integer vector containing one of the source operands.
1020 /// \param __b
1021 ///    A 128-bit integer vector containing one of the source operands.
1022 /// \returns A 128-bit integer vector containing the bitwise AND of the values
1023 ///    in both operands.
1024 static __inline__ __m128i __DEFAULT_FN_ATTRS
1025 _mm_and_si128(__m128i __a, __m128i __b)
1026 {
1027   return (__m128i)((__v2du)__a & (__v2du)__b);
1028 }
1029
1030 /// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
1031 ///    one's complement of the values contained in the first source operand.
1032 ///
1033 /// \headerfile <x86intrin.h>
1034 ///
1035 /// This intrinsic corresponds to the \c VPANDN / PANDN instruction.
1036 ///
1037 /// \param __a
1038 ///    A 128-bit vector containing the left source operand. The one's complement
1039 ///    of this value is used in the bitwise AND.
1040 /// \param __b
1041 ///    A 128-bit vector containing the right source operand.
1042 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
1043 ///    complement of the first operand and the values in the second operand.
1044 static __inline__ __m128i __DEFAULT_FN_ATTRS
1045 _mm_andnot_si128(__m128i __a, __m128i __b)
1046 {
1047   return (__m128i)(~(__v2du)__a & (__v2du)__b);
1048 }
1049 /// \brief Performs a bitwise OR of two 128-bit integer vectors.
1050 ///
1051 /// \headerfile <x86intrin.h>
1052 ///
1053 /// This intrinsic corresponds to the \c VPOR / POR instruction.
1054 ///
1055 /// \param __a
1056 ///    A 128-bit integer vector containing one of the source operands.
1057 /// \param __b
1058 ///    A 128-bit integer vector containing one of the source operands.
1059 /// \returns A 128-bit integer vector containing the bitwise OR of the values
1060 ///    in both operands.
1061 static __inline__ __m128i __DEFAULT_FN_ATTRS
1062 _mm_or_si128(__m128i __a, __m128i __b)
1063 {
1064   return (__m128i)((__v2du)__a | (__v2du)__b);
1065 }
1066
1067 /// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
1068 ///
1069 /// \headerfile <x86intrin.h>
1070 ///
1071 /// This intrinsic corresponds to the \c VPXOR / PXOR instruction.
1072 ///
1073 /// \param __a
1074 ///    A 128-bit integer vector containing one of the source operands.
1075 /// \param __b
1076 ///    A 128-bit integer vector containing one of the source operands.
1077 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
1078 ///    values in both operands.
1079 static __inline__ __m128i __DEFAULT_FN_ATTRS
1080 _mm_xor_si128(__m128i __a, __m128i __b)
1081 {
1082   return (__m128i)((__v2du)__a ^ (__v2du)__b);
1083 }
1084
/// \brief Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Low-order bits are cleared.
///
///    The macro expands to a 16-byte shuffle whose first input is a zero
///    vector (shuffle indices 0-15) and whose second input is \a a (shuffle
///    indices 16-31). Result byte i uses index (16 + i) - (char)imm, so the
///    low \a imm bytes are taken from the zero vector and the remaining bytes
///    from \a a. If any of bits [7:4] of (char)imm are set, every index
///    selects from the zero vector and the result is all zeros.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift
///    operand a. It is expanded into each of the sixteen shuffle indices.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) __extension__ ({                              \
  (__m128i)__builtin_shufflevector(                                          \
                                 (__v16qi)_mm_setzero_si128(),               \
                                 (__v16qi)(__m128i)(a),                      \
                                 ((char)(imm)&0xF0) ?  0 : 16 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  1 : 17 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  2 : 18 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  3 : 19 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  4 : 20 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  5 : 21 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  6 : 22 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  7 : 23 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  8 : 24 - (char)(imm), \
                                 ((char)(imm)&0xF0) ?  9 : 25 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \
                                 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); })
1122
/* Alternate spelling of _mm_slli_si128; forwards both arguments unchanged. */
#define _mm_bslli_si128(a, imm) _mm_slli_si128((a), (imm))
1125
1126 /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
1127 ///    by the specified number of bits. Low-order bits are cleared.
1128 ///
1129 /// \headerfile <x86intrin.h>
1130 ///
1131 /// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
1132 ///
1133 /// \param __a
1134 ///    A 128-bit integer vector containing the source operand.
1135 /// \param __count
1136 ///    An integer value specifying the number of bits to left-shift each value
1137 ///    in operand __a.
1138 /// \returns A 128-bit integer vector containing the left-shifted values.
1139 static __inline__ __m128i __DEFAULT_FN_ATTRS
1140 _mm_slli_epi16(__m128i __a, int __count)
1141 {
1142   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
1143 }
1144
1145 /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
1146 ///    by the specified number of bits. Low-order bits are cleared.
1147 ///
1148 /// \headerfile <x86intrin.h>
1149 ///
1150 /// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction.
1151 ///
1152 /// \param __a
1153 ///    A 128-bit integer vector containing the source operand.
1154 /// \param __count
1155 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1156 ///    to left-shift each value in operand __a.
1157 /// \returns A 128-bit integer vector containing the left-shifted values.
1158 static __inline__ __m128i __DEFAULT_FN_ATTRS
1159 _mm_sll_epi16(__m128i __a, __m128i __count)
1160 {
1161   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
1162 }
1163
1164 /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
1165 ///    by the specified number of bits. Low-order bits are cleared.
1166 ///
1167 /// \headerfile <x86intrin.h>
1168 ///
1169 /// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
1170 ///
1171 /// \param __a
1172 ///    A 128-bit integer vector containing the source operand.
1173 /// \param __count
1174 ///    An integer value specifying the number of bits to left-shift each value
1175 ///    in operand __a.
1176 /// \returns A 128-bit integer vector containing the left-shifted values.
1177 static __inline__ __m128i __DEFAULT_FN_ATTRS
1178 _mm_slli_epi32(__m128i __a, int __count)
1179 {
1180   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
1181 }
1182
1183 /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
1184 ///    by the specified number of bits. Low-order bits are cleared.
1185 ///
1186 /// \headerfile <x86intrin.h>
1187 ///
1188 /// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction.
1189 ///
1190 /// \param __a
1191 ///    A 128-bit integer vector containing the source operand.
1192 /// \param __count
1193 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1194 ///    to left-shift each value in operand __a.
1195 /// \returns A 128-bit integer vector containing the left-shifted values.
1196 static __inline__ __m128i __DEFAULT_FN_ATTRS
1197 _mm_sll_epi32(__m128i __a, __m128i __count)
1198 {
1199   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
1200 }
1201
1202 /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
1203 ///    by the specified number of bits. Low-order bits are cleared.
1204 ///
1205 /// \headerfile <x86intrin.h>
1206 ///
1207 /// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
1208 ///
1209 /// \param __a
1210 ///    A 128-bit integer vector containing the source operand.
1211 /// \param __count
1212 ///    An integer value specifying the number of bits to left-shift each value
1213 ///    in operand __a.
1214 /// \returns A 128-bit integer vector containing the left-shifted values.
1215 static __inline__ __m128i __DEFAULT_FN_ATTRS
1216 _mm_slli_epi64(__m128i __a, int __count)
1217 {
1218   return __builtin_ia32_psllqi128((__v2di)__a, __count);
1219 }
1220
1221 /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
1222 ///    by the specified number of bits. Low-order bits are cleared.
1223 ///
1224 /// \headerfile <x86intrin.h>
1225 ///
1226 /// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction.
1227 ///
1228 /// \param __a
1229 ///    A 128-bit integer vector containing the source operand.
1230 /// \param __count
1231 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1232 ///    to left-shift each value in operand __a.
1233 /// \returns A 128-bit integer vector containing the left-shifted values.
1234 static __inline__ __m128i __DEFAULT_FN_ATTRS
1235 _mm_sll_epi64(__m128i __a, __m128i __count)
1236 {
1237   return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
1238 }
1239
1240 /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
1241 ///    by the specified number of bits. High-order bits are filled with the sign
1242 ///    bit of the initial value.
1243 ///
1244 /// \headerfile <x86intrin.h>
1245 ///
1246 /// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
1247 ///
1248 /// \param __a
1249 ///    A 128-bit integer vector containing the source operand.
1250 /// \param __count
1251 ///    An integer value specifying the number of bits to right-shift each value
1252 ///    in operand __a.
1253 /// \returns A 128-bit integer vector containing the right-shifted values.
1254 static __inline__ __m128i __DEFAULT_FN_ATTRS
1255 _mm_srai_epi16(__m128i __a, int __count)
1256 {
1257   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
1258 }
1259
1260 /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
1261 ///    by the specified number of bits. High-order bits are filled with the sign
1262 ///    bit of the initial value.
1263 ///
1264 /// \headerfile <x86intrin.h>
1265 ///
1266 /// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction.
1267 ///
1268 /// \param __a
1269 ///    A 128-bit integer vector containing the source operand.
1270 /// \param __count
1271 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1272 ///    to right-shift each value in operand __a.
1273 /// \returns A 128-bit integer vector containing the right-shifted values.
1274 static __inline__ __m128i __DEFAULT_FN_ATTRS
1275 _mm_sra_epi16(__m128i __a, __m128i __count)
1276 {
1277   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
1278 }
1279
1280 /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
1281 ///    by the specified number of bits. High-order bits are filled with the sign
1282 ///    bit of the initial value.
1283 ///
1284 /// \headerfile <x86intrin.h>
1285 ///
1286 /// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
1287 ///
1288 /// \param __a
1289 ///    A 128-bit integer vector containing the source operand.
1290 /// \param __count
1291 ///    An integer value specifying the number of bits to right-shift each value
1292 ///    in operand __a.
1293 /// \returns A 128-bit integer vector containing the right-shifted values.
1294 static __inline__ __m128i __DEFAULT_FN_ATTRS
1295 _mm_srai_epi32(__m128i __a, int __count)
1296 {
1297   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
1298 }
1299
1300 /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
1301 ///    by the specified number of bits. High-order bits are filled with the sign
1302 ///    bit of the initial value.
1303 ///
1304 /// \headerfile <x86intrin.h>
1305 ///
1306 /// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction.
1307 ///
1308 /// \param __a
1309 ///    A 128-bit integer vector containing the source operand.
1310 /// \param __count
1311 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1312 ///    to right-shift each value in operand __a.
1313 /// \returns A 128-bit integer vector containing the right-shifted values.
1314 static __inline__ __m128i __DEFAULT_FN_ATTRS
1315 _mm_sra_epi32(__m128i __a, __m128i __count)
1316 {
1317   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
1318 }
1319
/// \brief Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bits are cleared.
///
///    The macro expands to a 16-byte shuffle whose first input is \a a
///    (shuffle indices 0-15) and whose second input is a zero vector (shuffle
///    indices 16-31). Result byte i uses index (char)imm + i, so the high
///    \a imm bytes are taken from the zero vector and the remaining bytes
///    from \a a. If any of bits [7:4] of (char)imm are set, every index
///    selects from the zero vector and the result is all zeros.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    a. It is expanded into each of the sixteen shuffle indices.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) __extension__ ({                              \
  (__m128i)__builtin_shufflevector(                                          \
                                 (__v16qi)(__m128i)(a),                      \
                                 (__v16qi)_mm_setzero_si128(),               \
                                 ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0,  \
                                 ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1,  \
                                 ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2,  \
                                 ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3,  \
                                 ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4,  \
                                 ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5,  \
                                 ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6,  \
                                 ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7,  \
                                 ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8,  \
                                 ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9,  \
                                 ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \
                                 ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \
                                 ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
                                 ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
                                 ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
                                 ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); })
1357
/* Alternate spelling of _mm_srli_si128; forwards both arguments unchanged. */
#define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
1360
1361 /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
1362 ///    operand by the specified number of bits. High-order bits are cleared.
1363 ///
1364 /// \headerfile <x86intrin.h>
1365 ///
1366 /// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
1367 ///
1368 /// \param __a
1369 ///    A 128-bit integer vector containing the source operand.
1370 /// \param __count
1371 ///    An integer value specifying the number of bits to right-shift each value
1372 ///    in operand __a.
1373 /// \returns A 128-bit integer vector containing the right-shifted values.
1374 static __inline__ __m128i __DEFAULT_FN_ATTRS
1375 _mm_srli_epi16(__m128i __a, int __count)
1376 {
1377   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
1378 }
1379
1380 /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
1381 ///    operand by the specified number of bits. High-order bits are cleared.
1382 ///
1383 /// \headerfile <x86intrin.h>
1384 ///
1385 /// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction.
1386 ///
1387 /// \param __a
1388 ///    A 128-bit integer vector containing the source operand.
1389 /// \param __count
1390 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1391 ///    to right-shift each value in operand __a.
1392 /// \returns A 128-bit integer vector containing the right-shifted values.
1393 static __inline__ __m128i __DEFAULT_FN_ATTRS
1394 _mm_srl_epi16(__m128i __a, __m128i __count)
1395 {
1396   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
1397 }
1398
1399 /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
1400 ///    operand by the specified number of bits. High-order bits are cleared.
1401 ///
1402 /// \headerfile <x86intrin.h>
1403 ///
1404 /// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
1405 ///
1406 /// \param __a
1407 ///    A 128-bit integer vector containing the source operand.
1408 /// \param __count
1409 ///    An integer value specifying the number of bits to right-shift each value
1410 ///    in operand __a.
1411 /// \returns A 128-bit integer vector containing the right-shifted values.
1412 static __inline__ __m128i __DEFAULT_FN_ATTRS
1413 _mm_srli_epi32(__m128i __a, int __count)
1414 {
1415   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
1416 }
1417
1418 /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
1419 ///    operand by the specified number of bits. High-order bits are cleared.
1420 ///
1421 /// \headerfile <x86intrin.h>
1422 ///
1423 /// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction.
1424 ///
1425 /// \param __a
1426 ///    A 128-bit integer vector containing the source operand.
1427 /// \param __count
1428 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1429 ///    to right-shift each value in operand __a.
1430 /// \returns A 128-bit integer vector containing the right-shifted values.
1431 static __inline__ __m128i __DEFAULT_FN_ATTRS
1432 _mm_srl_epi32(__m128i __a, __m128i __count)
1433 {
1434   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
1435 }
1436
1437 /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
1438 ///    operand by the specified number of bits. High-order bits are cleared.
1439 ///
1440 /// \headerfile <x86intrin.h>
1441 ///
1442 /// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
1443 ///
1444 /// \param __a
1445 ///    A 128-bit integer vector containing the source operand.
1446 /// \param __count
1447 ///    An integer value specifying the number of bits to right-shift each value
1448 ///    in operand __a.
1449 /// \returns A 128-bit integer vector containing the right-shifted values.
1450 static __inline__ __m128i __DEFAULT_FN_ATTRS
1451 _mm_srli_epi64(__m128i __a, int __count)
1452 {
1453   return __builtin_ia32_psrlqi128((__v2di)__a, __count);
1454 }
1455
1456 /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
1457 ///    operand by the specified number of bits. High-order bits are cleared.
1458 ///
1459 /// \headerfile <x86intrin.h>
1460 ///
1461 /// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction.
1462 ///
1463 /// \param __a
1464 ///    A 128-bit integer vector containing the source operand.
1465 /// \param __count
1466 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
1467 ///    to right-shift each value in operand __a.
1468 /// \returns A 128-bit integer vector containing the right-shifted values.
1469 static __inline__ __m128i __DEFAULT_FN_ATTRS
1470 _mm_srl_epi64(__m128i __a, __m128i __count)
1471 {
1472   return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
1473 }
1474
1475 /// \brief Compares each of the corresponding 8-bit values of the 128-bit
1476 ///    integer vectors for equality. Each comparison yields 0h for false, FFh
1477 ///    for true.
1478 ///
1479 /// \headerfile <x86intrin.h>
1480 ///
1481 /// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction.
1482 ///
1483 /// \param __a
1484 ///    A 128-bit integer vector.
1485 /// \param __b
1486 ///    A 128-bit integer vector.
1487 /// \returns A 128-bit integer vector containing the comparison results.
1488 static __inline__ __m128i __DEFAULT_FN_ATTRS
1489 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
1490 {
1491   return (__m128i)((__v16qi)__a == (__v16qi)__b);
1492 }
1493
1494 /// \brief Compares each of the corresponding 16-bit values of the 128-bit
1495 ///    integer vectors for equality. Each comparison yields 0h for false, FFFFh
1496 ///    for true.
1497 ///
1498 /// \headerfile <x86intrin.h>
1499 ///
1500 /// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction.
1501 ///
1502 /// \param __a
1503 ///    A 128-bit integer vector.
1504 /// \param __b
1505 ///    A 128-bit integer vector.
1506 /// \returns A 128-bit integer vector containing the comparison results.
1507 static __inline__ __m128i __DEFAULT_FN_ATTRS
1508 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
1509 {
1510   return (__m128i)((__v8hi)__a == (__v8hi)__b);
1511 }
1512
1513 /// \brief Compares each of the corresponding 32-bit values of the 128-bit
1514 ///    integer vectors for equality. Each comparison yields 0h for false,
1515 ///    FFFFFFFFh for true.
1516 ///
1517 /// \headerfile <x86intrin.h>
1518 ///
1519 /// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction.
1520 ///
1521 /// \param __a
1522 ///    A 128-bit integer vector.
1523 /// \param __b
1524 ///    A 128-bit integer vector.
1525 /// \returns A 128-bit integer vector containing the comparison results.
1526 static __inline__ __m128i __DEFAULT_FN_ATTRS
1527 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
1528 {
1529   return (__m128i)((__v4si)__a == (__v4si)__b);
1530 }
1531
1532 /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
1533 ///    integer vectors to determine if the values in the first operand are
1534 ///    greater than those in the second operand. Each comparison yields 0h for
1535 ///    false, FFh for true.
1536 ///
1537 /// \headerfile <x86intrin.h>
1538 ///
1539 /// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
1540 ///
1541 /// \param __a
1542 ///    A 128-bit integer vector.
1543 /// \param __b
1544 ///    A 128-bit integer vector.
1545 /// \returns A 128-bit integer vector containing the comparison results.
1546 static __inline__ __m128i __DEFAULT_FN_ATTRS
1547 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
1548 {
1549   /* This function always performs a signed comparison, but __v16qi is a char
1550      which may be signed or unsigned, so use __v16qs. */
1551   return (__m128i)((__v16qs)__a > (__v16qs)__b);
1552 }
1553
1554 /// \brief Compares each of the corresponding signed 16-bit values of the
1555 ///    128-bit integer vectors to determine if the values in the first operand
1556 ///    are greater than those in the second operand. Each comparison yields 0h
1557 ///    for false, FFFFh for true.
1558 ///
1559 /// \headerfile <x86intrin.h>
1560 ///
1561 /// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
1562 ///
1563 /// \param __a
1564 ///    A 128-bit integer vector.
1565 /// \param __b
1566 ///    A 128-bit integer vector.
1567 /// \returns A 128-bit integer vector containing the comparison results.
1568 static __inline__ __m128i __DEFAULT_FN_ATTRS
1569 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
1570 {
1571   return (__m128i)((__v8hi)__a > (__v8hi)__b);
1572 }
1573
1574 /// \brief Compares each of the corresponding signed 32-bit values of the
1575 ///    128-bit integer vectors to determine if the values in the first operand
1576 ///    are greater than those in the second operand. Each comparison yields 0h
1577 ///    for false, FFFFFFFFh for true.
1578 ///
1579 /// \headerfile <x86intrin.h>
1580 ///
1581 /// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
1582 ///
1583 /// \param __a
1584 ///    A 128-bit integer vector.
1585 /// \param __b
1586 ///    A 128-bit integer vector.
1587 /// \returns A 128-bit integer vector containing the comparison results.
1588 static __inline__ __m128i __DEFAULT_FN_ATTRS
1589 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
1590 {
1591   return (__m128i)((__v4si)__a > (__v4si)__b);
1592 }
1593
1594 /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
1595 ///    integer vectors to determine if the values in the first operand are less
1596 ///    than those in the second operand. Each comparison yields 0h for false,
1597 ///    FFh for true.
1598 ///
1599 /// \headerfile <x86intrin.h>
1600 ///
1601 /// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction.
1602 ///
1603 /// \param __a
1604 ///    A 128-bit integer vector.
1605 /// \param __b
1606 ///    A 128-bit integer vector.
1607 /// \returns A 128-bit integer vector containing the comparison results.
1608 static __inline__ __m128i __DEFAULT_FN_ATTRS
1609 _mm_cmplt_epi8(__m128i __a, __m128i __b)
1610 {
1611   return _mm_cmpgt_epi8(__b, __a);
1612 }
1613
1614 /// \brief Compares each of the corresponding signed 16-bit values of the
1615 ///    128-bit integer vectors to determine if the values in the first operand
1616 ///    are less than those in the second operand. Each comparison yields 0h for
1617 ///    false, FFFFh for true.
1618 ///
1619 /// \headerfile <x86intrin.h>
1620 ///
1621 /// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction.
1622 ///
1623 /// \param __a
1624 ///    A 128-bit integer vector.
1625 /// \param __b
1626 ///    A 128-bit integer vector.
1627 /// \returns A 128-bit integer vector containing the comparison results.
1628 static __inline__ __m128i __DEFAULT_FN_ATTRS
1629 _mm_cmplt_epi16(__m128i __a, __m128i __b)
1630 {
1631   return _mm_cmpgt_epi16(__b, __a);
1632 }
1633
1634 /// \brief Compares each of the corresponding signed 32-bit values of the
1635 ///    128-bit integer vectors to determine if the values in the first operand
1636 ///    are less than those in the second operand. Each comparison yields 0h for
1637 ///    false, FFFFFFFFh for true.
1638 ///
1639 /// \headerfile <x86intrin.h>
1640 ///
1641 /// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction.
1642 ///
1643 /// \param __a
1644 ///    A 128-bit integer vector.
1645 /// \param __b
1646 ///    A 128-bit integer vector.
1647 /// \returns A 128-bit integer vector containing the comparison results.
1648 static __inline__ __m128i __DEFAULT_FN_ATTRS
1649 _mm_cmplt_epi32(__m128i __a, __m128i __b)
1650 {
1651   return _mm_cmpgt_epi32(__b, __a);
1652 }
1653
1654 #ifdef __x86_64__
1655 /// \brief Converts a 64-bit signed integer value from the second operand into a
1656 ///    double-precision value and returns it in the lower element of a [2 x
1657 ///    double] vector; the upper element of the returned vector is copied from
1658 ///    the upper element of the first operand.
1659 ///
1660 /// \headerfile <x86intrin.h>
1661 ///
1662 /// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction.
1663 ///
1664 /// \param __a
1665 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
1666 ///    copied to the upper 64 bits of the destination.
1667 /// \param __b
1668 ///    A 64-bit signed integer operand containing the value to be converted.
1669 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
1670 ///    converted value of the second operand. The upper 64 bits are copied from
1671 ///    the upper 64 bits of the first operand.
1672 static __inline__ __m128d __DEFAULT_FN_ATTRS
1673 _mm_cvtsi64_sd(__m128d __a, long long __b)
1674 {
1675   __a[0] = __b;
1676   return __a;
1677 }
1678
1679 /// \brief Converts the first (lower) element of a vector of [2 x double] into a
1680 ///    64-bit signed integer value, according to the current rounding mode.
1681 ///
1682 /// \headerfile <x86intrin.h>
1683 ///
1684 /// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction.
1685 ///
1686 /// \param __a
1687 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1688 ///    conversion.
1689 /// \returns A 64-bit signed integer containing the converted value.
1690 static __inline__ long long __DEFAULT_FN_ATTRS
1691 _mm_cvtsd_si64(__m128d __a)
1692 {
1693   return __builtin_ia32_cvtsd2si64((__v2df)__a);
1694 }
1695
1696 /// \brief Converts the first (lower) element of a vector of [2 x double] into a
1697 ///    64-bit signed integer value, truncating the result when it is inexact.
1698 ///
1699 /// \headerfile <x86intrin.h>
1700 ///
1701 /// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction.
1702 ///
1703 /// \param __a
1704 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1705 ///    conversion.
1706 /// \returns A 64-bit signed integer containing the converted value.
1707 static __inline__ long long __DEFAULT_FN_ATTRS
1708 _mm_cvttsd_si64(__m128d __a)
1709 {
1710   return __a[0];
1711 }
1712 #endif
1713
1714 /// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
1715 ///
1716 /// \headerfile <x86intrin.h>
1717 ///
1718 /// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction.
1719 ///
1720 /// \param __a
1721 ///    A 128-bit integer vector.
1722 /// \returns A 128-bit vector of [4 x float] containing the converted values.
1723 static __inline__ __m128 __DEFAULT_FN_ATTRS
1724 _mm_cvtepi32_ps(__m128i __a)
1725 {
1726   return __builtin_ia32_cvtdq2ps((__v4si)__a);
1727 }
1728
1729 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
1730 ///
1731 /// \headerfile <x86intrin.h>
1732 ///
1733 /// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction.
1734 ///
1735 /// \param __a
1736 ///    A 128-bit vector of [4 x float].
1737 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
1738 ///    values.
1739 static __inline__ __m128i __DEFAULT_FN_ATTRS
1740 _mm_cvtps_epi32(__m128 __a)
1741 {
1742   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
1743 }
1744
1745 /// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
1746 ///    truncating the result when it is inexact.
1747 ///
1748 /// \headerfile <x86intrin.h>
1749 ///
1750 /// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction.
1751 ///
1752 /// \param __a
1753 ///    A 128-bit vector of [4 x float].
1754 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
1755 static __inline__ __m128i __DEFAULT_FN_ATTRS
1756 _mm_cvttps_epi32(__m128 __a)
1757 {
1758   return (__m128i)__builtin_convertvector((__v4sf)__a, __v4si);
1759 }
1760
1761 /// \brief Returns a vector of [4 x i32] where the lowest element is the input
1762 ///    operand and the remaining elements are zero.
1763 ///
1764 /// \headerfile <x86intrin.h>
1765 ///
1766 /// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
1767 ///
1768 /// \param __a
1769 ///    A 32-bit signed integer operand.
1770 /// \returns A 128-bit vector of [4 x i32].
1771 static __inline__ __m128i __DEFAULT_FN_ATTRS
1772 _mm_cvtsi32_si128(int __a)
1773 {
1774   return (__m128i)(__v4si){ __a, 0, 0, 0 };
1775 }
1776
1777 #ifdef __x86_64__
1778 /// \brief Returns a vector of [2 x i64] where the lower element is the input
1779 ///    operand and the upper element is zero.
1780 ///
1781 /// \headerfile <x86intrin.h>
1782 ///
1783 /// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
1784 ///
1785 /// \param __a
1786 ///    A 64-bit signed integer operand containing the value to be converted.
1787 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
1788 static __inline__ __m128i __DEFAULT_FN_ATTRS
1789 _mm_cvtsi64_si128(long long __a)
1790 {
1791   return (__m128i){ __a, 0 };
1792 }
1793 #endif
1794
1795 /// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
1796 ///    32-bit signed integer value.
1797 ///
1798 /// \headerfile <x86intrin.h>
1799 ///
1800 /// This intrinsic corresponds to the \c VMOVD / MOVD instruction.
1801 ///
1802 /// \param __a
1803 ///    A vector of [4 x i32]. The least significant 32 bits are moved to the
1804 ///    destination.
1805 /// \returns A 32-bit signed integer containing the moved value.
1806 static __inline__ int __DEFAULT_FN_ATTRS
1807 _mm_cvtsi128_si32(__m128i __a)
1808 {
1809   __v4si __b = (__v4si)__a;
1810   return __b[0];
1811 }
1812
1813 #ifdef __x86_64__
1814 /// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
1815 ///    64-bit signed integer value.
1816 ///
1817 /// \headerfile <x86intrin.h>
1818 ///
1819 /// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
1820 ///
1821 /// \param __a
1822 ///    A vector of [2 x i64]. The least significant 64 bits are moved to the
1823 ///    destination.
1824 /// \returns A 64-bit signed integer containing the moved value.
1825 static __inline__ long long __DEFAULT_FN_ATTRS
1826 _mm_cvtsi128_si64(__m128i __a)
1827 {
1828   return __a[0];
1829 }
1830 #endif
1831
1832 /// \brief Moves packed integer values from an aligned 128-bit memory location
1833 ///    to elements in a 128-bit integer vector.
1834 ///
1835 /// \headerfile <x86intrin.h>
1836 ///
1837 /// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction.
1838 ///
1839 /// \param __p
1840 ///    An aligned pointer to a memory location containing integer values.
1841 /// \returns A 128-bit integer vector containing the moved values.
1842 static __inline__ __m128i __DEFAULT_FN_ATTRS
1843 _mm_load_si128(__m128i const *__p)
1844 {
1845   return *__p;
1846 }
1847
1848 /// \brief Moves packed integer values from an unaligned 128-bit memory location
1849 ///    to elements in a 128-bit integer vector.
1850 ///
1851 /// \headerfile <x86intrin.h>
1852 ///
1853 /// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction.
1854 ///
1855 /// \param __p
1856 ///    A pointer to a memory location containing integer values.
1857 /// \returns A 128-bit integer vector containing the moved values.
1858 static __inline__ __m128i __DEFAULT_FN_ATTRS
1859 _mm_loadu_si128(__m128i const *__p)
1860 {
1861   struct __loadu_si128 {
1862     __m128i __v;
1863   } __attribute__((__packed__, __may_alias__));
1864   return ((struct __loadu_si128*)__p)->__v;
1865 }
1866
1867 /// \brief Returns a vector of [2 x i64] where the lower element is taken from
1868 ///    the lower element of the operand, and the upper element is zero.
1869 ///
1870 /// \headerfile <x86intrin.h>
1871 ///
1872 /// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
1873 ///
1874 /// \param __p
1875 ///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
1876 ///    the destination.
1877 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
1878 ///    moved value. The higher order bits are cleared.
1879 static __inline__ __m128i __DEFAULT_FN_ATTRS
1880 _mm_loadl_epi64(__m128i const *__p)
1881 {
1882   struct __mm_loadl_epi64_struct {
1883     long long __u;
1884   } __attribute__((__packed__, __may_alias__));
1885   return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
1886 }
1887
1888 /// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
1889 ///    This could be used as an argument to another intrinsic function where the
1890 ///    argument is required but the value is not actually used.
1891 ///
1892 /// \headerfile <x86intrin.h>
1893 ///
1894 /// This intrinsic has no corresponding instruction.
1895 ///
1896 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
1897 static __inline__ __m128i __DEFAULT_FN_ATTRS
1898 _mm_undefined_si128(void)
1899 {
1900   return (__m128i)__builtin_ia32_undef128();
1901 }
1902
1903 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
1904 ///    the specified 64-bit integer values.
1905 ///
1906 /// \headerfile <x86intrin.h>
1907 ///
1908 /// This intrinsic is a utility function and does not correspond to a specific
1909 ///    instruction.
1910 ///
1911 /// \param __q1
1912 ///    A 64-bit integer value used to initialize the upper 64 bits of the
1913 ///    destination vector of [2 x i64].
1914 /// \param __q0
1915 ///    A 64-bit integer value used to initialize the lower 64 bits of the
1916 ///    destination vector of [2 x i64].
1917 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
1918 ///    provided in the operands.
1919 static __inline__ __m128i __DEFAULT_FN_ATTRS
1920 _mm_set_epi64x(long long __q1, long long __q0)
1921 {
1922   return (__m128i){ __q0, __q1 };
1923 }
1924
1925 /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
1926 ///    the specified 64-bit integer values.
1927 ///
1928 /// \headerfile <x86intrin.h>
1929 ///
1930 /// This intrinsic is a utility function and does not correspond to a specific
1931 ///    instruction.
1932 ///
1933 /// \param __q1
1934 ///    A 64-bit integer value used to initialize the upper 64 bits of the
1935 ///    destination vector of [2 x i64].
1936 /// \param __q0
1937 ///    A 64-bit integer value used to initialize the lower 64 bits of the
1938 ///    destination vector of [2 x i64].
1939 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
1940 ///    provided in the operands.
1941 static __inline__ __m128i __DEFAULT_FN_ATTRS
1942 _mm_set_epi64(__m64 __q1, __m64 __q0)
1943 {
1944   return (__m128i){ (long long)__q0, (long long)__q1 };
1945 }
1946
1947 /// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
1948 ///    the specified 32-bit integer values.
1949 ///
1950 /// \headerfile <x86intrin.h>
1951 ///
1952 /// This intrinsic is a utility function and does not correspond to a specific
1953 ///    instruction.
1954 ///
1955 /// \param __i3
1956 ///    A 32-bit integer value used to initialize bits [127:96] of the
1957 ///    destination vector.
1958 /// \param __i2
1959 ///    A 32-bit integer value used to initialize bits [95:64] of the destination
1960 ///    vector.
1961 /// \param __i1
1962 ///    A 32-bit integer value used to initialize bits [63:32] of the destination
1963 ///    vector.
1964 /// \param __i0
1965 ///    A 32-bit integer value used to initialize bits [31:0] of the destination
1966 ///    vector.
1967 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
1968 ///    provided in the operands.
1969 static __inline__ __m128i __DEFAULT_FN_ATTRS
1970 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
1971 {
1972   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
1973 }
1974
1975 /// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
1976 ///    the specified 16-bit integer values.
1977 ///
1978 /// \headerfile <x86intrin.h>
1979 ///
1980 /// This intrinsic is a utility function and does not correspond to a specific
1981 ///    instruction.
1982 ///
1983 /// \param __w7
1984 ///    A 16-bit integer value used to initialize bits [127:112] of the
1985 ///    destination vector.
1986 /// \param __w6
1987 ///    A 16-bit integer value used to initialize bits [111:96] of the
1988 ///    destination vector.
1989 /// \param __w5
1990 ///    A 16-bit integer value used to initialize bits [95:80] of the destination
1991 ///    vector.
1992 /// \param __w4
1993 ///    A 16-bit integer value used to initialize bits [79:64] of the destination
1994 ///    vector.
1995 /// \param __w3
1996 ///    A 16-bit integer value used to initialize bits [63:48] of the destination
1997 ///    vector.
1998 /// \param __w2
1999 ///    A 16-bit integer value used to initialize bits [47:32] of the destination
2000 ///    vector.
2001 /// \param __w1
2002 ///    A 16-bit integer value used to initialize bits [31:16] of the destination
2003 ///    vector.
2004 /// \param __w0
2005 ///    A 16-bit integer value used to initialize bits [15:0] of the destination
2006 ///    vector.
2007 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
2008 ///    provided in the operands.
2009 static __inline__ __m128i __DEFAULT_FN_ATTRS
2010 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
2011 {
2012   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
2013 }
2014
2015 /// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
2016 ///    the specified 8-bit integer values.
2017 ///
2018 /// \headerfile <x86intrin.h>
2019 ///
2020 /// This intrinsic is a utility function and does not correspond to a specific
2021 ///    instruction.
2022 ///
2023 /// \param __b15
2024 ///    Initializes bits [127:120] of the destination vector.
2025 /// \param __b14
2026 ///    Initializes bits [119:112] of the destination vector.
2027 /// \param __b13
2028 ///    Initializes bits [111:104] of the destination vector.
2029 /// \param __b12
2030 ///    Initializes bits [103:96] of the destination vector.
2031 /// \param __b11
2032 ///    Initializes bits [95:88] of the destination vector.
2033 /// \param __b10
2034 ///    Initializes bits [87:80] of the destination vector.
2035 /// \param __b9
2036 ///    Initializes bits [79:72] of the destination vector.
2037 /// \param __b8
2038 ///    Initializes bits [71:64] of the destination vector.
2039 /// \param __b7
2040 ///    Initializes bits [63:56] of the destination vector.
2041 /// \param __b6
2042 ///    Initializes bits [55:48] of the destination vector.
2043 /// \param __b5
2044 ///    Initializes bits [47:40] of the destination vector.
2045 /// \param __b4
2046 ///    Initializes bits [39:32] of the destination vector.
2047 /// \param __b3
2048 ///    Initializes bits [31:24] of the destination vector.
2049 /// \param __b2
2050 ///    Initializes bits [23:16] of the destination vector.
2051 /// \param __b1
2052 ///    Initializes bits [15:8] of the destination vector.
2053 /// \param __b0
2054 ///    Initializes bits [7:0] of the destination vector.
2055 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
2056 ///    provided in the operands.
2057 static __inline__ __m128i __DEFAULT_FN_ATTRS
2058 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
2059 {
2060   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
2061 }
2062
2063 /// \brief Initializes both values in a 128-bit integer vector with the
2064 ///    specified 64-bit integer value.
2065 ///
2066 /// \headerfile <x86intrin.h>
2067 ///
2068 /// This intrinsic is a utility function and does not correspond to a specific
2069 ///    instruction.
2070 ///
2071 /// \param __q
2072 ///    Integer value used to initialize the elements of the destination integer
2073 ///    vector.
2074 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
2075 ///    elements containing the value provided in the operand.
2076 static __inline__ __m128i __DEFAULT_FN_ATTRS
2077 _mm_set1_epi64x(long long __q)
2078 {
2079   return (__m128i){ __q, __q };
2080 }
2081
2082 /// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
2083 ///    specified 64-bit value.
2084 ///
2085 /// \headerfile <x86intrin.h>
2086 ///
2087 /// This intrinsic is a utility function and does not correspond to a specific
2088 ///    instruction.
2089 ///
2090 /// \param __q
2091 ///    A 64-bit value used to initialize the elements of the destination integer
2092 ///    vector.
2093 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
2094 ///    containing the value provided in the operand.
2095 static __inline__ __m128i __DEFAULT_FN_ATTRS
2096 _mm_set1_epi64(__m64 __q)
2097 {
2098   return (__m128i){ (long long)__q, (long long)__q };
2099 }
2100
2101 /// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
2102 ///    specified 32-bit value.
2103 ///
2104 /// \headerfile <x86intrin.h>
2105 ///
2106 /// This intrinsic is a utility function and does not correspond to a specific
2107 ///    instruction.
2108 ///
2109 /// \param __i
2110 ///    A 32-bit value used to initialize the elements of the destination integer
2111 ///    vector.
2112 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
2113 ///    containing the value provided in the operand.
2114 static __inline__ __m128i __DEFAULT_FN_ATTRS
2115 _mm_set1_epi32(int __i)
2116 {
2117   return (__m128i)(__v4si){ __i, __i, __i, __i };
2118 }
2119
2120 /// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
2121 ///    specified 16-bit value.
2122 ///
2123 /// \headerfile <x86intrin.h>
2124 ///
2125 /// This intrinsic is a utility function and does not correspond to a specific
2126 ///    instruction.
2127 ///
2128 /// \param __w
2129 ///    A 16-bit value used to initialize the elements of the destination integer
2130 ///    vector.
2131 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
2132 ///    containing the value provided in the operand.
2133 static __inline__ __m128i __DEFAULT_FN_ATTRS
2134 _mm_set1_epi16(short __w)
2135 {
2136   return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
2137 }
2138
2139 /// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
2140 ///    specified 8-bit value.
2141 ///
2142 /// \headerfile <x86intrin.h>
2143 ///
2144 /// This intrinsic is a utility function and does not correspond to a specific
2145 ///    instruction.
2146 ///
2147 /// \param __b
2148 ///    An 8-bit value used to initialize the elements of the destination integer
2149 ///    vector.
2150 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
2151 ///    containing the value provided in the operand.
2152 static __inline__ __m128i __DEFAULT_FN_ATTRS
2153 _mm_set1_epi8(char __b)
2154 {
2155   return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
2156 }
2157
2158 static __inline__ __m128i __DEFAULT_FN_ATTRS
2159 _mm_setr_epi64(__m64 __q0, __m64 __q1)
2160 {
2161   return (__m128i){ (long long)__q0, (long long)__q1 };
2162 }
2163
2164 static __inline__ __m128i __DEFAULT_FN_ATTRS
2165 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
2166 {
2167   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
2168 }
2169
2170 static __inline__ __m128i __DEFAULT_FN_ATTRS
2171 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
2172 {
2173   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
2174 }
2175
2176 static __inline__ __m128i __DEFAULT_FN_ATTRS
2177 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
2178 {
2179   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
2180 }
2181
2182 static __inline__ __m128i __DEFAULT_FN_ATTRS
2183 _mm_setzero_si128(void)
2184 {
2185   return (__m128i){ 0LL, 0LL };
2186 }
2187
2188 static __inline__ void __DEFAULT_FN_ATTRS
2189 _mm_store_si128(__m128i *__p, __m128i __b)
2190 {
2191   *__p = __b;
2192 }
2193
2194 static __inline__ void __DEFAULT_FN_ATTRS
2195 _mm_storeu_si128(__m128i *__p, __m128i __b)
2196 {
2197   struct __storeu_si128 {
2198     __m128i __v;
2199   } __attribute__((__packed__, __may_alias__));
2200   ((struct __storeu_si128*)__p)->__v = __b;
2201 }
2202
2203 static __inline__ void __DEFAULT_FN_ATTRS
2204 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
2205 {
2206   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
2207 }
2208
2209 static __inline__ void __DEFAULT_FN_ATTRS
2210 _mm_storel_epi64(__m128i *__p, __m128i __a)
2211 {
2212   struct __mm_storel_epi64_struct {
2213     long long __u;
2214   } __attribute__((__packed__, __may_alias__));
2215   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
2216 }
2217
2218 static __inline__ void __DEFAULT_FN_ATTRS
2219 _mm_stream_pd(double *__p, __m128d __a)
2220 {
2221   __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
2222 }
2223
2224 static __inline__ void __DEFAULT_FN_ATTRS
2225 _mm_stream_si128(__m128i *__p, __m128i __a)
2226 {
2227   __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
2228 }
2229
2230 static __inline__ void __DEFAULT_FN_ATTRS
2231 _mm_stream_si32(int *__p, int __a)
2232 {
2233   __builtin_ia32_movnti(__p, __a);
2234 }
2235
2236 #ifdef __x86_64__
2237 static __inline__ void __DEFAULT_FN_ATTRS
2238 _mm_stream_si64(long long *__p, long long __a)
2239 {
2240   __builtin_ia32_movnti64(__p, __a);
2241 }
2242 #endif
2243
2244 static __inline__ void __DEFAULT_FN_ATTRS
2245 _mm_clflush(void const *__p)
2246 {
2247   __builtin_ia32_clflush(__p);
2248 }
2249
2250 static __inline__ void __DEFAULT_FN_ATTRS
2251 _mm_lfence(void)
2252 {
2253   __builtin_ia32_lfence();
2254 }
2255
2256 static __inline__ void __DEFAULT_FN_ATTRS
2257 _mm_mfence(void)
2258 {
2259   __builtin_ia32_mfence();
2260 }
2261
2262 static __inline__ __m128i __DEFAULT_FN_ATTRS
2263 _mm_packs_epi16(__m128i __a, __m128i __b)
2264 {
2265   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
2266 }
2267
2268 static __inline__ __m128i __DEFAULT_FN_ATTRS
2269 _mm_packs_epi32(__m128i __a, __m128i __b)
2270 {
2271   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
2272 }
2273
2274 static __inline__ __m128i __DEFAULT_FN_ATTRS
2275 _mm_packus_epi16(__m128i __a, __m128i __b)
2276 {
2277   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
2278 }
2279
2280 static __inline__ int __DEFAULT_FN_ATTRS
2281 _mm_extract_epi16(__m128i __a, int __imm)
2282 {
2283   __v8hi __b = (__v8hi)__a;
2284   return (unsigned short)__b[__imm & 7];
2285 }
2286
2287 static __inline__ __m128i __DEFAULT_FN_ATTRS
2288 _mm_insert_epi16(__m128i __a, int __b, int __imm)
2289 {
2290   __v8hi __c = (__v8hi)__a;
2291   __c[__imm & 7] = __b;
2292   return (__m128i)__c;
2293 }
2294
2295 static __inline__ int __DEFAULT_FN_ATTRS
2296 _mm_movemask_epi8(__m128i __a)
2297 {
2298   return __builtin_ia32_pmovmskb128((__v16qi)__a);
2299 }
2300
/* Shuffles the four 32-bit elements of `a`. Result element k is element
   ((imm >> 2*k) & 3) of `a`; the second shuffle operand is an unspecified
   vector that is never selected. `imm` must be an integer constant. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v4si)(__m128i)(a), (__v4si)_mm_undefined_si128(), \
      (imm) & 0x3, \
      ((imm) >> 2) & 0x3, \
      ((imm) >> 4) & 0x3, \
      ((imm) >> 6) & 0x3); })
2306
/* Shuffles the four low 16-bit elements of `a` and copies the four high
   elements unchanged. Result element k (k = 0..3) is element
   ((imm >> 2*k) & 3) of `a`. `imm` must be an integer constant. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v8hi)(__m128i)(a), (__v8hi)_mm_undefined_si128(), \
      (imm) & 0x3, \
      ((imm) >> 2) & 0x3, \
      ((imm) >> 4) & 0x3, \
      ((imm) >> 6) & 0x3, \
      4, 5, 6, 7); })
2313
/* Shuffles the four high 16-bit elements of `a` and copies the four low
   elements unchanged. Result element 4+k (k = 0..3) is element
   4 + ((imm >> 2*k) & 3) of `a`. `imm` must be an integer constant. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v8hi)(__m128i)(a), (__v8hi)_mm_undefined_si128(), \
      0, 1, 2, 3, \
      4 + ((imm) & 0x3), \
      4 + (((imm) >> 2) & 0x3), \
      4 + (((imm) >> 4) & 0x3), \
      4 + (((imm) >> 6) & 0x3)); })
2322
2323 static __inline__ __m128i __DEFAULT_FN_ATTRS
2324 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
2325 {
2326   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2327 }
2328
2329 static __inline__ __m128i __DEFAULT_FN_ATTRS
2330 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
2331 {
2332   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
2333 }
2334
2335 static __inline__ __m128i __DEFAULT_FN_ATTRS
2336 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
2337 {
2338   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
2339 }
2340
2341 static __inline__ __m128i __DEFAULT_FN_ATTRS
2342 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
2343 {
2344   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
2345 }
2346
2347 static __inline__ __m128i __DEFAULT_FN_ATTRS
2348 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
2349 {
2350   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
2351 }
2352
2353 static __inline__ __m128i __DEFAULT_FN_ATTRS
2354 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
2355 {
2356   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
2357 }
2358
2359 static __inline__ __m128i __DEFAULT_FN_ATTRS
2360 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
2361 {
2362   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
2363 }
2364
2365 static __inline__ __m128i __DEFAULT_FN_ATTRS
2366 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
2367 {
2368   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
2369 }
2370
2371 static __inline__ __m64 __DEFAULT_FN_ATTRS
2372 _mm_movepi64_pi64(__m128i __a)
2373 {
2374   return (__m64)__a[0];
2375 }
2376
2377 static __inline__ __m128i __DEFAULT_FN_ATTRS
2378 _mm_movpi64_epi64(__m64 __a)
2379 {
2380   return (__m128i){ (long long)__a, 0 };
2381 }
2382
2383 static __inline__ __m128i __DEFAULT_FN_ATTRS
2384 _mm_move_epi64(__m128i __a)
2385 {
2386   return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
2387 }
2388
2389 static __inline__ __m128d __DEFAULT_FN_ATTRS
2390 _mm_unpackhi_pd(__m128d __a, __m128d __b)
2391 {
2392   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
2393 }
2394
2395 static __inline__ __m128d __DEFAULT_FN_ATTRS
2396 _mm_unpacklo_pd(__m128d __a, __m128d __b)
2397 {
2398   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
2399 }
2400
2401 static __inline__ int __DEFAULT_FN_ATTRS
2402 _mm_movemask_pd(__m128d __a)
2403 {
2404   return __builtin_ia32_movmskpd((__v2df)__a);
2405 }
2406
/* Builds a [2 x double] vector: result element 0 is element (i & 1) of `a`,
   result element 1 is element ((i >> 1) & 1) of `b`. `i` must be an integer
   constant. */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  (__m128d)__builtin_shufflevector( \
      (__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
      ((i) & 0x1), \
      2 + (((i) >> 1) & 0x1)); })
2411
2412 static __inline__ __m128 __DEFAULT_FN_ATTRS
2413 _mm_castpd_ps(__m128d __a)
2414 {
2415   return (__m128)__a;
2416 }
2417
2418 static __inline__ __m128i __DEFAULT_FN_ATTRS
2419 _mm_castpd_si128(__m128d __a)
2420 {
2421   return (__m128i)__a;
2422 }
2423
2424 static __inline__ __m128d __DEFAULT_FN_ATTRS
2425 _mm_castps_pd(__m128 __a)
2426 {
2427   return (__m128d)__a;
2428 }
2429
2430 static __inline__ __m128i __DEFAULT_FN_ATTRS
2431 _mm_castps_si128(__m128 __a)
2432 {
2433   return (__m128i)__a;
2434 }
2435
2436 static __inline__ __m128 __DEFAULT_FN_ATTRS
2437 _mm_castsi128_ps(__m128i __a)
2438 {
2439   return (__m128)__a;
2440 }
2441
2442 static __inline__ __m128d __DEFAULT_FN_ATTRS
2443 _mm_castsi128_pd(__m128i __a)
2444 {
2445   return (__m128d)__a;
2446 }
2447
2448 static __inline__ void __DEFAULT_FN_ATTRS
2449 _mm_pause(void)
2450 {
2451   __builtin_ia32_pause();
2452 }
2453
2454 #undef __DEFAULT_FN_ATTRS
2455
/* Builds a 2-bit selector: x goes to bit 1 and y to bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
2457
2458 #endif /* __EMMINTRIN_H */