contrib/llvm-project/clang/lib/Headers/ppc_wrappers/mmintrin.h

   1 /*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 /* Implemented from the specification included in the Intel C++ Compiler
  11    User Guide and Reference, version 9.0.  */
  12
  13 #ifndef NO_WARN_X86_INTRINSICS
  14 /* This header file is to help porting code using Intel intrinsics
  15    explicitly from x86_64 to powerpc64/powerpc64le.
  16
  17    Since PowerPC target doesn't support native 64-bit vector type, we
  18    typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
  19    works well for _si64 and some _pi32 operations.
  20
  21    For _pi16 and _pi8 operations, it's better to transfer __m64 into
  22    128-bit PowerPC vector first. Power8 introduced direct register
  23    move instructions which helps for more efficient implementation.
  24
  25    It's user's responsibility to determine if the results of such port
  26    are acceptable or further changes are needed. Please note that much
  27    code using Intel intrinsics CAN BE REWRITTEN in more portable and
  28    efficient standard C or GNU C extensions with 64-bit scalar
  29    operations, or 128-bit SSE/Altivec operations, which are more
  30    recommended. */
  31 #error                                                                         \
  32     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
  33 #endif
  34
  35 #ifndef _MMINTRIN_H_INCLUDED
  36 #define _MMINTRIN_H_INCLUDED
  37
  38 #include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
/* In this header __m64 is a plain unsigned 64-bit integer (not a vector
   type), declared with 8-byte alignment.  */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;
  42
/* Overlay used to reach the individual lanes of an __m64: store the value
   through as_m64, then read or write it through one of the other members,
   all of which share the same storage.  */
typedef __attribute__((__aligned__(8))) union {
  /* The value as a whole.  */
  __m64 as_m64;
  /* Eight byte lanes as plain char (signedness is the platform's).  */
  char as_char[8];
  /* Eight byte lanes, explicitly signed.  */
  signed char as_signed_char[8];
  /* Four short lanes.  */
  short as_short[4];
  /* Two int lanes.  */
  int as_int[2];
  /* The value as one signed long long.  */
  long long as_long_long;
  /* Two float lanes.  */
  float as_float[2];
  /* The value as one double.  */
  double as_double;
} __m64_union;
  53
  54 /* Empty the multimedia state.  */
  55 extern __inline void
  56     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  57     _mm_empty(void) {
  58   /* nothing to do on PowerPC.  */
  59 }
  60
  61 extern __inline void
  62     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  63     _m_empty(void) {
  64   /* nothing to do on PowerPC.  */
  65 }
  66
  67 /* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
  68 extern __inline __m64
  69     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  70     _mm_cvtsi32_si64(int __i) {
  71   return (__m64)(unsigned int)__i;
  72 }
  73
  74 extern __inline __m64
  75     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  76     _m_from_int(int __i) {
  77   return _mm_cvtsi32_si64(__i);
  78 }
  79
  80 /* Convert the lower 32 bits of the __m64 object into an integer.  */
  81 extern __inline int
  82     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  83     _mm_cvtsi64_si32(__m64 __i) {
  84   return ((int)__i);
  85 }
  86
  87 extern __inline int
  88     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  89     _m_to_int(__m64 __i) {
  90   return _mm_cvtsi64_si32(__i);
  91 }
  92
  93 /* Convert I to a __m64 object.  */
  94
  95 /* Intel intrinsic.  */
  96 extern __inline __m64
  97     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  98     _m_from_int64(long long __i) {
  99   return (__m64)__i;
 100 }
 101
 102 extern __inline __m64
 103     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 104     _mm_cvtsi64_m64(long long __i) {
 105   return (__m64)__i;
 106 }
 107
 108 /* Microsoft intrinsic.  */
 109 extern __inline __m64
 110     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 111     _mm_cvtsi64x_si64(long long __i) {
 112   return (__m64)__i;
 113 }
 114
 115 extern __inline __m64
 116     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 117     _mm_set_pi64x(long long __i) {
 118   return (__m64)__i;
 119 }
 120
 121 /* Convert the __m64 object to a 64bit integer.  */
 122
 123 /* Intel intrinsic.  */
 124 extern __inline long long
 125     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 126     _m_to_int64(__m64 __i) {
 127   return (long long)__i;
 128 }
 129
 130 extern __inline long long
 131     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 132     _mm_cvtm64_si64(__m64 __i) {
 133   return (long long)__i;
 134 }
 135
 136 /* Microsoft intrinsic.  */
 137 extern __inline long long
 138     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 139     _mm_cvtsi64_si64x(__m64 __i) {
 140   return (long long)__i;
 141 }
 142
 143 #ifdef _ARCH_PWR8
 144 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
 145    the result, and the four 16-bit values from M2 into the upper four 8-bit
 146    values of the result, all with signed saturation.  */
 147 extern __inline __m64
 148     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 149     _mm_packs_pi16(__m64 __m1, __m64 __m2) {
 150   __vector signed short vm1;
 151   __vector signed char vresult;
 152
 153   vm1 = (__vector signed short)(__vector unsigned long long)
 154 #ifdef __LITTLE_ENDIAN__
 155       {__m1, __m2};
 156 #else
 157       {__m2, __m1};
 158 #endif
 159   vresult = vec_packs(vm1, vm1);
 160   return (__m64)((__vector long long)vresult)[0];
 161 }
 162
 163 extern __inline __m64
 164     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 165     _m_packsswb(__m64 __m1, __m64 __m2) {
 166   return _mm_packs_pi16(__m1, __m2);
 167 }
 168
 169 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
 170    the result, and the two 32-bit values from M2 into the upper two 16-bit
 171    values of the result, all with signed saturation.  */
 172 extern __inline __m64
 173     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 174     _mm_packs_pi32(__m64 __m1, __m64 __m2) {
 175   __vector signed int vm1;
 176   __vector signed short vresult;
 177
 178   vm1 = (__vector signed int)(__vector unsigned long long)
 179 #ifdef __LITTLE_ENDIAN__
 180       {__m1, __m2};
 181 #else
 182       {__m2, __m1};
 183 #endif
 184   vresult = vec_packs(vm1, vm1);
 185   return (__m64)((__vector long long)vresult)[0];
 186 }
 187
 188 extern __inline __m64
 189     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 190     _m_packssdw(__m64 __m1, __m64 __m2) {
 191   return _mm_packs_pi32(__m1, __m2);
 192 }
 193
 194 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
 195    the result, and the four 16-bit values from M2 into the upper four 8-bit
 196    values of the result, all with unsigned saturation.  */
 197 extern __inline __m64
 198     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 199     _mm_packs_pu16(__m64 __m1, __m64 __m2) {
 200   __vector unsigned char r;
 201   __vector signed short vm1 = (__vector signed short)(__vector long long)
 202 #ifdef __LITTLE_ENDIAN__
 203       {__m1, __m2};
 204 #else
 205       {__m2, __m1};
 206 #endif
 207   const __vector signed short __zero = {0};
 208   __vector __bool short __select = vec_cmplt(vm1, __zero);
 209   r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
 210   __vector __bool char packsel = vec_pack(__select, __select);
 211   r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
 212   return (__m64)((__vector long long)r)[0];
 213 }
 214
 215 extern __inline __m64
 216     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 217     _m_packuswb(__m64 __m1, __m64 __m2) {
 218   return _mm_packs_pu16(__m1, __m2);
 219 }
 220 #endif /* end ARCH_PWR8 */
 221
 222 /* Interleave the four 8-bit values from the high half of M1 with the four
 223    8-bit values from the high half of M2.  */
 224 extern __inline __m64
 225     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 226     _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
 227 #if _ARCH_PWR8
 228   __vector unsigned char a, b, c;
 229
 230   a = (__vector unsigned char)vec_splats(__m1);
 231   b = (__vector unsigned char)vec_splats(__m2);
 232   c = vec_mergel(a, b);
 233   return (__m64)((__vector long long)c)[1];
 234 #else
 235   __m64_union m1, m2, res;
 236
 237   m1.as_m64 = __m1;
 238   m2.as_m64 = __m2;
 239
 240   res.as_char[0] = m1.as_char[4];
 241   res.as_char[1] = m2.as_char[4];
 242   res.as_char[2] = m1.as_char[5];
 243   res.as_char[3] = m2.as_char[5];
 244   res.as_char[4] = m1.as_char[6];
 245   res.as_char[5] = m2.as_char[6];
 246   res.as_char[6] = m1.as_char[7];
 247   res.as_char[7] = m2.as_char[7];
 248
 249   return (__m64)res.as_m64;
 250 #endif
 251 }
 252
 253 extern __inline __m64
 254     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 255     _m_punpckhbw(__m64 __m1, __m64 __m2) {
 256   return _mm_unpackhi_pi8(__m1, __m2);
 257 }
 258
 259 /* Interleave the two 16-bit values from the high half of M1 with the two
 260    16-bit values from the high half of M2.  */
 261 extern __inline __m64
 262     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 263     _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
 264   __m64_union m1, m2, res;
 265
 266   m1.as_m64 = __m1;
 267   m2.as_m64 = __m2;
 268
 269   res.as_short[0] = m1.as_short[2];
 270   res.as_short[1] = m2.as_short[2];
 271   res.as_short[2] = m1.as_short[3];
 272   res.as_short[3] = m2.as_short[3];
 273
 274   return (__m64)res.as_m64;
 275 }
 276
 277 extern __inline __m64
 278     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 279     _m_punpckhwd(__m64 __m1, __m64 __m2) {
 280   return _mm_unpackhi_pi16(__m1, __m2);
 281 }
 282 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
 283    value from the high half of M2.  */
 284 extern __inline __m64
 285     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 286     _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
 287   __m64_union m1, m2, res;
 288
 289   m1.as_m64 = __m1;
 290   m2.as_m64 = __m2;
 291
 292   res.as_int[0] = m1.as_int[1];
 293   res.as_int[1] = m2.as_int[1];
 294
 295   return (__m64)res.as_m64;
 296 }
 297
 298 extern __inline __m64
 299     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 300     _m_punpckhdq(__m64 __m1, __m64 __m2) {
 301   return _mm_unpackhi_pi32(__m1, __m2);
 302 }
 303 /* Interleave the four 8-bit values from the low half of M1 with the four
 304    8-bit values from the low half of M2.  */
 305 extern __inline __m64
 306     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 307     _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
 308 #if _ARCH_PWR8
 309   __vector unsigned char a, b, c;
 310
 311   a = (__vector unsigned char)vec_splats(__m1);
 312   b = (__vector unsigned char)vec_splats(__m2);
 313   c = vec_mergel(a, b);
 314   return (__m64)((__vector long long)c)[0];
 315 #else
 316   __m64_union m1, m2, res;
 317
 318   m1.as_m64 = __m1;
 319   m2.as_m64 = __m2;
 320
 321   res.as_char[0] = m1.as_char[0];
 322   res.as_char[1] = m2.as_char[0];
 323   res.as_char[2] = m1.as_char[1];
 324   res.as_char[3] = m2.as_char[1];
 325   res.as_char[4] = m1.as_char[2];
 326   res.as_char[5] = m2.as_char[2];
 327   res.as_char[6] = m1.as_char[3];
 328   res.as_char[7] = m2.as_char[3];
 329
 330   return (__m64)res.as_m64;
 331 #endif
 332 }
 333
 334 extern __inline __m64
 335     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 336     _m_punpcklbw(__m64 __m1, __m64 __m2) {
 337   return _mm_unpacklo_pi8(__m1, __m2);
 338 }
 339 /* Interleave the two 16-bit values from the low half of M1 with the two
 340    16-bit values from the low half of M2.  */
 341 extern __inline __m64
 342     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 343     _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
 344   __m64_union m1, m2, res;
 345
 346   m1.as_m64 = __m1;
 347   m2.as_m64 = __m2;
 348
 349   res.as_short[0] = m1.as_short[0];
 350   res.as_short[1] = m2.as_short[0];
 351   res.as_short[2] = m1.as_short[1];
 352   res.as_short[3] = m2.as_short[1];
 353
 354   return (__m64)res.as_m64;
 355 }
 356
 357 extern __inline __m64
 358     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 359     _m_punpcklwd(__m64 __m1, __m64 __m2) {
 360   return _mm_unpacklo_pi16(__m1, __m2);
 361 }
 362
 363 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
 364    value from the low half of M2.  */
 365 extern __inline __m64
 366     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 367     _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
 368   __m64_union m1, m2, res;
 369
 370   m1.as_m64 = __m1;
 371   m2.as_m64 = __m2;
 372
 373   res.as_int[0] = m1.as_int[0];
 374   res.as_int[1] = m2.as_int[0];
 375
 376   return (__m64)res.as_m64;
 377 }
 378
 379 extern __inline __m64
 380     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 381     _m_punpckldq(__m64 __m1, __m64 __m2) {
 382   return _mm_unpacklo_pi32(__m1, __m2);
 383 }
 384
 385 /* Add the 8-bit values in M1 to the 8-bit values in M2.  */
 386 extern __inline __m64
 387     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 388     _mm_add_pi8(__m64 __m1, __m64 __m2) {
 389 #if _ARCH_PWR8
 390   __vector signed char a, b, c;
 391
 392   a = (__vector signed char)vec_splats(__m1);
 393   b = (__vector signed char)vec_splats(__m2);
 394   c = vec_add(a, b);
 395   return (__m64)((__vector long long)c)[0];
 396 #else
 397   __m64_union m1, m2, res;
 398
 399   m1.as_m64 = __m1;
 400   m2.as_m64 = __m2;
 401
 402   res.as_char[0] = m1.as_char[0] + m2.as_char[0];
 403   res.as_char[1] = m1.as_char[1] + m2.as_char[1];
 404   res.as_char[2] = m1.as_char[2] + m2.as_char[2];
 405   res.as_char[3] = m1.as_char[3] + m2.as_char[3];
 406   res.as_char[4] = m1.as_char[4] + m2.as_char[4];
 407   res.as_char[5] = m1.as_char[5] + m2.as_char[5];
 408   res.as_char[6] = m1.as_char[6] + m2.as_char[6];
 409   res.as_char[7] = m1.as_char[7] + m2.as_char[7];
 410
 411   return (__m64)res.as_m64;
 412 #endif
 413 }
 414
 415 extern __inline __m64
 416     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 417     _m_paddb(__m64 __m1, __m64 __m2) {
 418   return _mm_add_pi8(__m1, __m2);
 419 }
 420
 421 /* Add the 16-bit values in M1 to the 16-bit values in M2.  */
 422 extern __inline __m64
 423     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 424     _mm_add_pi16(__m64 __m1, __m64 __m2) {
 425 #if _ARCH_PWR8
 426   __vector signed short a, b, c;
 427
 428   a = (__vector signed short)vec_splats(__m1);
 429   b = (__vector signed short)vec_splats(__m2);
 430   c = vec_add(a, b);
 431   return (__m64)((__vector long long)c)[0];
 432 #else
 433   __m64_union m1, m2, res;
 434
 435   m1.as_m64 = __m1;
 436   m2.as_m64 = __m2;
 437
 438   res.as_short[0] = m1.as_short[0] + m2.as_short[0];
 439   res.as_short[1] = m1.as_short[1] + m2.as_short[1];
 440   res.as_short[2] = m1.as_short[2] + m2.as_short[2];
 441   res.as_short[3] = m1.as_short[3] + m2.as_short[3];
 442
 443   return (__m64)res.as_m64;
 444 #endif
 445 }
 446
 447 extern __inline __m64
 448     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 449     _m_paddw(__m64 __m1, __m64 __m2) {
 450   return _mm_add_pi16(__m1, __m2);
 451 }
 452
 453 /* Add the 32-bit values in M1 to the 32-bit values in M2.  */
 454 extern __inline __m64
 455     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 456     _mm_add_pi32(__m64 __m1, __m64 __m2) {
 457 #if _ARCH_PWR9
 458   __vector signed int a, b, c;
 459
 460   a = (__vector signed int)vec_splats(__m1);
 461   b = (__vector signed int)vec_splats(__m2);
 462   c = vec_add(a, b);
 463   return (__m64)((__vector long long)c)[0];
 464 #else
 465   __m64_union m1, m2, res;
 466
 467   m1.as_m64 = __m1;
 468   m2.as_m64 = __m2;
 469
 470   res.as_int[0] = m1.as_int[0] + m2.as_int[0];
 471   res.as_int[1] = m1.as_int[1] + m2.as_int[1];
 472
 473   return (__m64)res.as_m64;
 474 #endif
 475 }
 476
 477 extern __inline __m64
 478     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 479     _m_paddd(__m64 __m1, __m64 __m2) {
 480   return _mm_add_pi32(__m1, __m2);
 481 }
 482
 483 /* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
 484 extern __inline __m64
 485     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 486     _mm_sub_pi8(__m64 __m1, __m64 __m2) {
 487 #if _ARCH_PWR8
 488   __vector signed char a, b, c;
 489
 490   a = (__vector signed char)vec_splats(__m1);
 491   b = (__vector signed char)vec_splats(__m2);
 492   c = vec_sub(a, b);
 493   return (__m64)((__vector long long)c)[0];
 494 #else
 495   __m64_union m1, m2, res;
 496
 497   m1.as_m64 = __m1;
 498   m2.as_m64 = __m2;
 499
 500   res.as_char[0] = m1.as_char[0] - m2.as_char[0];
 501   res.as_char[1] = m1.as_char[1] - m2.as_char[1];
 502   res.as_char[2] = m1.as_char[2] - m2.as_char[2];
 503   res.as_char[3] = m1.as_char[3] - m2.as_char[3];
 504   res.as_char[4] = m1.as_char[4] - m2.as_char[4];
 505   res.as_char[5] = m1.as_char[5] - m2.as_char[5];
 506   res.as_char[6] = m1.as_char[6] - m2.as_char[6];
 507   res.as_char[7] = m1.as_char[7] - m2.as_char[7];
 508
 509   return (__m64)res.as_m64;
 510 #endif
 511 }
 512
 513 extern __inline __m64
 514     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 515     _m_psubb(__m64 __m1, __m64 __m2) {
 516   return _mm_sub_pi8(__m1, __m2);
 517 }
 518
 519 /* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
 520 extern __inline __m64
 521     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 522     _mm_sub_pi16(__m64 __m1, __m64 __m2) {
 523 #if _ARCH_PWR8
 524   __vector signed short a, b, c;
 525
 526   a = (__vector signed short)vec_splats(__m1);
 527   b = (__vector signed short)vec_splats(__m2);
 528   c = vec_sub(a, b);
 529   return (__m64)((__vector long long)c)[0];
 530 #else
 531   __m64_union m1, m2, res;
 532
 533   m1.as_m64 = __m1;
 534   m2.as_m64 = __m2;
 535
 536   res.as_short[0] = m1.as_short[0] - m2.as_short[0];
 537   res.as_short[1] = m1.as_short[1] - m2.as_short[1];
 538   res.as_short[2] = m1.as_short[2] - m2.as_short[2];
 539   res.as_short[3] = m1.as_short[3] - m2.as_short[3];
 540
 541   return (__m64)res.as_m64;
 542 #endif
 543 }
 544
 545 extern __inline __m64
 546     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 547     _m_psubw(__m64 __m1, __m64 __m2) {
 548   return _mm_sub_pi16(__m1, __m2);
 549 }
 550
 551 /* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
 552 extern __inline __m64
 553     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 554     _mm_sub_pi32(__m64 __m1, __m64 __m2) {
 555 #if _ARCH_PWR9
 556   __vector signed int a, b, c;
 557
 558   a = (__vector signed int)vec_splats(__m1);
 559   b = (__vector signed int)vec_splats(__m2);
 560   c = vec_sub(a, b);
 561   return (__m64)((__vector long long)c)[0];
 562 #else
 563   __m64_union m1, m2, res;
 564
 565   m1.as_m64 = __m1;
 566   m2.as_m64 = __m2;
 567
 568   res.as_int[0] = m1.as_int[0] - m2.as_int[0];
 569   res.as_int[1] = m1.as_int[1] - m2.as_int[1];
 570
 571   return (__m64)res.as_m64;
 572 #endif
 573 }
 574
 575 extern __inline __m64
 576     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 577     _m_psubd(__m64 __m1, __m64 __m2) {
 578   return _mm_sub_pi32(__m1, __m2);
 579 }
 580
 581 extern __inline __m64
 582     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 583     _mm_add_si64(__m64 __m1, __m64 __m2) {
 584   return (__m1 + __m2);
 585 }
 586
 587 extern __inline __m64
 588     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 589     _mm_sub_si64(__m64 __m1, __m64 __m2) {
 590   return (__m1 - __m2);
 591 }
 592
 593 /* Shift the 64-bit value in M left by COUNT.  */
 594 extern __inline __m64
 595     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 596     _mm_sll_si64(__m64 __m, __m64 __count) {
 597   return (__m << __count);
 598 }
 599
 600 extern __inline __m64
 601     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 602     _m_psllq(__m64 __m, __m64 __count) {
 603   return _mm_sll_si64(__m, __count);
 604 }
 605
 606 extern __inline __m64
 607     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 608     _mm_slli_si64(__m64 __m, const int __count) {
 609   return (__m << __count);
 610 }
 611
 612 extern __inline __m64
 613     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 614     _m_psllqi(__m64 __m, const int __count) {
 615   return _mm_slli_si64(__m, __count);
 616 }
 617
 618 /* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
 619 extern __inline __m64
 620     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 621     _mm_srl_si64(__m64 __m, __m64 __count) {
 622   return (__m >> __count);
 623 }
 624
 625 extern __inline __m64
 626     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 627     _m_psrlq(__m64 __m, __m64 __count) {
 628   return _mm_srl_si64(__m, __count);
 629 }
 630
 631 extern __inline __m64
 632     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 633     _mm_srli_si64(__m64 __m, const int __count) {
 634   return (__m >> __count);
 635 }
 636
 637 extern __inline __m64
 638     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 639     _m_psrlqi(__m64 __m, const int __count) {
 640   return _mm_srli_si64(__m, __count);
 641 }
 642
 643 /* Bit-wise AND the 64-bit values in M1 and M2.  */
 644 extern __inline __m64
 645     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 646     _mm_and_si64(__m64 __m1, __m64 __m2) {
 647   return (__m1 & __m2);
 648 }
 649
 650 extern __inline __m64
 651     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 652     _m_pand(__m64 __m1, __m64 __m2) {
 653   return _mm_and_si64(__m1, __m2);
 654 }
 655
 656 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
 657    64-bit value in M2.  */
 658 extern __inline __m64
 659     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 660     _mm_andnot_si64(__m64 __m1, __m64 __m2) {
 661   return (~__m1 & __m2);
 662 }
 663
 664 extern __inline __m64
 665     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 666     _m_pandn(__m64 __m1, __m64 __m2) {
 667   return _mm_andnot_si64(__m1, __m2);
 668 }
 669
 670 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
 671 extern __inline __m64
 672     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 673     _mm_or_si64(__m64 __m1, __m64 __m2) {
 674   return (__m1 | __m2);
 675 }
 676
 677 extern __inline __m64
 678     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 679     _m_por(__m64 __m1, __m64 __m2) {
 680   return _mm_or_si64(__m1, __m2);
 681 }
 682
 683 /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
 684 extern __inline __m64
 685     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 686     _mm_xor_si64(__m64 __m1, __m64 __m2) {
 687   return (__m1 ^ __m2);
 688 }
 689
 690 extern __inline __m64
 691     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 692     _m_pxor(__m64 __m1, __m64 __m2) {
 693   return _mm_xor_si64(__m1, __m2);
 694 }
 695
 696 /* Creates a 64-bit zero.  */
 697 extern __inline __m64
 698     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 699     _mm_setzero_si64(void) {
 700   return (__m64)0;
 701 }
 702
 703 /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
 704    test is true and zero if false.  */
 705 extern __inline __m64
 706     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 707     _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
 708 #if defined(_ARCH_PWR6) && defined(__powerpc64__)
 709   __m64 res;
 710   __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
 711   return (res);
 712 #else
 713   __m64_union m1, m2, res;
 714
 715   m1.as_m64 = __m1;
 716   m2.as_m64 = __m2;
 717
 718   res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
 719   res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
 720   res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
 721   res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
 722   res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
 723   res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
 724   res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
 725   res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;
 726
 727   return (__m64)res.as_m64;
 728 #endif
 729 }
 730
 731 extern __inline __m64
 732     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 733     _m_pcmpeqb(__m64 __m1, __m64 __m2) {
 734   return _mm_cmpeq_pi8(__m1, __m2);
 735 }
 736
 737 extern __inline __m64
 738     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 739     _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
 740 #if _ARCH_PWR8
 741   __vector signed char a, b, c;
 742
 743   a = (__vector signed char)vec_splats(__m1);
 744   b = (__vector signed char)vec_splats(__m2);
 745   c = (__vector signed char)vec_cmpgt(a, b);
 746   return (__m64)((__vector long long)c)[0];
 747 #else
 748   __m64_union m1, m2, res;
 749
 750   m1.as_m64 = __m1;
 751   m2.as_m64 = __m2;
 752
 753   res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
 754   res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
 755   res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
 756   res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
 757   res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
 758   res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
 759   res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
 760   res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;
 761
 762   return (__m64)res.as_m64;
 763 #endif
 764 }
 765
 766 extern __inline __m64
 767     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 768     _m_pcmpgtb(__m64 __m1, __m64 __m2) {
 769   return _mm_cmpgt_pi8(__m1, __m2);
 770 }
 771
 772 /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
 773    the test is true and zero if false.  */
 774 extern __inline __m64
 775     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 776     _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
 777 #if _ARCH_PWR8
 778   __vector signed short a, b, c;
 779
 780   a = (__vector signed short)vec_splats(__m1);
 781   b = (__vector signed short)vec_splats(__m2);
 782   c = (__vector signed short)vec_cmpeq(a, b);
 783   return (__m64)((__vector long long)c)[0];
 784 #else
 785   __m64_union m1, m2, res;
 786
 787   m1.as_m64 = __m1;
 788   m2.as_m64 = __m2;
 789
 790   res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
 791   res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
 792   res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
 793   res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;
 794
 795   return (__m64)res.as_m64;
 796 #endif
 797 }
 798
 799 extern __inline __m64
 800     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 801     _m_pcmpeqw(__m64 __m1, __m64 __m2) {
 802   return _mm_cmpeq_pi16(__m1, __m2);
 803 }
 804
 805 extern __inline __m64
 806     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 807     _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
 808 #if _ARCH_PWR8
 809   __vector signed short a, b, c;
 810
 811   a = (__vector signed short)vec_splats(__m1);
 812   b = (__vector signed short)vec_splats(__m2);
 813   c = (__vector signed short)vec_cmpgt(a, b);
 814   return (__m64)((__vector long long)c)[0];
 815 #else
 816   __m64_union m1, m2, res;
 817
 818   m1.as_m64 = __m1;
 819   m2.as_m64 = __m2;
 820
 821   res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
 822   res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
 823   res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
 824   res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;
 825
 826   return (__m64)res.as_m64;
 827 #endif
 828 }
 829
 830 extern __inline __m64
 831     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 832     _m_pcmpgtw(__m64 __m1, __m64 __m2) {
 833   return _mm_cmpgt_pi16(__m1, __m2);
 834 }
 835
 836 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
 837    the test is true and zero if false.  */
 838 extern __inline __m64
 839     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 840     _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
 841 #if _ARCH_PWR9
 842   __vector signed int a, b, c;
 843
 844   a = (__vector signed int)vec_splats(__m1);
 845   b = (__vector signed int)vec_splats(__m2);
 846   c = (__vector signed int)vec_cmpeq(a, b);
 847   return (__m64)((__vector long long)c)[0];
 848 #else
 849   __m64_union m1, m2, res;
 850
 851   m1.as_m64 = __m1;
 852   m2.as_m64 = __m2;
 853
 854   res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
 855   res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;
 856
 857   return (__m64)res.as_m64;
 858 #endif
 859 }
 860
 861 extern __inline __m64
 862     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 863     _m_pcmpeqd(__m64 __m1, __m64 __m2) {
 864   return _mm_cmpeq_pi32(__m1, __m2);
 865 }
 866
 867 extern __inline __m64
 868     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 869     _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
 870 #if _ARCH_PWR9
 871   __vector signed int a, b, c;
 872
 873   a = (__vector signed int)vec_splats(__m1);
 874   b = (__vector signed int)vec_splats(__m2);
 875   c = (__vector signed int)vec_cmpgt(a, b);
 876   return (__m64)((__vector long long)c)[0];
 877 #else
 878   __m64_union m1, m2, res;
 879
 880   m1.as_m64 = __m1;
 881   m2.as_m64 = __m2;
 882
 883   res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
 884   res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;
 885
 886   return (__m64)res.as_m64;
 887 #endif
 888 }
 889
 890 extern __inline __m64
 891     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 892     _m_pcmpgtd(__m64 __m1, __m64 __m2) {
 893   return _mm_cmpgt_pi32(__m1, __m2);
 894 }
 895
 896 #if _ARCH_PWR8
 897 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
 898    saturated arithmetic.  */
 899 extern __inline __m64
 900     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 901     _mm_adds_pi8(__m64 __m1, __m64 __m2) {
 902   __vector signed char a, b, c;
 903
 904   a = (__vector signed char)vec_splats(__m1);
 905   b = (__vector signed char)vec_splats(__m2);
 906   c = vec_adds(a, b);
 907   return (__m64)((__vector long long)c)[0];
 908 }
 909
 910 extern __inline __m64
 911     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 912     _m_paddsb(__m64 __m1, __m64 __m2) {
 913   return _mm_adds_pi8(__m1, __m2);
 914 }
 915 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
 916    saturated arithmetic.  */
 917 extern __inline __m64
 918     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 919     _mm_adds_pi16(__m64 __m1, __m64 __m2) {
 920   __vector signed short a, b, c;
 921
 922   a = (__vector signed short)vec_splats(__m1);
 923   b = (__vector signed short)vec_splats(__m2);
 924   c = vec_adds(a, b);
 925   return (__m64)((__vector long long)c)[0];
 926 }
 927
 928 extern __inline __m64
 929     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 930     _m_paddsw(__m64 __m1, __m64 __m2) {
 931   return _mm_adds_pi16(__m1, __m2);
 932 }
 933 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
 934    saturated arithmetic.  */
 935 extern __inline __m64
 936     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 937     _mm_adds_pu8(__m64 __m1, __m64 __m2) {
 938   __vector unsigned char a, b, c;
 939
 940   a = (__vector unsigned char)vec_splats(__m1);
 941   b = (__vector unsigned char)vec_splats(__m2);
 942   c = vec_adds(a, b);
 943   return (__m64)((__vector long long)c)[0];
 944 }
 945
 946 extern __inline __m64
 947     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 948     _m_paddusb(__m64 __m1, __m64 __m2) {
 949   return _mm_adds_pu8(__m1, __m2);
 950 }
 951
 952 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
 953    saturated arithmetic.  */
 954 extern __inline __m64
 955     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 956     _mm_adds_pu16(__m64 __m1, __m64 __m2) {
 957   __vector unsigned short a, b, c;
 958
 959   a = (__vector unsigned short)vec_splats(__m1);
 960   b = (__vector unsigned short)vec_splats(__m2);
 961   c = vec_adds(a, b);
 962   return (__m64)((__vector long long)c)[0];
 963 }
 964
 965 extern __inline __m64
 966     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 967     _m_paddusw(__m64 __m1, __m64 __m2) {
 968   return _mm_adds_pu16(__m1, __m2);
 969 }
 970
 971 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
 972    saturating arithmetic.  */
 973 extern __inline __m64
 974     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 975     _mm_subs_pi8(__m64 __m1, __m64 __m2) {
 976   __vector signed char a, b, c;
 977
 978   a = (__vector signed char)vec_splats(__m1);
 979   b = (__vector signed char)vec_splats(__m2);
 980   c = vec_subs(a, b);
 981   return (__m64)((__vector long long)c)[0];
 982 }
 983
 984 extern __inline __m64
 985     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 986     _m_psubsb(__m64 __m1, __m64 __m2) {
 987   return _mm_subs_pi8(__m1, __m2);
 988 }
 989
 990 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
 991    signed saturating arithmetic.  */
 992 extern __inline __m64
 993     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 994     _mm_subs_pi16(__m64 __m1, __m64 __m2) {
 995   __vector signed short a, b, c;
 996
 997   a = (__vector signed short)vec_splats(__m1);
 998   b = (__vector signed short)vec_splats(__m2);
 999   c = vec_subs(a, b);
1000   return (__m64)((__vector long long)c)[0];
1001 }
1002
1003 extern __inline __m64
1004     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1005     _m_psubsw(__m64 __m1, __m64 __m2) {
1006   return _mm_subs_pi16(__m1, __m2);
1007 }
1008
1009 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1010    unsigned saturating arithmetic.  */
1011 extern __inline __m64
1012     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1013     _mm_subs_pu8(__m64 __m1, __m64 __m2) {
1014   __vector unsigned char a, b, c;
1015
1016   a = (__vector unsigned char)vec_splats(__m1);
1017   b = (__vector unsigned char)vec_splats(__m2);
1018   c = vec_subs(a, b);
1019   return (__m64)((__vector long long)c)[0];
1020 }
1021
1022 extern __inline __m64
1023     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1024     _m_psubusb(__m64 __m1, __m64 __m2) {
1025   return _mm_subs_pu8(__m1, __m2);
1026 }
1027
1028 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1029    unsigned saturating arithmetic.  */
1030 extern __inline __m64
1031     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1032     _mm_subs_pu16(__m64 __m1, __m64 __m2) {
1033   __vector unsigned short a, b, c;
1034
1035   a = (__vector unsigned short)vec_splats(__m1);
1036   b = (__vector unsigned short)vec_splats(__m2);
1037   c = vec_subs(a, b);
1038   return (__m64)((__vector long long)c)[0];
1039 }
1040
1041 extern __inline __m64
1042     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1043     _m_psubusw(__m64 __m1, __m64 __m2) {
1044   return _mm_subs_pu16(__m1, __m2);
1045 }
1046
1047 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1048    four 32-bit intermediate results, which are then summed by pairs to
1049    produce two 32-bit results.  */
1050 extern __inline __m64
1051     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1052     _mm_madd_pi16(__m64 __m1, __m64 __m2) {
1053   __vector signed short a, b;
1054   __vector signed int c;
1055   __vector signed int zero = {0, 0, 0, 0};
1056
1057   a = (__vector signed short)vec_splats(__m1);
1058   b = (__vector signed short)vec_splats(__m2);
1059   c = vec_vmsumshm(a, b, zero);
1060   return (__m64)((__vector long long)c)[0];
1061 }
1062
1063 extern __inline __m64
1064     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1065     _m_pmaddwd(__m64 __m1, __m64 __m2) {
1066   return _mm_madd_pi16(__m1, __m2);
1067 }
1068 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1069    M2 and produce the high 16 bits of the 32-bit results.  */
1070 extern __inline __m64
1071     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1072     _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
1073   __vector signed short a, b;
1074   __vector signed short c;
1075   __vector signed int w0, w1;
1076   __vector unsigned char xform1 = {
1077 #ifdef __LITTLE_ENDIAN__
1078       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1079       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1080 #else
1081       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1082       0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1083 #endif
1084   };
1085
1086   a = (__vector signed short)vec_splats(__m1);
1087   b = (__vector signed short)vec_splats(__m2);
1088
1089   w0 = vec_vmulesh(a, b);
1090   w1 = vec_vmulosh(a, b);
1091   c = (__vector signed short)vec_perm(w0, w1, xform1);
1092
1093   return (__m64)((__vector long long)c)[0];
1094 }
1095
1096 extern __inline __m64
1097     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098     _m_pmulhw(__m64 __m1, __m64 __m2) {
1099   return _mm_mulhi_pi16(__m1, __m2);
1100 }
1101
1102 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1103    the low 16 bits of the results.  */
1104 extern __inline __m64
1105     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1106     _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
1107   __vector signed short a, b, c;
1108
1109   a = (__vector signed short)vec_splats(__m1);
1110   b = (__vector signed short)vec_splats(__m2);
1111   c = a * b;
1112   return (__m64)((__vector long long)c)[0];
1113 }
1114
1115 extern __inline __m64
1116     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1117     _m_pmullw(__m64 __m1, __m64 __m2) {
1118   return _mm_mullo_pi16(__m1, __m2);
1119 }
1120
1121 /* Shift four 16-bit values in M left by COUNT.  */
1122 extern __inline __m64
1123     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1124     _mm_sll_pi16(__m64 __m, __m64 __count) {
1125   __vector signed short m, r;
1126   __vector unsigned short c;
1127
1128   if (__count <= 15) {
1129     m = (__vector signed short)vec_splats(__m);
1130     c = (__vector unsigned short)vec_splats((unsigned short)__count);
1131     r = vec_sl(m, (__vector unsigned short)c);
1132     return (__m64)((__vector long long)r)[0];
1133   } else
1134     return (0);
1135 }
1136
1137 extern __inline __m64
1138     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1139     _m_psllw(__m64 __m, __m64 __count) {
1140   return _mm_sll_pi16(__m, __count);
1141 }
1142
1143 extern __inline __m64
1144     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1145     _mm_slli_pi16(__m64 __m, int __count) {
1146   /* Promote int to long then invoke mm_sll_pi16.  */
1147   return _mm_sll_pi16(__m, __count);
1148 }
1149
1150 extern __inline __m64
1151     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1152     _m_psllwi(__m64 __m, int __count) {
1153   return _mm_slli_pi16(__m, __count);
1154 }
1155
1156 /* Shift two 32-bit values in M left by COUNT.  */
1157 extern __inline __m64
1158     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1159     _mm_sll_pi32(__m64 __m, __m64 __count) {
1160   __m64_union m, res;
1161
1162   m.as_m64 = __m;
1163
1164   res.as_int[0] = m.as_int[0] << __count;
1165   res.as_int[1] = m.as_int[1] << __count;
1166   return (res.as_m64);
1167 }
1168
1169 extern __inline __m64
1170     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1171     _m_pslld(__m64 __m, __m64 __count) {
1172   return _mm_sll_pi32(__m, __count);
1173 }
1174
1175 extern __inline __m64
1176     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1177     _mm_slli_pi32(__m64 __m, int __count) {
1178   /* Promote int to long then invoke mm_sll_pi32.  */
1179   return _mm_sll_pi32(__m, __count);
1180 }
1181
1182 extern __inline __m64
1183     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1184     _m_pslldi(__m64 __m, int __count) {
1185   return _mm_slli_pi32(__m, __count);
1186 }
1187
1188 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
1189 extern __inline __m64
1190     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1191     _mm_sra_pi16(__m64 __m, __m64 __count) {
1192   __vector signed short m, r;
1193   __vector unsigned short c;
1194
1195   if (__count <= 15) {
1196     m = (__vector signed short)vec_splats(__m);
1197     c = (__vector unsigned short)vec_splats((unsigned short)__count);
1198     r = vec_sra(m, (__vector unsigned short)c);
1199     return (__m64)((__vector long long)r)[0];
1200   } else
1201     return (0);
1202 }
1203
1204 extern __inline __m64
1205     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206     _m_psraw(__m64 __m, __m64 __count) {
1207   return _mm_sra_pi16(__m, __count);
1208 }
1209
1210 extern __inline __m64
1211     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212     _mm_srai_pi16(__m64 __m, int __count) {
1213   /* Promote int to long then invoke mm_sra_pi32.  */
1214   return _mm_sra_pi16(__m, __count);
1215 }
1216
1217 extern __inline __m64
1218     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1219     _m_psrawi(__m64 __m, int __count) {
1220   return _mm_srai_pi16(__m, __count);
1221 }
1222
1223 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
1224 extern __inline __m64
1225     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1226     _mm_sra_pi32(__m64 __m, __m64 __count) {
1227   __m64_union m, res;
1228
1229   m.as_m64 = __m;
1230
1231   res.as_int[0] = m.as_int[0] >> __count;
1232   res.as_int[1] = m.as_int[1] >> __count;
1233   return (res.as_m64);
1234 }
1235
1236 extern __inline __m64
1237     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1238     _m_psrad(__m64 __m, __m64 __count) {
1239   return _mm_sra_pi32(__m, __count);
1240 }
1241
1242 extern __inline __m64
1243     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1244     _mm_srai_pi32(__m64 __m, int __count) {
1245   /* Promote int to long then invoke mm_sra_pi32.  */
1246   return _mm_sra_pi32(__m, __count);
1247 }
1248
1249 extern __inline __m64
1250     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251     _m_psradi(__m64 __m, int __count) {
1252   return _mm_srai_pi32(__m, __count);
1253 }
1254
1255 /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
1256 extern __inline __m64
1257     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1258     _mm_srl_pi16(__m64 __m, __m64 __count) {
1259   __vector unsigned short m, r;
1260   __vector unsigned short c;
1261
1262   if (__count <= 15) {
1263     m = (__vector unsigned short)vec_splats(__m);
1264     c = (__vector unsigned short)vec_splats((unsigned short)__count);
1265     r = vec_sr(m, (__vector unsigned short)c);
1266     return (__m64)((__vector long long)r)[0];
1267   } else
1268     return (0);
1269 }
1270
1271 extern __inline __m64
1272     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1273     _m_psrlw(__m64 __m, __m64 __count) {
1274   return _mm_srl_pi16(__m, __count);
1275 }
1276
1277 extern __inline __m64
1278     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1279     _mm_srli_pi16(__m64 __m, int __count) {
1280   /* Promote int to long then invoke mm_sra_pi32.  */
1281   return _mm_srl_pi16(__m, __count);
1282 }
1283
1284 extern __inline __m64
1285     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286     _m_psrlwi(__m64 __m, int __count) {
1287   return _mm_srli_pi16(__m, __count);
1288 }
1289
1290 /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
1291 extern __inline __m64
1292     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293     _mm_srl_pi32(__m64 __m, __m64 __count) {
1294   __m64_union m, res;
1295
1296   m.as_m64 = __m;
1297
1298   res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
1299   res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
1300   return (res.as_m64);
1301 }
1302
1303 extern __inline __m64
1304     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1305     _m_psrld(__m64 __m, __m64 __count) {
1306   return _mm_srl_pi32(__m, __count);
1307 }
1308
1309 extern __inline __m64
1310     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311     _mm_srli_pi32(__m64 __m, int __count) {
1312   /* Promote int to long then invoke mm_srl_pi32.  */
1313   return _mm_srl_pi32(__m, __count);
1314 }
1315
1316 extern __inline __m64
1317     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1318     _m_psrldi(__m64 __m, int __count) {
1319   return _mm_srli_pi32(__m, __count);
1320 }
1321 #endif /* _ARCH_PWR8 */
1322
1323 /* Creates a vector of two 32-bit values; I0 is least significant.  */
1324 extern __inline __m64
1325     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1326     _mm_set_pi32(int __i1, int __i0) {
1327   __m64_union res;
1328
1329   res.as_int[0] = __i0;
1330   res.as_int[1] = __i1;
1331   return (res.as_m64);
1332 }
1333
1334 /* Creates a vector of four 16-bit values; W0 is least significant.  */
1335 extern __inline __m64
1336     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1337     _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
1338   __m64_union res;
1339
1340   res.as_short[0] = __w0;
1341   res.as_short[1] = __w1;
1342   res.as_short[2] = __w2;
1343   res.as_short[3] = __w3;
1344   return (res.as_m64);
1345 }
1346
1347 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
1348 extern __inline __m64
1349     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1350     _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
1351                 char __b2, char __b1, char __b0) {
1352   __m64_union res;
1353
1354   res.as_char[0] = __b0;
1355   res.as_char[1] = __b1;
1356   res.as_char[2] = __b2;
1357   res.as_char[3] = __b3;
1358   res.as_char[4] = __b4;
1359   res.as_char[5] = __b5;
1360   res.as_char[6] = __b6;
1361   res.as_char[7] = __b7;
1362   return (res.as_m64);
1363 }
1364
1365 /* Similar, but with the arguments in reverse order.  */
1366 extern __inline __m64
1367     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1368     _mm_setr_pi32(int __i0, int __i1) {
1369   __m64_union res;
1370
1371   res.as_int[0] = __i0;
1372   res.as_int[1] = __i1;
1373   return (res.as_m64);
1374 }
1375
1376 extern __inline __m64
1377     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1378     _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
1379   return _mm_set_pi16(__w3, __w2, __w1, __w0);
1380 }
1381
1382 extern __inline __m64
1383     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1384     _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
1385                  char __b5, char __b6, char __b7) {
1386   return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1387 }
1388
1389 /* Creates a vector of two 32-bit values, both elements containing I.  */
1390 extern __inline __m64
1391     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1392     _mm_set1_pi32(int __i) {
1393   __m64_union res;
1394
1395   res.as_int[0] = __i;
1396   res.as_int[1] = __i;
1397   return (res.as_m64);
1398 }
1399
1400 /* Creates a vector of four 16-bit values, all elements containing W.  */
1401 extern __inline __m64
1402     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1403     _mm_set1_pi16(short __w) {
1404 #if _ARCH_PWR9
1405   __vector signed short w;
1406
1407   w = (__vector signed short)vec_splats(__w);
1408   return (__m64)((__vector long long)w)[0];
1409 #else
1410   __m64_union res;
1411
1412   res.as_short[0] = __w;
1413   res.as_short[1] = __w;
1414   res.as_short[2] = __w;
1415   res.as_short[3] = __w;
1416   return (res.as_m64);
1417 #endif
1418 }
1419
1420 /* Creates a vector of eight 8-bit values, all elements containing B.  */
1421 extern __inline __m64
1422     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1423     _mm_set1_pi8(signed char __b) {
1424 #if _ARCH_PWR8
1425   __vector signed char b;
1426
1427   b = (__vector signed char)vec_splats(__b);
1428   return (__m64)((__vector long long)b)[0];
1429 #else
1430   __m64_union res;
1431
1432   res.as_char[0] = __b;
1433   res.as_char[1] = __b;
1434   res.as_char[2] = __b;
1435   res.as_char[3] = __b;
1436   res.as_char[4] = __b;
1437   res.as_char[5] = __b;
1438   res.as_char[6] = __b;
1439   res.as_char[7] = __b;
1440   return (res.as_m64);
1441 #endif
1442 }
1443 #endif /* _MMINTRIN_H_INCLUDED */