/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */
#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since PowerPC target doesn't support native 64-bit vector type, we
   typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
   works well for _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it's better to transfer __m64 into
   128-bit PowerPC vector first. Power8 introduced direct register
   move instructions which helps for more efficient implementation.

   It's user's responsibility to determine if the results of such port
   are acceptable or further changes are needed. Please note that much
   code using Intel intrinsics CAN BE REWRITTEN in more portable and
   efficient standard C or GNU C extensions with 64-bit scalar
   operations, or 128-bit SSE/Altivec operations, which are more
   recommended.  */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__linux__) && defined(__ppc64__)

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

/* Overlay union used by the scalar (pre-Power8) fallback paths to access
   the individual 8/16/32-bit lanes of an __m64 value.  */
typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;
/* Empty the multimedia state.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC.  */
}
/* MMX compatibility alias for _mm_empty.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC.  */
}
69 /* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */
71 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
72 _mm_cvtsi32_si64(int __i) {
73 return (__m64)(unsigned int)__i;
77 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78 _m_from_int(int __i) {
79 return _mm_cvtsi32_si64(__i);
82 /* Convert the lower 32 bits of the __m64 object into an integer. */
84 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
85 _mm_cvtsi64_si32(__m64 __i) {
90 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
91 _m_to_int(__m64 __i) {
92 return _mm_cvtsi64_si32(__i);
95 /* Convert I to a __m64 object. */
97 /* Intel intrinsic. */
99 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100 _m_from_int64(long long __i) {
104 extern __inline __m64
105 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
106 _mm_cvtsi64_m64(long long __i) {
110 /* Microsoft intrinsic. */
111 extern __inline __m64
112 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
113 _mm_cvtsi64x_si64(long long __i) {
117 extern __inline __m64
118 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
119 _mm_set_pi64x(long long __i) {
123 /* Convert the __m64 object to a 64bit integer. */
125 /* Intel intrinsic. */
126 extern __inline long long
127 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128 _m_to_int64(__m64 __i) {
129 return (long long)__i;
132 extern __inline long long
133 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134 _mm_cvtm64_si64(__m64 __i) {
135 return (long long)__i;
138 /* Microsoft intrinsic. */
139 extern __inline long long
140 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141 _mm_cvtsi64_si64x(__m64 __i) {
142 return (long long)__i;
146 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
147 the result, and the four 16-bit values from M2 into the upper four 8-bit
148 values of the result, all with signed saturation. */
149 extern __inline __m64
150 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
151 _mm_packs_pi16(__m64 __m1, __m64 __m2) {
152 __vector signed short vm1;
153 __vector signed char vresult;
155 vm1 = (__vector signed short)(__vector unsigned long long)
156 #ifdef __LITTLE_ENDIAN__
161 vresult = vec_packs(vm1, vm1);
162 return (__m64)((__vector long long)vresult)[0];
165 extern __inline __m64
166 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
167 _m_packsswb(__m64 __m1, __m64 __m2) {
168 return _mm_packs_pi16(__m1, __m2);
171 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
172 the result, and the two 32-bit values from M2 into the upper two 16-bit
173 values of the result, all with signed saturation. */
174 extern __inline __m64
175 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176 _mm_packs_pi32(__m64 __m1, __m64 __m2) {
177 __vector signed int vm1;
178 __vector signed short vresult;
180 vm1 = (__vector signed int)(__vector unsigned long long)
181 #ifdef __LITTLE_ENDIAN__
186 vresult = vec_packs(vm1, vm1);
187 return (__m64)((__vector long long)vresult)[0];
190 extern __inline __m64
191 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
192 _m_packssdw(__m64 __m1, __m64 __m2) {
193 return _mm_packs_pi32(__m1, __m2);
196 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
197 the result, and the four 16-bit values from M2 into the upper four 8-bit
198 values of the result, all with unsigned saturation. */
199 extern __inline __m64
200 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201 _mm_packs_pu16(__m64 __m1, __m64 __m2) {
202 __vector unsigned char r;
203 __vector signed short vm1 = (__vector signed short)(__vector long long)
204 #ifdef __LITTLE_ENDIAN__
209 const __vector signed short __zero = {0};
210 __vector __bool short __select = vec_cmplt(vm1, __zero);
211 r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
212 __vector __bool char packsel = vec_pack(__select, __select);
213 r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
214 return (__m64)((__vector long long)r)[0];
217 extern __inline __m64
218 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
219 _m_packuswb(__m64 __m1, __m64 __m2) {
220 return _mm_packs_pu16(__m1, __m2);
222 #endif /* end ARCH_PWR8 */
224 /* Interleave the four 8-bit values from the high half of M1 with the four
225 8-bit values from the high half of M2. */
226 extern __inline __m64
227 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
230 __vector unsigned char a, b, c;
232 a = (__vector unsigned char)vec_splats(__m1);
233 b = (__vector unsigned char)vec_splats(__m2);
234 c = vec_mergel(a, b);
235 return (__m64)((__vector long long)c)[1];
237 __m64_union m1, m2, res;
242 res.as_char[0] = m1.as_char[4];
243 res.as_char[1] = m2.as_char[4];
244 res.as_char[2] = m1.as_char[5];
245 res.as_char[3] = m2.as_char[5];
246 res.as_char[4] = m1.as_char[6];
247 res.as_char[5] = m2.as_char[6];
248 res.as_char[6] = m1.as_char[7];
249 res.as_char[7] = m2.as_char[7];
251 return (__m64)res.as_m64;
255 extern __inline __m64
256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257 _m_punpckhbw(__m64 __m1, __m64 __m2) {
258 return _mm_unpackhi_pi8(__m1, __m2);
261 /* Interleave the two 16-bit values from the high half of M1 with the two
262 16-bit values from the high half of M2. */
263 extern __inline __m64
264 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
266 __m64_union m1, m2, res;
271 res.as_short[0] = m1.as_short[2];
272 res.as_short[1] = m2.as_short[2];
273 res.as_short[2] = m1.as_short[3];
274 res.as_short[3] = m2.as_short[3];
276 return (__m64)res.as_m64;
279 extern __inline __m64
280 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
281 _m_punpckhwd(__m64 __m1, __m64 __m2) {
282 return _mm_unpackhi_pi16(__m1, __m2);
284 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
285 value from the high half of M2. */
286 extern __inline __m64
287 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
288 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
289 __m64_union m1, m2, res;
294 res.as_int[0] = m1.as_int[1];
295 res.as_int[1] = m2.as_int[1];
297 return (__m64)res.as_m64;
300 extern __inline __m64
301 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
302 _m_punpckhdq(__m64 __m1, __m64 __m2) {
303 return _mm_unpackhi_pi32(__m1, __m2);
305 /* Interleave the four 8-bit values from the low half of M1 with the four
306 8-bit values from the low half of M2. */
307 extern __inline __m64
308 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
309 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
311 __vector unsigned char a, b, c;
313 a = (__vector unsigned char)vec_splats(__m1);
314 b = (__vector unsigned char)vec_splats(__m2);
315 c = vec_mergel(a, b);
316 return (__m64)((__vector long long)c)[0];
318 __m64_union m1, m2, res;
323 res.as_char[0] = m1.as_char[0];
324 res.as_char[1] = m2.as_char[0];
325 res.as_char[2] = m1.as_char[1];
326 res.as_char[3] = m2.as_char[1];
327 res.as_char[4] = m1.as_char[2];
328 res.as_char[5] = m2.as_char[2];
329 res.as_char[6] = m1.as_char[3];
330 res.as_char[7] = m2.as_char[3];
332 return (__m64)res.as_m64;
336 extern __inline __m64
337 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
338 _m_punpcklbw(__m64 __m1, __m64 __m2) {
339 return _mm_unpacklo_pi8(__m1, __m2);
341 /* Interleave the two 16-bit values from the low half of M1 with the two
342 16-bit values from the low half of M2. */
343 extern __inline __m64
344 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
345 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
346 __m64_union m1, m2, res;
351 res.as_short[0] = m1.as_short[0];
352 res.as_short[1] = m2.as_short[0];
353 res.as_short[2] = m1.as_short[1];
354 res.as_short[3] = m2.as_short[1];
356 return (__m64)res.as_m64;
359 extern __inline __m64
360 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361 _m_punpcklwd(__m64 __m1, __m64 __m2) {
362 return _mm_unpacklo_pi16(__m1, __m2);
365 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
366 value from the low half of M2. */
367 extern __inline __m64
368 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
370 __m64_union m1, m2, res;
375 res.as_int[0] = m1.as_int[0];
376 res.as_int[1] = m2.as_int[0];
378 return (__m64)res.as_m64;
381 extern __inline __m64
382 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
383 _m_punpckldq(__m64 __m1, __m64 __m2) {
384 return _mm_unpacklo_pi32(__m1, __m2);
387 /* Add the 8-bit values in M1 to the 8-bit values in M2. */
388 extern __inline __m64
389 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
390 _mm_add_pi8(__m64 __m1, __m64 __m2) {
392 __vector signed char a, b, c;
394 a = (__vector signed char)vec_splats(__m1);
395 b = (__vector signed char)vec_splats(__m2);
397 return (__m64)((__vector long long)c)[0];
399 __m64_union m1, m2, res;
404 res.as_char[0] = m1.as_char[0] + m2.as_char[0];
405 res.as_char[1] = m1.as_char[1] + m2.as_char[1];
406 res.as_char[2] = m1.as_char[2] + m2.as_char[2];
407 res.as_char[3] = m1.as_char[3] + m2.as_char[3];
408 res.as_char[4] = m1.as_char[4] + m2.as_char[4];
409 res.as_char[5] = m1.as_char[5] + m2.as_char[5];
410 res.as_char[6] = m1.as_char[6] + m2.as_char[6];
411 res.as_char[7] = m1.as_char[7] + m2.as_char[7];
413 return (__m64)res.as_m64;
417 extern __inline __m64
418 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
419 _m_paddb(__m64 __m1, __m64 __m2) {
420 return _mm_add_pi8(__m1, __m2);
423 /* Add the 16-bit values in M1 to the 16-bit values in M2. */
424 extern __inline __m64
425 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
426 _mm_add_pi16(__m64 __m1, __m64 __m2) {
428 __vector signed short a, b, c;
430 a = (__vector signed short)vec_splats(__m1);
431 b = (__vector signed short)vec_splats(__m2);
433 return (__m64)((__vector long long)c)[0];
435 __m64_union m1, m2, res;
440 res.as_short[0] = m1.as_short[0] + m2.as_short[0];
441 res.as_short[1] = m1.as_short[1] + m2.as_short[1];
442 res.as_short[2] = m1.as_short[2] + m2.as_short[2];
443 res.as_short[3] = m1.as_short[3] + m2.as_short[3];
445 return (__m64)res.as_m64;
449 extern __inline __m64
450 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
451 _m_paddw(__m64 __m1, __m64 __m2) {
452 return _mm_add_pi16(__m1, __m2);
455 /* Add the 32-bit values in M1 to the 32-bit values in M2. */
456 extern __inline __m64
457 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
458 _mm_add_pi32(__m64 __m1, __m64 __m2) {
460 __vector signed int a, b, c;
462 a = (__vector signed int)vec_splats(__m1);
463 b = (__vector signed int)vec_splats(__m2);
465 return (__m64)((__vector long long)c)[0];
467 __m64_union m1, m2, res;
472 res.as_int[0] = m1.as_int[0] + m2.as_int[0];
473 res.as_int[1] = m1.as_int[1] + m2.as_int[1];
475 return (__m64)res.as_m64;
479 extern __inline __m64
480 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
481 _m_paddd(__m64 __m1, __m64 __m2) {
482 return _mm_add_pi32(__m1, __m2);
485 /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
486 extern __inline __m64
487 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
488 _mm_sub_pi8(__m64 __m1, __m64 __m2) {
490 __vector signed char a, b, c;
492 a = (__vector signed char)vec_splats(__m1);
493 b = (__vector signed char)vec_splats(__m2);
495 return (__m64)((__vector long long)c)[0];
497 __m64_union m1, m2, res;
502 res.as_char[0] = m1.as_char[0] - m2.as_char[0];
503 res.as_char[1] = m1.as_char[1] - m2.as_char[1];
504 res.as_char[2] = m1.as_char[2] - m2.as_char[2];
505 res.as_char[3] = m1.as_char[3] - m2.as_char[3];
506 res.as_char[4] = m1.as_char[4] - m2.as_char[4];
507 res.as_char[5] = m1.as_char[5] - m2.as_char[5];
508 res.as_char[6] = m1.as_char[6] - m2.as_char[6];
509 res.as_char[7] = m1.as_char[7] - m2.as_char[7];
511 return (__m64)res.as_m64;
515 extern __inline __m64
516 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
517 _m_psubb(__m64 __m1, __m64 __m2) {
518 return _mm_sub_pi8(__m1, __m2);
521 /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
522 extern __inline __m64
523 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
524 _mm_sub_pi16(__m64 __m1, __m64 __m2) {
526 __vector signed short a, b, c;
528 a = (__vector signed short)vec_splats(__m1);
529 b = (__vector signed short)vec_splats(__m2);
531 return (__m64)((__vector long long)c)[0];
533 __m64_union m1, m2, res;
538 res.as_short[0] = m1.as_short[0] - m2.as_short[0];
539 res.as_short[1] = m1.as_short[1] - m2.as_short[1];
540 res.as_short[2] = m1.as_short[2] - m2.as_short[2];
541 res.as_short[3] = m1.as_short[3] - m2.as_short[3];
543 return (__m64)res.as_m64;
547 extern __inline __m64
548 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
549 _m_psubw(__m64 __m1, __m64 __m2) {
550 return _mm_sub_pi16(__m1, __m2);
553 /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
554 extern __inline __m64
555 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556 _mm_sub_pi32(__m64 __m1, __m64 __m2) {
558 __vector signed int a, b, c;
560 a = (__vector signed int)vec_splats(__m1);
561 b = (__vector signed int)vec_splats(__m2);
563 return (__m64)((__vector long long)c)[0];
565 __m64_union m1, m2, res;
570 res.as_int[0] = m1.as_int[0] - m2.as_int[0];
571 res.as_int[1] = m1.as_int[1] - m2.as_int[1];
573 return (__m64)res.as_m64;
577 extern __inline __m64
578 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
579 _m_psubd(__m64 __m1, __m64 __m2) {
580 return _mm_sub_pi32(__m1, __m2);
583 extern __inline __m64
584 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
585 _mm_add_si64(__m64 __m1, __m64 __m2) {
586 return (__m1 + __m2);
589 extern __inline __m64
590 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591 _mm_sub_si64(__m64 __m1, __m64 __m2) {
592 return (__m1 - __m2);
595 /* Shift the 64-bit value in M left by COUNT. */
596 extern __inline __m64
597 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
598 _mm_sll_si64(__m64 __m, __m64 __count) {
599 return (__m << __count);
602 extern __inline __m64
603 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
604 _m_psllq(__m64 __m, __m64 __count) {
605 return _mm_sll_si64(__m, __count);
608 extern __inline __m64
609 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
610 _mm_slli_si64(__m64 __m, const int __count) {
611 return (__m << __count);
614 extern __inline __m64
615 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
616 _m_psllqi(__m64 __m, const int __count) {
617 return _mm_slli_si64(__m, __count);
620 /* Shift the 64-bit value in M left by COUNT; shift in zeros. */
621 extern __inline __m64
622 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
623 _mm_srl_si64(__m64 __m, __m64 __count) {
624 return (__m >> __count);
627 extern __inline __m64
628 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
629 _m_psrlq(__m64 __m, __m64 __count) {
630 return _mm_srl_si64(__m, __count);
633 extern __inline __m64
634 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
635 _mm_srli_si64(__m64 __m, const int __count) {
636 return (__m >> __count);
639 extern __inline __m64
640 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
641 _m_psrlqi(__m64 __m, const int __count) {
642 return _mm_srli_si64(__m, __count);
645 /* Bit-wise AND the 64-bit values in M1 and M2. */
646 extern __inline __m64
647 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
648 _mm_and_si64(__m64 __m1, __m64 __m2) {
649 return (__m1 & __m2);
652 extern __inline __m64
653 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
654 _m_pand(__m64 __m1, __m64 __m2) {
655 return _mm_and_si64(__m1, __m2);
658 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
659 64-bit value in M2. */
660 extern __inline __m64
661 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
662 _mm_andnot_si64(__m64 __m1, __m64 __m2) {
663 return (~__m1 & __m2);
666 extern __inline __m64
667 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
668 _m_pandn(__m64 __m1, __m64 __m2) {
669 return _mm_andnot_si64(__m1, __m2);
672 /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
673 extern __inline __m64
674 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
675 _mm_or_si64(__m64 __m1, __m64 __m2) {
676 return (__m1 | __m2);
679 extern __inline __m64
680 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
681 _m_por(__m64 __m1, __m64 __m2) {
682 return _mm_or_si64(__m1, __m2);
685 /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
686 extern __inline __m64
687 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
688 _mm_xor_si64(__m64 __m1, __m64 __m2) {
689 return (__m1 ^ __m2);
692 extern __inline __m64
693 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
694 _m_pxor(__m64 __m1, __m64 __m2) {
695 return _mm_xor_si64(__m1, __m2);
698 /* Creates a 64-bit zero. */
699 extern __inline __m64
700 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701 _mm_setzero_si64(void) {
705 /* Compare eight 8-bit values. The result of the comparison is 0xFF if the
706 test is true and zero if false. */
707 extern __inline __m64
708 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
709 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
710 #if defined(_ARCH_PWR6) && defined(__powerpc64__)
712 __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
715 __m64_union m1, m2, res;
720 res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
721 res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
722 res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
723 res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
724 res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
725 res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
726 res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
727 res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;
729 return (__m64)res.as_m64;
733 extern __inline __m64
734 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
735 _m_pcmpeqb(__m64 __m1, __m64 __m2) {
736 return _mm_cmpeq_pi8(__m1, __m2);
739 extern __inline __m64
740 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
741 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
743 __vector signed char a, b, c;
745 a = (__vector signed char)vec_splats(__m1);
746 b = (__vector signed char)vec_splats(__m2);
747 c = (__vector signed char)vec_cmpgt(a, b);
748 return (__m64)((__vector long long)c)[0];
750 __m64_union m1, m2, res;
755 res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
756 res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
757 res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
758 res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
759 res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
760 res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
761 res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
762 res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;
764 return (__m64)res.as_m64;
768 extern __inline __m64
769 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
770 _m_pcmpgtb(__m64 __m1, __m64 __m2) {
771 return _mm_cmpgt_pi8(__m1, __m2);
774 /* Compare four 16-bit values. The result of the comparison is 0xFFFF if
775 the test is true and zero if false. */
776 extern __inline __m64
777 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
778 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
780 __vector signed short a, b, c;
782 a = (__vector signed short)vec_splats(__m1);
783 b = (__vector signed short)vec_splats(__m2);
784 c = (__vector signed short)vec_cmpeq(a, b);
785 return (__m64)((__vector long long)c)[0];
787 __m64_union m1, m2, res;
792 res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
793 res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
794 res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
795 res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;
797 return (__m64)res.as_m64;
801 extern __inline __m64
802 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803 _m_pcmpeqw(__m64 __m1, __m64 __m2) {
804 return _mm_cmpeq_pi16(__m1, __m2);
807 extern __inline __m64
808 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
811 __vector signed short a, b, c;
813 a = (__vector signed short)vec_splats(__m1);
814 b = (__vector signed short)vec_splats(__m2);
815 c = (__vector signed short)vec_cmpgt(a, b);
816 return (__m64)((__vector long long)c)[0];
818 __m64_union m1, m2, res;
823 res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
824 res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
825 res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
826 res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;
828 return (__m64)res.as_m64;
832 extern __inline __m64
833 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
834 _m_pcmpgtw(__m64 __m1, __m64 __m2) {
835 return _mm_cmpgt_pi16(__m1, __m2);
838 /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
839 the test is true and zero if false. */
840 extern __inline __m64
841 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
844 __vector signed int a, b, c;
846 a = (__vector signed int)vec_splats(__m1);
847 b = (__vector signed int)vec_splats(__m2);
848 c = (__vector signed int)vec_cmpeq(a, b);
849 return (__m64)((__vector long long)c)[0];
851 __m64_union m1, m2, res;
856 res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
857 res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;
859 return (__m64)res.as_m64;
863 extern __inline __m64
864 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865 _m_pcmpeqd(__m64 __m1, __m64 __m2) {
866 return _mm_cmpeq_pi32(__m1, __m2);
869 extern __inline __m64
870 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
873 __vector signed int a, b, c;
875 a = (__vector signed int)vec_splats(__m1);
876 b = (__vector signed int)vec_splats(__m2);
877 c = (__vector signed int)vec_cmpgt(a, b);
878 return (__m64)((__vector long long)c)[0];
880 __m64_union m1, m2, res;
885 res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
886 res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;
888 return (__m64)res.as_m64;
892 extern __inline __m64
893 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
894 _m_pcmpgtd(__m64 __m1, __m64 __m2) {
895 return _mm_cmpgt_pi32(__m1, __m2);
899 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
900 saturated arithmetic. */
901 extern __inline __m64
902 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
903 _mm_adds_pi8(__m64 __m1, __m64 __m2) {
904 __vector signed char a, b, c;
906 a = (__vector signed char)vec_splats(__m1);
907 b = (__vector signed char)vec_splats(__m2);
909 return (__m64)((__vector long long)c)[0];
912 extern __inline __m64
913 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
914 _m_paddsb(__m64 __m1, __m64 __m2) {
915 return _mm_adds_pi8(__m1, __m2);
917 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
918 saturated arithmetic. */
919 extern __inline __m64
920 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
921 _mm_adds_pi16(__m64 __m1, __m64 __m2) {
922 __vector signed short a, b, c;
924 a = (__vector signed short)vec_splats(__m1);
925 b = (__vector signed short)vec_splats(__m2);
927 return (__m64)((__vector long long)c)[0];
930 extern __inline __m64
931 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
932 _m_paddsw(__m64 __m1, __m64 __m2) {
933 return _mm_adds_pi16(__m1, __m2);
935 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
936 saturated arithmetic. */
937 extern __inline __m64
938 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939 _mm_adds_pu8(__m64 __m1, __m64 __m2) {
940 __vector unsigned char a, b, c;
942 a = (__vector unsigned char)vec_splats(__m1);
943 b = (__vector unsigned char)vec_splats(__m2);
945 return (__m64)((__vector long long)c)[0];
948 extern __inline __m64
949 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
950 _m_paddusb(__m64 __m1, __m64 __m2) {
951 return _mm_adds_pu8(__m1, __m2);
954 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
955 saturated arithmetic. */
956 extern __inline __m64
957 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
958 _mm_adds_pu16(__m64 __m1, __m64 __m2) {
959 __vector unsigned short a, b, c;
961 a = (__vector unsigned short)vec_splats(__m1);
962 b = (__vector unsigned short)vec_splats(__m2);
964 return (__m64)((__vector long long)c)[0];
967 extern __inline __m64
968 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969 _m_paddusw(__m64 __m1, __m64 __m2) {
970 return _mm_adds_pu16(__m1, __m2);
973 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
974 saturating arithmetic. */
975 extern __inline __m64
976 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
977 _mm_subs_pi8(__m64 __m1, __m64 __m2) {
978 __vector signed char a, b, c;
980 a = (__vector signed char)vec_splats(__m1);
981 b = (__vector signed char)vec_splats(__m2);
983 return (__m64)((__vector long long)c)[0];
986 extern __inline __m64
987 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
988 _m_psubsb(__m64 __m1, __m64 __m2) {
989 return _mm_subs_pi8(__m1, __m2);
992 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
993 signed saturating arithmetic. */
994 extern __inline __m64
995 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
996 _mm_subs_pi16(__m64 __m1, __m64 __m2) {
997 __vector signed short a, b, c;
999 a = (__vector signed short)vec_splats(__m1);
1000 b = (__vector signed short)vec_splats(__m2);
1002 return (__m64)((__vector long long)c)[0];
1005 extern __inline __m64
1006 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1007 _m_psubsw(__m64 __m1, __m64 __m2) {
1008 return _mm_subs_pi16(__m1, __m2);
1011 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1012 unsigned saturating arithmetic. */
1013 extern __inline __m64
1014 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1015 _mm_subs_pu8(__m64 __m1, __m64 __m2) {
1016 __vector unsigned char a, b, c;
1018 a = (__vector unsigned char)vec_splats(__m1);
1019 b = (__vector unsigned char)vec_splats(__m2);
1021 return (__m64)((__vector long long)c)[0];
1024 extern __inline __m64
1025 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026 _m_psubusb(__m64 __m1, __m64 __m2) {
1027 return _mm_subs_pu8(__m1, __m2);
1030 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1031 unsigned saturating arithmetic. */
1032 extern __inline __m64
1033 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1034 _mm_subs_pu16(__m64 __m1, __m64 __m2) {
1035 __vector unsigned short a, b, c;
1037 a = (__vector unsigned short)vec_splats(__m1);
1038 b = (__vector unsigned short)vec_splats(__m2);
1040 return (__m64)((__vector long long)c)[0];
1043 extern __inline __m64
1044 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1045 _m_psubusw(__m64 __m1, __m64 __m2) {
1046 return _mm_subs_pu16(__m1, __m2);
1049 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1050 four 32-bit intermediate results, which are then summed by pairs to
1051 produce two 32-bit results. */
1052 extern __inline __m64
1053 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1054 _mm_madd_pi16(__m64 __m1, __m64 __m2) {
1055 __vector signed short a, b;
1056 __vector signed int c;
1057 __vector signed int zero = {0, 0, 0, 0};
1059 a = (__vector signed short)vec_splats(__m1);
1060 b = (__vector signed short)vec_splats(__m2);
1061 c = vec_vmsumshm(a, b, zero);
1062 return (__m64)((__vector long long)c)[0];
1065 extern __inline __m64
1066 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1067 _m_pmaddwd(__m64 __m1, __m64 __m2) {
1068 return _mm_madd_pi16(__m1, __m2);
1070 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1071 M2 and produce the high 16 bits of the 32-bit results. */
1072 extern __inline __m64
1073 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1074 _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
1075 __vector signed short a, b;
1076 __vector signed short c;
1077 __vector signed int w0, w1;
1078 __vector unsigned char xform1 = {
1079 #ifdef __LITTLE_ENDIAN__
1080 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1081 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1083 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1084 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1088 a = (__vector signed short)vec_splats(__m1);
1089 b = (__vector signed short)vec_splats(__m2);
1091 w0 = vec_vmulesh(a, b);
1092 w1 = vec_vmulosh(a, b);
1093 c = (__vector signed short)vec_perm(w0, w1, xform1);
1095 return (__m64)((__vector long long)c)[0];
1098 extern __inline __m64
1099 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1100 _m_pmulhw(__m64 __m1, __m64 __m2) {
1101 return _mm_mulhi_pi16(__m1, __m2);
1104 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1105 the low 16 bits of the results. */
1106 extern __inline __m64
1107 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108 _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
1109 __vector signed short a, b, c;
1111 a = (__vector signed short)vec_splats(__m1);
1112 b = (__vector signed short)vec_splats(__m2);
1114 return (__m64)((__vector long long)c)[0];
1117 extern __inline __m64
1118 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1119 _m_pmullw(__m64 __m1, __m64 __m2) {
1120 return _mm_mullo_pi16(__m1, __m2);
1123 /* Shift four 16-bit values in M left by COUNT. */
1124 extern __inline __m64
1125 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1126 _mm_sll_pi16(__m64 __m, __m64 __count) {
1127 __vector signed short m, r;
1128 __vector unsigned short c;
1130 if (__count <= 15) {
1131 m = (__vector signed short)vec_splats(__m);
1132 c = (__vector unsigned short)vec_splats((unsigned short)__count);
1133 r = vec_sl(m, (__vector unsigned short)c);
1134 return (__m64)((__vector long long)r)[0];
1139 extern __inline __m64
1140 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1141 _m_psllw(__m64 __m, __m64 __count) {
1142 return _mm_sll_pi16(__m, __count);
1145 extern __inline __m64
1146 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1147 _mm_slli_pi16(__m64 __m, int __count) {
1148 /* Promote int to long then invoke mm_sll_pi16. */
1149 return _mm_sll_pi16(__m, __count);
1152 extern __inline __m64
1153 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1154 _m_psllwi(__m64 __m, int __count) {
1155 return _mm_slli_pi16(__m, __count);
1158 /* Shift two 32-bit values in M left by COUNT. */
1159 extern __inline __m64
1160 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1161 _mm_sll_pi32(__m64 __m, __m64 __count) {
1166 res.as_int[0] = m.as_int[0] << __count;
1167 res.as_int[1] = m.as_int[1] << __count;
1168 return (res.as_m64);
1171 extern __inline __m64
1172 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1173 _m_pslld(__m64 __m, __m64 __count) {
1174 return _mm_sll_pi32(__m, __count);
1177 extern __inline __m64
1178 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1179 _mm_slli_pi32(__m64 __m, int __count) {
1180 /* Promote int to long then invoke mm_sll_pi32. */
1181 return _mm_sll_pi32(__m, __count);
1184 extern __inline __m64
1185 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1186 _m_pslldi(__m64 __m, int __count) {
1187 return _mm_slli_pi32(__m, __count);
1190 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
1191 extern __inline __m64
1192 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1193 _mm_sra_pi16(__m64 __m, __m64 __count) {
1194 __vector signed short m, r;
1195 __vector unsigned short c;
1197 if (__count <= 15) {
1198 m = (__vector signed short)vec_splats(__m);
1199 c = (__vector unsigned short)vec_splats((unsigned short)__count);
1200 r = vec_sra(m, (__vector unsigned short)c);
1201 return (__m64)((__vector long long)r)[0];
1206 extern __inline __m64
1207 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1208 _m_psraw(__m64 __m, __m64 __count) {
1209 return _mm_sra_pi16(__m, __count);
1212 extern __inline __m64
1213 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1214 _mm_srai_pi16(__m64 __m, int __count) {
1215 /* Promote int to long then invoke mm_sra_pi32. */
1216 return _mm_sra_pi16(__m, __count);
1219 extern __inline __m64
1220 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1221 _m_psrawi(__m64 __m, int __count) {
1222 return _mm_srai_pi16(__m, __count);
1225 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
1226 extern __inline __m64
1227 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1228 _mm_sra_pi32(__m64 __m, __m64 __count) {
1233 res.as_int[0] = m.as_int[0] >> __count;
1234 res.as_int[1] = m.as_int[1] >> __count;
1235 return (res.as_m64);
1238 extern __inline __m64
1239 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1240 _m_psrad(__m64 __m, __m64 __count) {
1241 return _mm_sra_pi32(__m, __count);
1244 extern __inline __m64
1245 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1246 _mm_srai_pi32(__m64 __m, int __count) {
1247 /* Promote int to long then invoke mm_sra_pi32. */
1248 return _mm_sra_pi32(__m, __count);
1251 extern __inline __m64
1252 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1253 _m_psradi(__m64 __m, int __count) {
1254 return _mm_srai_pi32(__m, __count);
1257 /* Shift four 16-bit values in M right by COUNT; shift in zeros. */
1258 extern __inline __m64
1259 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1260 _mm_srl_pi16(__m64 __m, __m64 __count) {
1261 __vector unsigned short m, r;
1262 __vector unsigned short c;
1264 if (__count <= 15) {
1265 m = (__vector unsigned short)vec_splats(__m);
1266 c = (__vector unsigned short)vec_splats((unsigned short)__count);
1267 r = vec_sr(m, (__vector unsigned short)c);
1268 return (__m64)((__vector long long)r)[0];
1273 extern __inline __m64
1274 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275 _m_psrlw(__m64 __m, __m64 __count) {
1276 return _mm_srl_pi16(__m, __count);
1279 extern __inline __m64
1280 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281 _mm_srli_pi16(__m64 __m, int __count) {
1282 /* Promote int to long then invoke mm_sra_pi32. */
1283 return _mm_srl_pi16(__m, __count);
1286 extern __inline __m64
1287 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1288 _m_psrlwi(__m64 __m, int __count) {
1289 return _mm_srli_pi16(__m, __count);
1292 /* Shift two 32-bit values in M right by COUNT; shift in zeros. */
1293 extern __inline __m64
1294 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1295 _mm_srl_pi32(__m64 __m, __m64 __count) {
1300 res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
1301 res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
1302 return (res.as_m64);
1305 extern __inline __m64
1306 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1307 _m_psrld(__m64 __m, __m64 __count) {
1308 return _mm_srl_pi32(__m, __count);
1311 extern __inline __m64
1312 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1313 _mm_srli_pi32(__m64 __m, int __count) {
1314 /* Promote int to long then invoke mm_srl_pi32. */
1315 return _mm_srl_pi32(__m, __count);
1318 extern __inline __m64
1319 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1320 _m_psrldi(__m64 __m, int __count) {
1321 return _mm_srli_pi32(__m, __count);
1323 #endif /* _ARCH_PWR8 */
1325 /* Creates a vector of two 32-bit values; I0 is least significant. */
1326 extern __inline __m64
1327 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1328 _mm_set_pi32(int __i1, int __i0) {
1331 res.as_int[0] = __i0;
1332 res.as_int[1] = __i1;
1333 return (res.as_m64);
1336 /* Creates a vector of four 16-bit values; W0 is least significant. */
1337 extern __inline __m64
1338 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1339 _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
1342 res.as_short[0] = __w0;
1343 res.as_short[1] = __w1;
1344 res.as_short[2] = __w2;
1345 res.as_short[3] = __w3;
1346 return (res.as_m64);
1349 /* Creates a vector of eight 8-bit values; B0 is least significant. */
1350 extern __inline __m64
1351 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
1353 char __b2, char __b1, char __b0) {
1356 res.as_char[0] = __b0;
1357 res.as_char[1] = __b1;
1358 res.as_char[2] = __b2;
1359 res.as_char[3] = __b3;
1360 res.as_char[4] = __b4;
1361 res.as_char[5] = __b5;
1362 res.as_char[6] = __b6;
1363 res.as_char[7] = __b7;
1364 return (res.as_m64);
1367 /* Similar, but with the arguments in reverse order. */
1368 extern __inline __m64
1369 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370 _mm_setr_pi32(int __i0, int __i1) {
1373 res.as_int[0] = __i0;
1374 res.as_int[1] = __i1;
1375 return (res.as_m64);
1378 extern __inline __m64
1379 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1380 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
1381 return _mm_set_pi16(__w3, __w2, __w1, __w0);
1384 extern __inline __m64
1385 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1386 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
1387 char __b5, char __b6, char __b7) {
1388 return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1391 /* Creates a vector of two 32-bit values, both elements containing I. */
1392 extern __inline __m64
1393 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1394 _mm_set1_pi32(int __i) {
1397 res.as_int[0] = __i;
1398 res.as_int[1] = __i;
1399 return (res.as_m64);
1402 /* Creates a vector of four 16-bit values, all elements containing W. */
1403 extern __inline __m64
1404 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405 _mm_set1_pi16(short __w) {
1407 __vector signed short w;
1409 w = (__vector signed short)vec_splats(__w);
1410 return (__m64)((__vector long long)w)[0];
1414 res.as_short[0] = __w;
1415 res.as_short[1] = __w;
1416 res.as_short[2] = __w;
1417 res.as_short[3] = __w;
1418 return (res.as_m64);
1422 /* Creates a vector of eight 8-bit values, all elements containing B. */
1423 extern __inline __m64
1424 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1425 _mm_set1_pi8(signed char __b) {
1427 __vector signed char b;
1429 b = (__vector signed char)vec_splats(__b);
1430 return (__m64)((__vector long long)b)[0];
1434 res.as_char[0] = __b;
1435 res.as_char[1] = __b;
1436 res.as_char[2] = __b;
1437 res.as_char[3] = __b;
1438 res.as_char[4] = __b;
1439 res.as_char[5] = __b;
1440 res.as_char[6] = __b;
1441 res.as_char[7] = __b;
1442 return (res.as_m64);
1447 #include_next <mmintrin.h>
1448 #endif /* defined(__linux__) && defined(__ppc64__) */
1450 #endif /* _MMINTRIN_H_INCLUDED */